diff --git a/13.go b/13.go index 62cb1b1..ed36be0 100644 --- a/13.go +++ b/13.go @@ -37,9 +37,9 @@ const ( P503PubKeySz = 378 P503PrvKeySz = 32 P503SharedSecretSz = 126 - SidhP503Curve25519PubKeySz = x25519SharedSecretSz + P503PubKeySz - SidhP503Curve25519PrvKeySz = x25519SharedSecretSz + P503PrvKeySz - SidhP503Curve25519SharedKeySz = x25519SharedSecretSz + P503SharedSecretSz + SIDHp503Curve25519PubKeySz = x25519SharedSecretSz + P503PubKeySz + SIDHp503Curve25519PrvKeySz = x25519SharedSecretSz + P503PrvKeySz + SIDHp503Curve25519SharedKeySz = x25519SharedSecretSz + P503SharedSecretSz ) const ( @@ -78,10 +78,13 @@ type dhKex interface { } // Key Exchange strategies per curve type -type kexNist struct{} // Used by NIST curves; P-256, P-384, P-512 -type kexX25519 struct{} // Used by X25519 -type kexSidhP503 struct{} // Used by SIDH/P503 -type kexHybridSidhP503X25519 struct{} // Used by SIDH-ECDH hybrid scheme +type kexNist struct{} // Used by NIST curves; P-256, P-384, P-512 +type kexX25519 struct{} // Used by X25519 +type kexSIDHp503 struct{} // Used by SIDH/P503 +type kexHybridSIDHp503X25519 struct { + classicKEX kexX25519 + pqKEX kexSIDHp503 +} // Used by SIDH-ECDH hybrid scheme // Routing map for key exchange strategies var dhKexStrat = map[CurveID]dhKex{ @@ -89,8 +92,7 @@ var dhKexStrat = map[CurveID]dhKex{ CurveP384: &kexNist{}, CurveP521: &kexNist{}, X25519: &kexX25519{}, - sidhP503: &kexSidhP503{}, - HybridSidhP503Curve25519: &kexHybridSidhP503X25519{}, + HybridSIDHp503Curve25519: &kexHybridSIDHp503X25519{}, } func newKeySchedule13(suite *cipherSuite, config *Config, clientRandom []byte) *keySchedule13 { @@ -1222,17 +1224,17 @@ func (kexX25519) derive(c *Conn, ks keyShare, secretKey []byte) []byte { } // KEX: SIDH/503 -func (kexSidhP503) generate(c *Conn, groupId CurveID) ([]byte, keyShare, error) { +func (kexSIDHp503) generate(c *Conn, groupId CurveID) ([]byte, keyShare, error) { var variant, _ = getSidhKeyVariant(c.isClient) var prvKey = 
sidh.NewPrivateKey(sidh.FP_503, variant) if prvKey.Generate(c.config.rand()) != nil { return nil, keyShare{}, errors.New("tls: private SIDH key generation failed") } pubKey := prvKey.GeneratePublicKey() - return prvKey.Export(), keyShare{group: sidhP503, data: pubKey.Export()}, nil + return prvKey.Export(), keyShare{group: 0, data: pubKey.Export()}, nil } -func (kexSidhP503) derive(c *Conn, ks keyShare, key []byte) []byte { +func (kexSIDHp503) derive(c *Conn, ks keyShare, key []byte) []byte { var prvVariant, pubVariant = getSidhKeyVariant(c.isClient) var prvKeySize = P503PrvKeySz @@ -1256,12 +1258,12 @@ func (kexSidhP503) derive(c *Conn, ks keyShare, key []byte) []byte { } // KEX Hybrid SIDH/503-X25519 -func (kexHybridSidhP503X25519) generate(c *Conn, groupId CurveID) (private []byte, ks keyShare, err error) { - var pubHybrid [SidhP503Curve25519PubKeySz]byte - var prvHybrid [SidhP503Curve25519PrvKeySz]byte +func (kex *kexHybridSIDHp503X25519) generate(c *Conn, groupId CurveID) (private []byte, ks keyShare, err error) { + var pubHybrid [SIDHp503Curve25519PubKeySz]byte + var prvHybrid [SIDHp503Curve25519PrvKeySz]byte // Generate ephemeral key for classic x25519 - private, ks, err = dhKexStrat[X25519].generate(c, groupId) + private, ks, err = kex.classicKEX.generate(c, groupId) if err != nil { return } @@ -1269,33 +1271,33 @@ func (kexHybridSidhP503X25519) generate(c *Conn, groupId CurveID) (private []byt copy(pubHybrid[:], ks.data) // Generate PQ ephemeral key for SIDH - private, ks, err = dhKexStrat[sidhP503].generate(c, groupId) + private, ks, err = kex.pqKEX.generate(c, groupId) if err != nil { return } copy(prvHybrid[x25519SharedSecretSz:], private) copy(pubHybrid[x25519SharedSecretSz:], ks.data) - return prvHybrid[:], keyShare{group: HybridSidhP503Curve25519, data: pubHybrid[:]}, nil + return prvHybrid[:], keyShare{group: HybridSIDHp503Curve25519, data: pubHybrid[:]}, nil } -func (kexHybridSidhP503X25519) derive(c *Conn, ks keyShare, key []byte) []byte { - var 
sharedKey [SidhP503Curve25519SharedKeySz]byte +func (kex *kexHybridSIDHp503X25519) derive(c *Conn, ks keyShare, key []byte) []byte { + var sharedKey [SIDHp503Curve25519SharedKeySz]byte var ret []byte var tmpKs keyShare // Key agreement for classic tmpKs.group = X25519 tmpKs.data = ks.data[:x25519SharedSecretSz] - ret = dhKexStrat[X25519].derive(c, tmpKs, key[:x25519SharedSecretSz]) + ret = kex.classicKEX.derive(c, tmpKs, key[:x25519SharedSecretSz]) if ret == nil { return nil } copy(sharedKey[:], ret) // Key agreement for PQ - tmpKs.group = sidhP503 + tmpKs.group = 0 tmpKs.data = ks.data[x25519SharedSecretSz:] - ret = dhKexStrat[sidhP503].derive(c, tmpKs, key[x25519SharedSecretSz:]) + ret = kex.pqKEX.derive(c, tmpKs, key[x25519SharedSecretSz:]) if ret == nil { return nil } diff --git a/_dev/boring/server.sh b/_dev/boring/server.sh index c08ecb5..6fcf87f 100755 --- a/_dev/boring/server.sh +++ b/_dev/boring/server.sh @@ -24,7 +24,7 @@ bssl server \ # ECDSA and SIDH/P503-X25519 bssl server \ -key ecdsa.pem \ - -curves x25519sidh503 \ + -curves X25519-SIDHp503:X25519:P-256:P-384:P-521 \ -min-version tls1.3 -max-version tls1.3 \ -accept 7443 -loop -www \ -debug 2>&1 & diff --git a/_dev/boring/sidh_d451453067cd665a5c38830fbbaac9e599234a5e.patch b/_dev/boring/sidh_d451453067cd665a5c38830fbbaac9e599234a5e.patch deleted file mode 100644 index 25cec65..0000000 --- a/_dev/boring/sidh_d451453067cd665a5c38830fbbaac9e599234a5e.patch +++ /dev/null @@ -1,13967 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 4eb0d0d6..a43fec27 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -446,6 +446,7 @@ add_subdirectory(ssl/test) - add_subdirectory(fipstools) - add_subdirectory(tool) - add_subdirectory(decrepit) -+add_subdirectory(third_party/sidh) - - if(FUZZ) - if(LIBFUZZER_FROM_DEPS) -diff --git a/include/openssl/nid.h b/include/openssl/nid.h -index afeb2dea..e12ebf3e 100644 ---- a/include/openssl/nid.h -+++ b/include/openssl/nid.h -@@ -4194,6 +4194,9 @@ extern "C" { - 
#define SN_X25519 "X25519" - #define NID_X25519 948 - -+#define SN_x25519sidh503 "x25519sidh503" -+#define NID_x25519sidh503 0x0105 -+ - #define SN_ED25519 "ED25519" - #define NID_ED25519 949 - #define OBJ_ED25519 1L, 3L, 101L, 112L -diff --git a/include/openssl/ssl.h b/include/openssl/ssl.h -index d6169816..1d93d671 100644 ---- a/include/openssl/ssl.h -+++ b/include/openssl/ssl.h -@@ -2177,6 +2177,7 @@ OPENSSL_EXPORT int SSL_set1_curves_list(SSL *ssl, const char *curves); - #define SSL_CURVE_SECP384R1 24 - #define SSL_CURVE_SECP521R1 25 - #define SSL_CURVE_X25519 29 -+#define SSL_CURVE_sidh503 0x0105 - - // SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently - // completed handshake or 0 if not applicable. -diff --git a/ssl/CMakeLists.txt b/ssl/CMakeLists.txt -index 6881089f..c08e93b7 100644 ---- a/ssl/CMakeLists.txt -+++ b/ssl/CMakeLists.txt -@@ -58,3 +58,9 @@ if(WIN32) - target_link_libraries(ssl_test ws2_32) - endif() - add_dependencies(all_tests ssl_test) -+ -+ -+if(EXP_SIDH) -+ add_definitions(-DBORINGSSL_USE_SIDH) -+ target_link_libraries(ssl sidh503) -+endif() -\ No newline at end of file -diff --git a/ssl/handshake_client.cc b/ssl/handshake_client.cc -index cb9b6dec..4765b8d9 100644 ---- a/ssl/handshake_client.cc -+++ b/ssl/handshake_client.cc -@@ -985,6 +985,7 @@ static enum ssl_hs_wait_t do_read_server_key_exchange(SSL_HANDSHAKE *hs) { - !hs->peer_key.CopyFrom(point)) { - return ssl_hs_error; - } -+ hs->key_share->SetInitiator(true); - } else if (!(alg_k & SSL_kPSK)) { - OPENSSL_PUT_ERROR(SSL, SSL_R_UNEXPECTED_MESSAGE); - ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_UNEXPECTED_MESSAGE); -diff --git a/ssl/handshake_server.cc b/ssl/handshake_server.cc -index 0159c9e9..caf8b370 100644 ---- a/ssl/handshake_server.cc -+++ b/ssl/handshake_server.cc -@@ -811,7 +811,10 @@ static enum ssl_hs_wait_t do_send_server_certificate(SSL_HANDSHAKE *hs) { - hs->new_session->group_id = group_id; - - // Set up ECDH, generate a key, and emit the public 
half. -- hs->key_share = SSLKeyShare::Create(group_id); -+ if ((hs->key_share = SSLKeyShare::Create(group_id)) == nullptr) { -+ return ssl_hs_error; -+ } -+ hs->key_share->SetInitiator(false); - if (!hs->key_share || - !CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) || - !CBB_add_u16(cbb.get(), group_id) || -diff --git a/ssl/internal.h b/ssl/internal.h -index 46c52486..dff62531 100644 ---- a/ssl/internal.h -+++ b/ssl/internal.h -@@ -934,12 +934,14 @@ bool ssl_public_key_verify(SSL *ssl, Span signature, - // SSLKeyShare abstracts over Diffie-Hellman-like key exchanges. - class SSLKeyShare { - public: -+ SSLKeyShare() : isInitiator(false) {} - virtual ~SSLKeyShare() {} - static constexpr bool kAllowUniquePtr = true; - HAS_VIRTUAL_DESTRUCTOR - - // Create returns a SSLKeyShare instance for use with group |group_id| or -- // nullptr on error. -+ // nullptr on error. |isClient| indicates whether key share is created -+ // on a client (true) or a server (false) side. - static UniquePtr Create(uint16_t group_id); - - // Create deserializes an SSLKeyShare instance previously serialized by -@@ -977,6 +979,13 @@ class SSLKeyShare { - // Deserialize initializes the state of the key exchange from |in|, returning - // true if successful and false otherwise. It is called by |Create|. - virtual bool Deserialize(CBS *in) { return false; } -+ -+ // Sets flag indicating role of the key share owner. True for initiator of the -+ // handshake, false for responder. -+ void SetInitiator(bool flag) { isInitiator = flag; } -+ -+ protected: -+ bool isInitiator; - }; - - // ssl_nid_to_group_id looks up the group corresponding to |nid|. 
On success, it -diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc -index c7f6f88f..93a4ddfc 100644 ---- a/ssl/ssl_key_share.cc -+++ b/ssl/ssl_key_share.cc -@@ -30,6 +30,25 @@ - #include "internal.h" - #include "../crypto/internal.h" - -+#ifdef BORINGSSL_USE_SIDH -+extern "C" -+{ -+ #include -+} -+ -+namespace { -+ // Definitions for SIDH/P503 -+ const size_t kSIDH503_PrvAKeyBitsSz = 250; // Bit size of SIDH private key (type A) -+ const size_t kSIDH503_PrvBKeyBitsSz = 252; // Bit size of SIDH private key (type B) -+ const size_t kSIDH503_PubKeyBytesSz = 378; // Byte size of SIDH public key -+ const size_t kSIDH_SsByteSz = 126; // Shared secret byte size -+ const size_t kX25519_SsByteSz = 32; // Both for public and private key -+} -+#endif -+ -+constexpr size_t BitsToBytes(size_t bits) { -+ return (bits + 7) / 8; -+} - - namespace bssl { - -@@ -211,16 +230,123 @@ class X25519KeyShare : public SSLKeyShare { - uint8_t private_key_[32]; - }; - -+#ifdef BORINGSSL_USE_SIDH -+class SIDH503X25519KeyShare : public SSLKeyShare { -+public: -+ SIDH503X25519KeyShare() {} -+ ~SIDH503X25519KeyShare() override { -+ OPENSSL_cleanse(private_x25519, sizeof(private_x25519)); -+ OPENSSL_cleanse(private_SIDH, sizeof(private_SIDH)); -+ } -+ -+ uint16_t GroupID() const override { -+ return SSL_CURVE_sidh503; -+ } -+ -+ bool Offer(CBB *out) override { -+ uint8_t public_x25519[32]; -+ uint8_t public_SIDH[kSIDH503_PubKeyBytesSz]; -+ const size_t prvKeyBitSz = isInitiator?kSIDH503_PrvAKeyBitsSz:kSIDH503_PrvBKeyBitsSz; -+ -+ // Scoped BN -+ UniquePtr bn_ctx(BN_CTX_new()); -+ if (!bn_ctx) { -+ return false; -+ } -+ BN_CTXScope scope(bn_ctx.get()); -+ -+ // Calculate private key for Alice. 
Needs to be in range [0, 2^0xFA - 1] and < 250 bits -+ BIGNUM *bn_sidh_prv = BN_CTX_get(bn_ctx.get()); -+ if (!bn_sidh_prv) { -+ return false; -+ } -+ -+ if (!BN_rand(bn_sidh_prv, prvKeyBitSz, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY)) { -+ return false; -+ } -+ -+ // Convert to little endian -+ if (!BN_bn2le_padded(private_SIDH, sizeof(private_SIDH), bn_sidh_prv)) { -+ return false; -+ } -+ -+ X25519_keypair(public_x25519, private_x25519); -+ if (isInitiator) { -+ // Always returns 0 -+ (void)EphemeralKeyGeneration_A_SIDHp503(private_SIDH, public_SIDH); -+ } else { -+ // Always returns 0 -+ (void)EphemeralKeyGeneration_B_SIDHp503(private_SIDH, public_SIDH); -+ } -+ -+ return -+ CBB_add_bytes(out, public_x25519, sizeof(public_x25519)) && -+ CBB_add_bytes(out, public_SIDH, sizeof(public_SIDH)); -+ } -+ -+ bool Finish(Array *out_secret, uint8_t *out_alert, -+ Span peer_key) override { -+ *out_alert = SSL_AD_INTERNAL_ERROR; -+ -+ Array secret; -+ if (!secret.Init(sizeof(private_x25519) + kSIDH_SsByteSz)) { -+ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); -+ return false; -+ } -+ -+ if (peer_key.size() != (kX25519_SsByteSz + kSIDH503_PubKeyBytesSz) || -+ !X25519(secret.data(), private_x25519, peer_key.data())) { -+ *out_alert = SSL_AD_DECODE_ERROR; -+ OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT); -+ return false; -+ } -+ -+ if (isInitiator) { -+ // Always returns 0 -+ (void)EphemeralSecretAgreement_A_SIDHp503(private_SIDH, peer_key.data() + 32, secret.data() + sizeof(private_x25519)); -+ } else { -+ (void)EphemeralSecretAgreement_B_SIDHp503(private_SIDH, peer_key.data() + 32, secret.data() + sizeof(private_x25519)); -+ } -+ *out_secret = std::move(secret); -+ return true; -+ } -+ -+ bool Serialize(CBB *out) override { -+ return (CBB_add_asn1_uint64(out, GroupID()) && -+ CBB_add_asn1_octet_string(out, private_x25519, sizeof(private_x25519)) && -+ CBB_add_asn1_octet_string(out, private_SIDH, sizeof(private_SIDH))); -+ } -+ -+ bool Deserialize(CBS *in) override { -+ CBS key; -+ 
if (!CBS_get_asn1(in, &key, CBS_ASN1_OCTETSTRING) || -+ CBS_len(&key) != (sizeof(private_x25519) + sizeof(private_SIDH)) || -+ !CBS_copy_bytes(&key, private_x25519, sizeof(private_x25519)) || -+ !CBS_copy_bytes(&key, private_SIDH, sizeof(private_SIDH))) { -+ return false; -+ } -+ return true; -+ } -+ -+private: -+ uint8_t private_x25519[kX25519_SsByteSz]; -+ uint8_t private_SIDH[BitsToBytes(kSIDH503_PrvAKeyBitsSz)]; -+}; -+#endif // BORINGSSL_USE_SIDH -+ - CONSTEXPR_ARRAY struct { - int nid; - uint16_t group_id; -- const char name[8], alias[11]; -+ const char name[16], alias[16]; - } kNamedGroups[] = { - {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"}, - {NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"}, - {NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"}, - {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"}, - {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"}, -+#ifdef BORINGSSL_USE_SIDH -+ {NID_x25519sidh503, SSL_CURVE_sidh503, "x25519sidh503", "x25519sidh503"}, -+#endif - }; - - } // namespace -@@ -241,6 +367,10 @@ UniquePtr SSLKeyShare::Create(uint16_t group_id) { - New(NID_secp521r1, SSL_CURVE_SECP521R1)); - case SSL_CURVE_X25519: - return UniquePtr(New()); -+#ifdef BORINGSSL_USE_SIDH -+ case SSL_CURVE_sidh503: -+ return UniquePtr(New()); -+#endif // BORINGSSL_USE_SIDH - default: - return nullptr; - } -diff --git a/ssl/t1_lib.cc b/ssl/t1_lib.cc -index dde767e9..806523af 100644 ---- a/ssl/t1_lib.cc -+++ b/ssl/t1_lib.cc -@@ -2177,7 +2177,11 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { - group_id = groups[0]; - } - -- hs->key_share = SSLKeyShare::Create(group_id); -+ if ((hs->key_share = SSLKeyShare::Create(group_id)) == nullptr) { -+ return false; -+ } -+ hs->key_share->SetInitiator(true); -+ - CBB key_exchange; - if (!hs->key_share || - !CBB_add_u16(&kse_bytes, group_id) || -diff --git a/third_party/sidh/CMakeLists.txt b/third_party/sidh/CMakeLists.txt -new file mode 100644 -index 
00000000..d7213c8e ---- /dev/null -+++ b/third_party/sidh/CMakeLists.txt -@@ -0,0 +1,62 @@ -+cmake_minimum_required(VERSION 2.8.11) -+ -+add_definitions(-D __LINUX__) -+set(ASM_EXT S) -+enable_language(ASM) -+ -+# Compile to object files, we will link them with libssl -+add_library( -+ sidh503 -+ -+ STATIC -+ -+ src/P503/P503.c -+ src/random/random.c -+ src/sha3/fips202.c -+) -+ -+if(OPENSSL_NO_ASM) -+ target_sources( -+ sidh503 -+ PRIVATE -+ src/P503/generic/fp_generic.c -+ ) -+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") -+ add_definitions(-D _AMD64_) -+ add_definitions(-D _FAST_ -D _ADX_) -+ target_sources( -+ sidh503 -+ -+ PRIVATE -+ -+ src/P503/AMD64/fp_x64.c -+ src/P503/AMD64/fp_x64_asm.${ASM_EXT} -+ ) -+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") -+ add_definitions(-lrt) -+ add_definitions(-D _ARM64_) -+ target_sources( -+ sidh503 -+ -+ PRIVATE -+ -+ src/P503/ARM64/fp_arm64.c -+ src/P503/ARM64/fp_arm64_asm.${ASM_EXT} -+ ) -+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64") -+ add_definitions(-lrt) -+ add_definitions(-D _ARM64_) -+ target_sources( -+ sidh503 -+ -+ PRIVATE -+ -+ src/P503/ARM64/fp_arm64.c -+ src/P503/ARM64/fp_arm64_asm.${ASM_EXT} -+ ) -+endif() -+ -+target_include_directories(sidh503 PUBLIC -+ src -+ src/P503 -+) -diff --git a/third_party/sidh/Makefile b/third_party/sidh/Makefile -new file mode 100644 -index 00000000..c9fe9caa ---- /dev/null -+++ b/third_party/sidh/Makefile -@@ -0,0 +1,167 @@ -+#### Makefile for compilation on Linux #### -+ -+OPT?=-O3 -+CC?=gcc -+ -+ifeq ($(CC),$(filter $(CC),gcc cc)) -+OPT+= -fPIC -fPIE -+endif -+ -+ARCHITECTURE=_AMD64_ -+USE_OPT_LEVEL=_FAST_ -+ifeq "$(ARCH)" "x64" -+ ARCHITECTURE=_AMD64_ -+ USE_OPT_LEVEL=_FAST_ -+else ifeq "$(ARCH)" "x86" -+ ARCHITECTURE=_X86_ -+ USE_OPT_LEVEL=_GENERIC_ -+else ifeq "$(ARCH)" "ARM" -+ ARCHITECTURE=_ARM_ -+ USE_OPT_LEVEL=_GENERIC_ -+ ARM_SETTING=-lrt -+else ifeq "$(ARCH)" "ARM64" -+ ARCHITECTURE=_ARM64_ -+ USE_OPT_LEVEL=_FAST_ -+ ARM_SETTING=-lrt -+endif -+ 
-+ifeq "$(OPT_LEVEL)" "GENERIC" -+ USE_OPT_LEVEL=_GENERIC_ -+endif -+ -+ifeq "$(ARCHITECTURE)" "_AMD64_" -+ ifeq "$(USE_OPT_LEVEL)" "_FAST_" -+ MULX=-D _MULX_ -+ ifeq "$(USE_MULX)" "FALSE" -+ MULX= -+ else -+ ADX=-D _ADX_ -+ ifeq "$(USE_ADX)" "FALSE" -+ ADX= -+ endif -+ endif -+ endif -+endif -+ -+ifeq "$(SET)" "EXTENDED" -+ ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native -+endif -+ -+AR=ar rcs -+RANLIB=ranlib -+ -+CFLAGS=$(OPT) $(ADDITIONAL_SETTINGS) -D $(ARCHITECTURE) -D __LINUX__ -D $(USE_OPT_LEVEL) $(MULX) $(ADX) -+LDFLAGS=-lm -+ifeq "$(USE_OPT_LEVEL)" "_GENERIC_" -+ EXTRA_OBJECTS_503=objs503/fp_generic.o -+ EXTRA_OBJECTS_751=objs751/fp_generic.o -+else ifeq "$(USE_OPT_LEVEL)" "_FAST_" -+ifeq "$(ARCHITECTURE)" "_AMD64_" -+ EXTRA_OBJECTS_503=objs503/fp_x64.o objs503/fp_x64_asm.o -+ EXTRA_OBJECTS_751=objs751/fp_x64.o objs751/fp_x64_asm.o -+else ifeq "$(ARCHITECTURE)" "_ARM64_" -+ EXTRA_OBJECTS_503=objs503/fp_arm64.o objs503/fp_arm64_asm.o -+ EXTRA_OBJECTS_751=objs751/fp_arm64.o objs751/fp_arm64_asm.o -+endif -+endif -+OBJECTS_503=objs503/P503.o $(EXTRA_OBJECTS_503) objs/random.o objs/fips202.o -+OBJECTS_751=objs751/P751.o $(EXTRA_OBJECTS_751) objs/random.o objs/fips202.o -+ -+all: lib503 lib751 tests KATS -+ -+objs503/%.o: src/P503/%.c -+ @mkdir -p $(@D) -+ $(CC) -c $(CFLAGS) $< -o $@ -+ -+objs751/%.o: src/P751/%.c -+ @mkdir -p $(@D) -+ $(CC) -c $(CFLAGS) $< -o $@ -+ -+ifeq "$(USE_OPT_LEVEL)" "_GENERIC_" -+ objs503/fp_generic.o: src/P503/generic/fp_generic.c -+ $(CC) -c $(CFLAGS) src/P503/generic/fp_generic.c -o objs503/fp_generic.o -+ -+ objs751/fp_generic.o: src/P751/generic/fp_generic.c -+ $(CC) -c $(CFLAGS) src/P751/generic/fp_generic.c -o objs751/fp_generic.o -+else ifeq "$(USE_OPT_LEVEL)" "_FAST_" -+ifeq "$(ARCHITECTURE)" "_AMD64_" -+ objs503/fp_x64.o: src/P503/AMD64/fp_x64.c -+ $(CC) -c $(CFLAGS) src/P503/AMD64/fp_x64.c -o objs503/fp_x64.o -+ -+ objs503/fp_x64_asm.o: src/P503/AMD64/fp_x64_asm.S -+ $(CC) -c $(CFLAGS) 
src/P503/AMD64/fp_x64_asm.S -o objs503/fp_x64_asm.o -+ -+ objs751/fp_x64.o: src/P751/AMD64/fp_x64.c -+ $(CC) -c $(CFLAGS) src/P751/AMD64/fp_x64.c -o objs751/fp_x64.o -+ -+ objs751/fp_x64_asm.o: src/P751/AMD64/fp_x64_asm.S -+ $(CC) -c $(CFLAGS) src/P751/AMD64/fp_x64_asm.S -o objs751/fp_x64_asm.o -+else ifeq "$(ARCHITECTURE)" "_ARM64_" -+ objs503/fp_arm64.o: src/P503/ARM64/fp_arm64.c -+ $(CC) -c $(CFLAGS) src/P503/ARM64/fp_arm64.c -o objs503/fp_arm64.o -+ -+ objs503/fp_arm64_asm.o: src/P503/ARM64/fp_arm64_asm.S -+ $(CC) -c $(CFLAGS) src/P503/ARM64/fp_arm64_asm.S -o objs503/fp_arm64_asm.o -+ -+ objs751/fp_arm64.o: src/P751/ARM64/fp_arm64.c -+ $(CC) -c $(CFLAGS) src/P751/ARM64/fp_arm64.c -o objs751/fp_arm64.o -+ -+ objs751/fp_arm64_asm.o: src/P751/ARM64/fp_arm64_asm.S -+ $(CC) -c $(CFLAGS) src/P751/ARM64/fp_arm64_asm.S -o objs751/fp_arm64_asm.o -+endif -+endif -+ -+objs/random.o: src/random/random.c -+ @mkdir -p $(@D) -+ $(CC) -c $(CFLAGS) src/random/random.c -o objs/random.o -+ -+objs/fips202.o: src/sha3/fips202.c -+ $(CC) -c $(CFLAGS) src/sha3/fips202.c -o objs/fips202.o -+ -+lib503: $(OBJECTS_503) -+ rm -rf lib503 sike503 sidh503 -+ mkdir lib503 sike503 sidh503 -+ $(AR) lib503/libsidh.a $^ -+ $(RANLIB) lib503/libsidh.a -+ -+lib751: $(OBJECTS_751) -+ rm -rf lib751 sike751 sidh751 -+ mkdir lib751 sike751 sidh751 -+ $(AR) lib751/libsidh.a $^ -+ $(RANLIB) lib751/libsidh.a -+ -+tests: lib503 lib751 -+ $(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p503 $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p751 $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib503 tests/test_SIDHp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503/test_SIDH $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib751 tests/test_SIDHp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751/test_SIDH $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib503 tests/test_SIKEp503.c tests/test_extras.c 
-lsidh $(LDFLAGS) -o sike503/test_SIKE $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib751 tests/test_SIKEp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/test_SIKE $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/arith_test $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/arith_test $(ARM_SETTING) -+ -+# AES -+AES_OBJS=objs/aes.o objs/aes_c.o -+ -+objs/%.o: tests/aes/%.c -+ @mkdir -p $(@D) -+ $(CC) -c $(CFLAGS) $< -o $@ -+ -+lib503_for_KATs: $(OBJECTS_503) $(AES_OBJS) -+ $(AR) lib503/libsidh_for_testing.a $^ -+ $(RANLIB) lib503/libsidh_for_testing.a -+ -+lib751_for_KATs: $(OBJECTS_751) $(AES_OBJS) -+ $(AR) lib751/libsidh_for_testing.a $^ -+ $(RANLIB) lib751/libsidh_for_testing.a -+ -+KATS: lib503_for_KATs lib751_for_KATs -+ $(CC) $(CFLAGS) -L./lib503 tests/PQCtestKAT_kem503.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503/PQCtestKAT_kem $(ARM_SETTING) -+ $(CC) $(CFLAGS) -L./lib751 tests/PQCtestKAT_kem751.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751/PQCtestKAT_kem $(ARM_SETTING) -+ -+check: tests -+ -+.PHONY: clean -+ -+clean: -+ rm -rf *.req objs503 objs751 objs lib503 lib751 sidh503 sidh751 sike503 sike751 arith_tests-* -diff --git a/third_party/sidh/src/P503/AMD64/fp_x64.c b/third_party/sidh/src/P503/AMD64/fp_x64.c -new file mode 100644 -index 00000000..8f5305ea ---- /dev/null -+++ b/third_party/sidh/src/P503/AMD64/fp_x64.c -@@ -0,0 +1,523 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: modular arithmetic optimized for x64 platforms for P503 -+*********************************************************************************************/ -+ -+#include "../P503_internal.h" -+ -+ -+// Global constants -+extern const uint64_t p503[NWORDS_FIELD]; -+extern const uint64_t 
p503p1[NWORDS_FIELD]; -+extern const uint64_t p503x2[NWORDS_FIELD]; -+ -+ -+__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular addition, c = a+b mod p503. -+ // Inputs: a, b in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ -+#if (OS_TARGET == OS_WIN) -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], b[i], carry, c[i]); -+ } -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(carry, c[i], ((digit_t*)p503x2)[i], carry, c[i]); -+ } -+ mask = 0 - (digit_t)carry; -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, c[i], ((digit_t*)p503x2)[i] & mask, carry, c[i]); -+ } -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ fpadd503_asm(a, b, c); -+ -+#endif -+} -+ -+ -+__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular subtraction, c = a-b mod p503. -+ // Inputs: a, b in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ -+#if (OS_TARGET == OS_WIN) -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], b[i], borrow, c[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, c[i], ((digit_t*)p503x2)[i] & mask, borrow, c[i]); -+ } -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ fpsub503_asm(a, b, c); -+ -+#endif -+} -+ -+ -+__inline void fpneg503(digit_t* a) -+{ // Modular negation, a = -a mod p503. -+ // Input/output: a in [0, 2*p503-1] -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, ((digit_t*)p503x2)[i], a[i], borrow, a[i]); -+ } -+} -+ -+ -+void fpdiv2_503(const digit_t* a, digit_t* c) -+{ // Modular division by two, c = a/2 mod p503. 
-+ // Input : a in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], ((digit_t*)p503)[i] & mask, carry, c[i]); -+ } -+ -+ mp_shiftr1(c, NWORDS_FIELD); -+} -+ -+ -+void fpcorrection503(digit_t* a) -+{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], ((digit_t*)p503)[i], borrow, a[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, a[i], ((digit_t*)p503)[i] & mask, borrow, a[i]); -+ } -+} -+ -+ -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. -+ -+ UNREFERENCED_PARAMETER(nwords); -+ -+#if (OS_TARGET == OS_WIN) -+ digit_t t = 0; -+ uint128_t uv = {0}; -+ unsigned int carry = 0; -+ -+ MULADD128(a[0], b[0], uv, carry, uv); -+ t += carry; -+ c[0] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[0], uv, carry, uv); -+ t += carry; -+ c[1] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[0], uv, carry, uv); -+ t += carry; -+ c[2] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[0], uv, carry, uv); -+ t += carry; -+ c[3] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[1], uv, carry, uv); -+ t += carry; 
-+ MULADD128(a[2], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[0], uv, carry, uv); -+ t += carry; -+ c[4] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[0], uv, carry, uv); -+ t += carry; -+ c[5] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[0], uv, carry, uv); -+ t += carry; -+ c[6] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[0], uv, carry, uv); -+ t += carry; -+ c[7] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[7], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[7], uv, carry, uv); -+ t += carry; -+ c[8] = 
uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[7], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[7], uv, carry, uv); -+ t += carry; -+ c[9] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[7], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[7], uv, carry, uv); -+ t += carry; -+ c[10] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[7], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[7], uv, carry, uv); -+ t += carry; -+ c[11] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[7], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[7], uv, carry, uv); -+ t += carry; -+ c[12] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[7], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[7], uv, carry, uv); -+ t += carry; -+ c[13] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ -+ MULADD128(a[7], b[7], uv, carry, uv); -+ c[14] = uv[0]; -+ c[15] = uv[1]; -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ mul503_asm(a, b, c); -+ -+#endif -+} -+ -+ -+void rdc_mont(const digit_t* ma, digit_t* mc) -+{ // Montgomery reduction exploiting special form of the prime. -+ // mc = ma*R^-1 mod p503x2, where R = 2^512. -+ // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. -+ // ma is assumed to be in Montgomery representation. 
-+ -+#if (OS_TARGET == OS_WIN) -+ unsigned int carry; -+ digit_t t = 0; -+ uint128_t uv = {0}; -+ -+ mc[0] = ma[0]; -+ mc[1] = ma[1]; -+ mc[2] = ma[2]; -+ MUL128(mc[0], ((digit_t*)p503p1)[3], uv); -+ ADDC(0, uv[0], ma[3], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ mc[3] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p503p1)[4], uv, carry, uv); -+ MULADD128(mc[1], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[4], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[4] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[5], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[5] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[6], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[6] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[7], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[7] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ 
MULADD128(mc[1], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[8], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[0] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[2], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[9], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[1] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[3], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p503p1)[3], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[10], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[2] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[4], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p503p1)[4], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[11], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[3] = uv[0]; -+ 
uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[5], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p503p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[12], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[4] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[6], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p503p1)[6], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[13], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[5] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[7], ((digit_t*)p503p1)[7], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[14], carry, mc[6]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ ADDC(0, uv[1], ma[15], carry, mc[7]); -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ rdc503_asm(ma, mc); -+ -+#endif -+} -diff --git a/third_party/sidh/src/P503/AMD64/fp_x64_asm.S b/third_party/sidh/src/P503/AMD64/fp_x64_asm.S -new file mode 100644 -index 00000000..b698e682 ---- /dev/null -+++ b/third_party/sidh/src/P503/AMD64/fp_x64_asm.S -@@ -0,0 +1,1684 @@ -+//******************************************************************************************* -+// SIDH: an efficient supersingular isogeny cryptography library -+// -+// Abstract: field arithmetic in x64 assembly for P503 on Linux -+//******************************************************************************************* -+ -+.intel_syntax noprefix -+ -+// Registers that are used for parameter passing: -+#define reg_p1 rdi -+#define reg_p2 rsi -+#define reg_p3 rdx -+ -+// p503 + 1 -+#define p503p1_3 0xAC00000000000000 -+#define p503p1_4 0x13085BDA2211E7A0 -+#define p503p1_5 0x1B9BF6C87B7E7DAF -+#define p503p1_6 0x6045C6BDDA77A4D0 -+#define p503p1_7 0x004066F541811E1E -+// p503 x 2 -+#define p503x2_0 0xFFFFFFFFFFFFFFFE -+#define p503x2_1 
0xFFFFFFFFFFFFFFFF -+#define p503x2_3 0x57FFFFFFFFFFFFFF -+#define p503x2_4 0x2610B7B44423CF41 -+#define p503x2_5 0x3737ED90F6FCFB5E -+#define p503x2_6 0xC08B8D7BB4EF49A0 -+#define p503x2_7 0x0080CDEA83023C3C -+ -+p503p1_nz: -+.quad 0xAC00000000000000 -+.quad 0x13085BDA2211E7A0 -+.quad 0x1B9BF6C87B7E7DAF -+.quad 0x6045C6BDDA77A4D0 -+.quad 0x004066F541811E1E -+ -+// Define addition instructions -+#ifdef _MULX_ -+#ifdef _ADX_ -+ -+#define ADD1 adox -+#define ADC1 adox -+#define ADD2 adcx -+#define ADC2 adcx -+ -+#else -+ -+#define ADD1 add -+#define ADC1 adc -+#define ADD2 add -+#define ADC2 adc -+ -+#endif -+#endif -+ -+ -+.text -+//*********************************************************************** -+// Field addition -+// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -+//*********************************************************************** -+.global fpadd503_asm -+fpadd503_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ xor rax, rax -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov r12, [reg_p1+32] -+ mov r13, [reg_p1+40] -+ mov r14, [reg_p1+48] -+ mov r15, [reg_p1+56] -+ add r8, [reg_p2] -+ adc r9, [reg_p2+8] -+ adc r10, [reg_p2+16] -+ adc r11, [reg_p2+24] -+ adc r12, [reg_p2+32] -+ adc r13, [reg_p2+40] -+ adc r14, [reg_p2+48] -+ adc r15, [reg_p2+56] -+ -+ movq rcx, p503x2_0 -+ sub r8, rcx -+ movq rcx, p503x2_1 -+ sbb r9, rcx -+ sbb r10, rcx -+ movq rcx, p503x2_3 -+ sbb r11, rcx -+ movq rcx, p503x2_4 -+ sbb r12, rcx -+ movq rcx, p503x2_5 -+ sbb r13, rcx -+ movq rcx, p503x2_6 -+ sbb r14, rcx -+ movq rcx, p503x2_7 -+ sbb r15, rcx -+ sbb rax, 0 -+ -+ mov rdi, p503x2_0 -+ and rdi, rax -+ mov rsi, p503x2_1 -+ and rsi, rax -+ movq rcx, p503x2_3 -+ and rcx, rax -+ -+ add r8, rdi -+ adc r9, rsi -+ adc r10, rsi -+ adc r11, rcx -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ setc cl -+ -+ movq r8, p503x2_4 -+ and r8, rax -+ movq r9, p503x2_5 -+ and r9, rax -+ movq 
r10, p503x2_6 -+ and r10, rax -+ movq r11, p503x2_7 -+ and r11, rax -+ -+ bt rcx, 0 -+ adc r12, r8 -+ adc r13, r9 -+ adc r14, r10 -+ adc r15, r11 -+ mov [reg_p3+32], r12 -+ mov [reg_p3+40], r13 -+ mov [reg_p3+48], r14 -+ mov [reg_p3+56], r15 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+ -+//*********************************************************************** -+// Field subtraction -+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] -+//*********************************************************************** -+.global fpsub503_asm -+fpsub503_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ xor rax, rax -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov r12, [reg_p1+32] -+ mov r13, [reg_p1+40] -+ mov r14, [reg_p1+48] -+ mov r15, [reg_p1+56] -+ sub r8, [reg_p2] -+ sbb r9, [reg_p2+8] -+ sbb r10, [reg_p2+16] -+ sbb r11, [reg_p2+24] -+ sbb r12, [reg_p2+32] -+ sbb r13, [reg_p2+40] -+ sbb r14, [reg_p2+48] -+ sbb r15, [reg_p2+56] -+ sbb rax, 0 -+ -+ mov rdi, p503x2_0 -+ and rdi, rax -+ mov rsi, p503x2_1 -+ and rsi, rax -+ movq rcx, p503x2_3 -+ and rcx, rax -+ -+ add r8, rdi -+ adc r9, rsi -+ adc r10, rsi -+ adc r11, rcx -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ setc cl -+ -+ movq r8, p503x2_4 -+ and r8, rax -+ movq r9, p503x2_5 -+ and r9, rax -+ movq r10, p503x2_6 -+ and r10, rax -+ movq r11, p503x2_7 -+ and r11, rax -+ -+ bt rcx, 0 -+ adc r12, r8 -+ adc r13, r9 -+ adc r14, r10 -+ adc r15, r11 -+ mov [reg_p3+32], r12 -+ mov [reg_p3+40], r13 -+ mov [reg_p3+48], r14 -+ mov [reg_p3+56], r15 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+ -+#ifdef _MULX_ -+ -+///////////////////////////////////////////////////////////////// MACRO -+// Schoolbook integer multiplication, a full row at a time -+// Inputs: memory pointers M0 and M1 -+// Outputs: memory pointer C -+// Temps: regs T0:T9 
-+///////////////////////////////////////////////////////////////// -+ -+#ifdef _ADX_ -+.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 -+ mov rdx, \M0 -+ mulx \T0, \T1, \M1 // T0:T1 = A0*B0 -+ mov \C, \T1 // C0_final -+ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 -+ xor rax, rax -+ adox \T0, \T2 -+ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 -+ adox \T1, \T3 -+ mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 -+ adox \T2, \T4 -+ -+ mov rdx, 8\M0 -+ mulx \T5, \T4, \M1 // T5:T4 = A1*B0 -+ adox \T3, rax -+ xor rax, rax -+ mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 -+ adox \T4, \T0 -+ mov 8\C, \T4 // C1_final -+ adcx \T5, \T7 -+ mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 -+ adcx \T6, \T8 -+ adox \T5, \T1 -+ mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 -+ adcx \T7, \T9 -+ adcx \T8, rax -+ adox \T6, \T2 -+ -+ mov rdx, 16\M0 -+ mulx \T1, \T0, \M1 // T1:T0 = A2*B0 -+ adox \T7, \T3 -+ adox \T8, rax -+ xor rax, rax -+ mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 -+ adox \T0, \T5 -+ mov 16\C, \T0 // C2_final -+ adcx \T1, \T3 -+ mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 -+ adcx \T2, \T4 -+ adox \T1, \T6 -+ mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 -+ adcx \T3, \T9 -+ mov rdx, 24\M0 -+ adcx \T4, rax -+ -+ adox \T2, \T7 -+ adox \T3, \T8 -+ adox \T4, rax -+ -+ mulx \T5, \T0, \M1 // T5:T0 = A3*B0 -+ xor rax, rax -+ mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 -+ adcx \T5, \T7 -+ adox \T1, \T0 -+ mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 -+ adcx \T6, \T8 -+ adox \T2, \T5 -+ mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 -+ adcx \T7, \T9 -+ adcx \T8, rax -+ -+ adox \T3, \T6 -+ adox \T4, \T7 -+ adox \T8, rax -+ mov 24\C, \T1 // C3_final -+ mov 32\C, \T2 // C4_final -+ mov 40\C, \T3 // C5_final -+ mov 48\C, \T4 // C6_final -+ mov 56\C, \T8 // C7_final -+.endm -+ -+#else -+ -+.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 -+ mov rdx, \M0 -+ mulx \T0, \T1, \M1 // T0:T1 = A0*B0 -+ mov \C, \T1 // C0_final -+ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 -+ add \T0, \T2 -+ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 -+ adc \T1, \T3 -+ 
mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 -+ adc \T2, \T4 -+ mov rdx, 8\M0 -+ adc \T3, 0 -+ -+ mulx \T5, \T4, \M1 // T5:T4 = A1*B0 -+ mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 -+ add \T5, \T7 -+ mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 -+ adc \T6, \T8 -+ mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 -+ adc \T7, \T9 -+ adc \T8, 0 -+ -+ add \T4, \T0 -+ mov 8\C, \T4 // C1_final -+ adc \T5, \T1 -+ adc \T6, \T2 -+ adc \T7, \T3 -+ mov rdx, 16\M0 -+ adc \T8, 0 -+ -+ mulx \T1, \T0, \M1 // T1:T0 = A2*B0 -+ mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 -+ add \T1, \T3 -+ mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 -+ adc \T2, \T4 -+ mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 -+ adc \T3, \T9 -+ mov rdx, 24\M0 -+ adc \T4, 0 -+ -+ add \T0, \T5 -+ mov 16\C, \T0 // C2_final -+ adc \T1, \T6 -+ adc \T2, \T7 -+ adc \T3, \T8 -+ adc \T4, 0 -+ -+ mulx \T5, \T0, \M1 // T5:T0 = A3*B0 -+ mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 -+ add \T5, \T7 -+ mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 -+ adc \T6, \T8 -+ mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 -+ adc \T7, \T9 -+ adc \T8, 0 -+ -+ add \T1, \T0 -+ mov 24\C, \T1 // C3_final -+ adc \T2, \T5 -+ mov 32\C, \T2 // C4_final -+ adc \T3, \T6 -+ mov 40\C, \T3 // C5_final -+ adc \T4, \T7 -+ mov 48\C, \T4 // C6_final -+ adc \T8, 0 -+ mov 56\C, \T8 // C7_final -+.endm -+#endif -+ -+ -+//***************************************************************************** -+// 503-bit multiplication using Karatsuba (one level), schoolbook (one level) -+//***************************************************************************** -+.global mul503_asm -+mul503_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ mov rcx, reg_p3 -+ -+ // r8-r11 <- AH + AL, rax <- mask -+ xor rax, rax -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ push rbx -+ push rbp -+ sub rsp, 96 -+ add r8, [reg_p1+32] -+ adc r9, [reg_p1+40] -+ adc r10, [reg_p1+48] -+ adc r11, [reg_p1+56] -+ sbb rax, 0 -+ mov [rsp], r8 -+ mov [rsp+8], r9 -+ mov [rsp+16], r10 -+ mov [rsp+24], r11 -+ -+ // r12-r15 
<- BH + BL, rbx <- mask -+ xor rbx, rbx -+ mov r12, [reg_p2] -+ mov r13, [reg_p2+8] -+ mov r14, [reg_p2+16] -+ mov r15, [reg_p2+24] -+ add r12, [reg_p2+32] -+ adc r13, [reg_p2+40] -+ adc r14, [reg_p2+48] -+ adc r15, [reg_p2+56] -+ sbb rbx, 0 -+ mov [rsp+32], r12 -+ mov [rsp+40], r13 -+ mov [rsp+48], r14 -+ mov [rsp+56], r15 -+ -+ // r12-r15 <- masked (BH + BL) -+ and r12, rax -+ and r13, rax -+ and r14, rax -+ and r15, rax -+ -+ // r8-r11 <- masked (AH + AL) -+ and r8, rbx -+ and r9, rbx -+ and r10, rbx -+ and r11, rbx -+ -+ // r8-r11 <- masked (AH + AL) + masked (AH + AL) -+ add r8, r12 -+ adc r9, r13 -+ adc r10, r14 -+ adc r11, r15 -+ mov [rsp+64], r8 -+ mov [rsp+72], r9 -+ mov [rsp+80], r10 -+ mov [rsp+88], r11 -+ -+ // [rcx+64] <- (AH+AL) x (BH+BL), low part -+ MUL256_SCHOOL [rsp], [rsp+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp -+ -+ // [rcx] <- AL x BL -+ MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 -+ -+ // [rsp] <- AH x BH -+ MUL256_SCHOOL [reg_p1+32], [reg_p2+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp -+ -+ // r8-r11 <- (AH+AL) x (BH+BL), final step -+ mov r8, [rsp+64] -+ mov r9, [rsp+72] -+ mov r10, [rsp+80] -+ mov r11, [rsp+88] -+ mov rax, [rcx+96] -+ add r8, rax -+ mov rax, [rcx+104] -+ adc r9, rax -+ mov rax, [rcx+112] -+ adc r10, rax -+ mov rax, [rcx+120] -+ adc r11, rax -+ -+ // [rcx+64], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL -+ mov r12, [rcx+64] -+ mov r13, [rcx+72] -+ mov r14, [rcx+80] -+ mov r15, [rcx+88] -+ sub r12, [rcx] -+ sbb r13, [rcx+8] -+ sbb r14, [rcx+16] -+ sbb r15, [rcx+24] -+ sbb r8, [rcx+32] -+ sbb r9, [rcx+40] -+ sbb r10, [rcx+48] -+ sbb r11, [rcx+56] -+ -+ // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH -+ sub r12, [rsp] -+ sbb r13, [rsp+8] -+ sbb r14, [rsp+16] -+ sbb r15, [rsp+24] -+ sbb r8, [rsp+32] -+ sbb r9, [rsp+40] -+ sbb r10, [rsp+48] -+ sbb r11, [rsp+56] -+ -+ add r12, [rcx+32] -+ mov [rcx+32], r12 // Result C4-C7 -+ adc r13, [rcx+40] -+ 
mov [rcx+40], r13 -+ adc r14, [rcx+48] -+ mov [rcx+48], r14 -+ adc r15, [rcx+56] -+ mov [rcx+56], r15 -+ mov rax, [rsp] -+ adc r8, rax -+ mov [rcx+64], r8 // Result C8-C15 -+ mov rax, [rsp+8] -+ adc r9, rax -+ mov [rcx+72], r9 -+ mov rax, [rsp+16] -+ adc r10, rax -+ mov [rcx+80], r10 -+ mov rax, [rsp+24] -+ adc r11, rax -+ mov [rcx+88], r11 -+ mov r12, [rsp+32] -+ adc r12, 0 -+ mov [rcx+96], r12 -+ mov r13, [rsp+40] -+ adc r13, 0 -+ mov [rcx+104], r13 -+ mov r14, [rsp+48] -+ adc r14, 0 -+ mov [rcx+112], r14 -+ mov r15, [rsp+56] -+ adc r15, 0 -+ mov [rcx+120], r15 -+ -+ add rsp, 96 -+ pop rbp -+ pop rbx -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+#else -+ -+//*********************************************************************** -+// Integer multiplication -+// Based on Karatsuba method -+// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] -+// NOTE: a=c or b=c are not allowed -+//*********************************************************************** -+.global mul503_asm -+mul503_asm: -+ push r12 -+ push r13 -+ push r14 -+ mov rcx, reg_p3 -+ -+ // rcx[0-3] <- AH+AL -+ xor rax, rax -+ mov r8, [reg_p1+32] -+ mov r9, [reg_p1+40] -+ mov r10, [reg_p1+48] -+ mov r11, [reg_p1+56] -+ add r8, [reg_p1] -+ adc r9, [reg_p1+8] -+ adc r10, [reg_p1+16] -+ adc r11, [reg_p1+24] -+ push r15 -+ mov [rcx], r8 -+ mov [rcx+8], r9 -+ mov [rcx+16], r10 -+ mov [rcx+24], r11 -+ sbb rax, 0 -+ sub rsp, 80 // Allocating space in stack -+ -+ // r12-r15 <- BH+BL -+ xor rdx, rdx -+ mov r12, [reg_p2+32] -+ mov r13, [reg_p2+40] -+ mov r14, [reg_p2+48] -+ mov r15, [reg_p2+56] -+ add r12, [reg_p2] -+ adc r13, [reg_p2+8] -+ adc r14, [reg_p2+16] -+ adc r15, [reg_p2+24] -+ sbb rdx, 0 -+ mov [rsp+64], rax -+ mov [rsp+72], rdx -+ -+ // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL) -+ mov rax, [rcx] -+ mul r12 -+ mov [rsp], rax // c0 -+ mov r8, rdx -+ -+ xor r9, r9 -+ mov rax, [rcx] -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ -+ xor r10, r10 -+ mov rax, [rcx+8] -+ mul r12 -+ add r8, rax -+ mov 
[rsp+8], r8 // c1 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [rcx] -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+16] -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+8] -+ mul r13 -+ add r9, rax -+ mov [rsp+16], r9 // c2 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [rcx] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+24] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+8] -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+16] -+ mul r13 -+ add r10, rax -+ mov [rsp+24], r10 // c3 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [rcx+8] -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+24] -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+16] -+ mul r14 -+ add r8, rax -+ mov [rsp+32], r8 // c4 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r11, r11 -+ mov rax, [rcx+16] -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r11, 0 -+ -+ mov rax, [rcx+24] -+ mul r14 -+ add r9, rax // c5 -+ adc r10, rdx -+ adc r11, 0 -+ -+ mov rax, [rcx+24] -+ mul r15 -+ add r10, rax // c6 -+ adc r11, rdx // c7 -+ -+ mov rax, [rsp+64] -+ and r12, rax -+ and r13, rax -+ and r14, rax -+ and r15, rax -+ add r12, r8 -+ adc r13, r9 -+ adc r14, r10 -+ adc r15, r11 -+ -+ mov rax, [rsp+72] -+ mov r8, [rcx] -+ mov r9, [rcx+8] -+ mov r10, [rcx+16] -+ mov r11, [rcx+24] -+ and r8, rax -+ and r9, rax -+ and r10, rax -+ and r11, rax -+ add r8, r12 -+ adc r9, r13 -+ adc r10, r14 -+ adc r11, r15 -+ mov [rsp+32], r8 -+ mov [rsp+40], r9 -+ mov [rsp+48], r10 -+ mov [rsp+56], r11 -+ -+ // rcx[0-7] <- AL*BL -+ mov r11, [reg_p1] -+ mov rax, [reg_p2] -+ mul r11 -+ xor r9, r9 -+ mov [rcx], rax // c0 -+ mov r8, rdx -+ -+ mov r14, [reg_p1+16] -+ mov rax, [reg_p2+8] -+ mul r11 -+ xor r10, r10 -+ add r8, rax -+ adc r9, rdx -+ -+ mov r12, [reg_p1+8] -+ mov rax, [reg_p2] -+ mul r12 -+ add r8, rax -+ mov [rcx+8], 
r8 // c1 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+16] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p2] -+ mov rax, r14 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+8] -+ mul r12 -+ add r9, rax -+ mov [rcx+16], r9 // c2 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [reg_p2+24] -+ mul r11 -+ mov r15, [reg_p1+24] -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, r15 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+8] -+ mul r14 -+ add r10, rax -+ mov [rcx+24], r10 // c3 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [reg_p2+24] -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+8] -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r14 -+ add r8, rax -+ mov [rcx+32], r8 // c4 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+24] -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r15 -+ add r9, rax -+ mov [rcx+40], r9 // c5 -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+24] -+ mul r15 -+ add r10, rax -+ mov [rcx+48], r10 // c6 -+ adc r8, rdx -+ mov [rcx+56], r8 // c7 -+ -+ // rcx[8-15] <- AH*BH -+ mov r11, [reg_p1+32] -+ mov rax, [reg_p2+32] -+ mul r11 -+ xor r9, r9 -+ mov [rcx+64], rax // c0 -+ mov r8, rdx -+ -+ mov r14, [reg_p1+48] -+ mov rax, [reg_p2+40] -+ mul r11 -+ xor r10, r10 -+ add r8, rax -+ adc r9, rdx -+ -+ mov r12, [reg_p1+40] -+ mov rax, [reg_p2+32] -+ mul r12 -+ add r8, rax -+ mov [rcx+72], r8 // c1 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+48] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p2+32] -+ mov rax, r14 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+40] -+ mul r12 -+ add r9, rax -+ mov [rcx+80], r9 // c2 -+ adc 
r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [reg_p2+56] -+ mul r11 -+ mov r15, [reg_p1+56] -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, r15 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+48] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+40] -+ mul r14 -+ add r10, rax -+ mov [rcx+88], r10 // c3 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [reg_p2+56] -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+40] -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+48] -+ mul r14 -+ add r8, rax -+ mov [rcx+96], r8 // c4 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+56] -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+48] -+ mul r15 -+ add r9, rax -+ mov [rcx+104], r9 // c5 -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+56] -+ mul r15 -+ add r10, rax -+ mov [rcx+112], r10 // c6 -+ adc r8, rdx -+ mov [rcx+120], r8 // c7 -+ -+ // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL -+ mov r8, [rsp] -+ sub r8, [rcx] -+ mov r9, [rsp+8] -+ sbb r9, [rcx+8] -+ mov r10, [rsp+16] -+ sbb r10, [rcx+16] -+ mov r11, [rsp+24] -+ sbb r11, [rcx+24] -+ mov r12, [rsp+32] -+ sbb r12, [rcx+32] -+ mov r13, [rsp+40] -+ sbb r13, [rcx+40] -+ mov r14, [rsp+48] -+ sbb r14, [rcx+48] -+ mov r15, [rsp+56] -+ sbb r15, [rcx+56] -+ -+ // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH -+ mov rax, [rcx+64] -+ sub r8, rax -+ mov rax, [rcx+72] -+ sbb r9, rax -+ mov rax, [rcx+80] -+ sbb r10, rax -+ mov rax, [rcx+88] -+ sbb r11, rax -+ mov rax, [rcx+96] -+ sbb r12, rax -+ mov rdx, [rcx+104] -+ sbb r13, rdx -+ mov rdi, [rcx+112] -+ sbb r14, rdi -+ mov rsi, [rcx+120] -+ sbb r15, rsi -+ -+ // Final result -+ add r8, [rcx+32] -+ mov [rcx+32], r8 -+ adc r9, [rcx+40] -+ mov [rcx+40], r9 -+ adc r10, [rcx+48] -+ mov [rcx+48], r10 -+ adc r11, [rcx+56] -+ mov [rcx+56], r11 -+ adc r12, [rcx+64] -+ mov [rcx+64], r12 -+ adc r13, [rcx+72] -+ mov 
[rcx+72], r13 -+ adc r14, [rcx+80] -+ mov [rcx+80], r14 -+ adc r15, [rcx+88] -+ mov [rcx+88], r15 -+ adc rax, 0 -+ mov [rcx+96], rax -+ adc rdx, 0 -+ mov [rcx+104], rdx -+ adc rdi, 0 -+ mov [rcx+112], rdi -+ adc rsi, 0 -+ mov [rcx+120], rsi -+ -+ add rsp, 80 // Restoring space in stack -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+#endif -+ -+ -+#ifdef _MULX_ -+ -+///////////////////////////////////////////////////////////////// MACRO -+// Schoolbook integer multiplication -+// Inputs: memory pointers M0 and M1 -+// Outputs: regs T0:T6 -+// Temps: regs T7:T9 -+///////////////////////////////////////////////////////////////// -+.macro MUL128x320_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 -+ mov rdx, \M0 -+ mulx \T1, \T0, \M1 // T0 <- C0_final -+ mulx \T2, \T4, 8\M1 -+ xor rax, rax -+ mulx \T3, \T5, 16\M1 -+ ADD1 \T1, \T4 -+ ADC1 \T2, \T5 -+ mulx \T4, \T7, 24\M1 -+ ADC1 \T3, \T7 -+ mulx \T5, \T6, 32\M1 -+ ADC1 \T4, \T6 -+ ADC1 \T5, rax -+ -+ mov rdx, 8\M0 -+ mulx \T7, \T6, \M1 -+ ADD2 \T1, \T6 // T1 <- C1_final -+ ADC2 \T2, \T7 -+ mulx \T6, \T8, 8\M1 -+ ADC2 \T3, \T6 -+ mulx \T9, \T7, 16\M1 -+ ADC2 \T4, \T9 -+ mulx \T6, \T9, 24\M1 -+ ADC2 \T5, \T6 -+ mulx \T6, rdx, 32\M1 -+ ADC2 \T6, rax -+ -+ xor rax, rax -+ ADD1 \T2, \T8 -+ ADC1 \T3, \T7 -+ ADC1 \T4, \T9 -+ ADC1 \T5, rdx -+ ADC1 \T6, rax -+.endm -+ -+ -+//************************************************************************************** -+// Montgomery reduction -+// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 -+// Operation: c [reg_p2] = a [reg_p1] -+// NOTE: a=c is not allowed -+//************************************************************************************** -+.global rdc503_asm -+rdc503_asm: -+ push rbx -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ // a[0-1] x p503p1_nz --> result: r8:r14 -+ MUL128x320_SCHOOL [reg_p1], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 -+ -+ xor r15, r15 -+ add r8, [reg_p1+24] -+ adc r9, [reg_p1+32] -+ adc r10, [reg_p1+40] -+ adc r11, [reg_p1+48] -+ adc r12, [reg_p1+56] -+ adc r13, [reg_p1+64] -+ adc r14, [reg_p1+72] -+ adc r15, [reg_p1+80] -+ mov [reg_p1+24], r8 -+ mov [reg_p1+32], r9 -+ mov [reg_p1+40], r10 -+ mov [reg_p1+48], r11 -+ mov [reg_p1+56], r12 -+ mov [reg_p1+64], r13 -+ mov [reg_p1+72], r14 -+ mov [reg_p1+80], r15 -+ mov r8, [reg_p1+88] -+ mov r9, [reg_p1+96] -+ mov r10, [reg_p1+104] -+ mov r11, [reg_p1+112] -+ mov r12, [reg_p1+120] -+ adc r8, 0 -+ adc r9, 0 -+ adc r10, 0 -+ adc r11, 0 -+ adc r12, 0 -+ mov [reg_p1+88], r8 -+ mov [reg_p1+96], r9 -+ mov [reg_p1+104], r10 -+ mov [reg_p1+112], r11 -+ mov [reg_p1+120], r12 -+ -+ // a[2-3] x p503p1_nz --> result: r8:r14 -+ MUL128x320_SCHOOL [reg_p1+16], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 -+ -+ xor r15, r15 -+ add r8, [reg_p1+40] -+ adc r9, [reg_p1+48] -+ adc r10, [reg_p1+56] -+ adc r11, [reg_p1+64] -+ adc r12, [reg_p1+72] -+ adc r13, [reg_p1+80] -+ adc r14, [reg_p1+88] -+ adc r15, [reg_p1+96] -+ mov [reg_p1+40], r8 -+ mov [reg_p1+48], r9 -+ mov [reg_p1+56], r10 -+ mov [reg_p1+64], r11 -+ mov [reg_p1+72], r12 -+ mov [reg_p1+80], r13 -+ mov [reg_p1+88], r14 -+ mov [reg_p1+96], r15 -+ mov r8, [reg_p1+104] -+ mov r9, [reg_p1+112] -+ mov r10, [reg_p1+120] -+ adc r8, 0 -+ adc r9, 0 -+ adc r10, 0 -+ mov [reg_p1+104], r8 -+ mov [reg_p1+112], r9 -+ mov [reg_p1+120], r10 -+ -+ // a[4-5] x p503p1_nz --> result: r8:r14 -+ MUL128x320_SCHOOL [reg_p1+32], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, 
r15 -+ -+ xor r15, r15 -+ xor rbx, rbx -+ add r8, [reg_p1+56] -+ adc r9, [reg_p1+64] -+ adc r10, [reg_p1+72] -+ adc r11, [reg_p1+80] -+ adc r12, [reg_p1+88] -+ adc r13, [reg_p1+96] -+ adc r14, [reg_p1+104] -+ adc r15, [reg_p1+112] -+ adc rbx, [reg_p1+120] -+ mov [reg_p1+56], r8 -+ mov [reg_p2], r9 // Final result c0 -+ mov [reg_p1+72], r10 -+ mov [reg_p1+80], r11 -+ mov [reg_p1+88], r12 -+ mov [reg_p1+96], r13 -+ mov [reg_p1+104], r14 -+ mov [reg_p1+112], r15 -+ mov [reg_p1+120], rbx -+ -+ // a[6-7] x p503p1_nz --> result: r8:r14 -+ MUL128x320_SCHOOL [reg_p1+48], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 -+ -+ // Final result c1:c7 -+ add r8, [reg_p1+72] -+ adc r9, [reg_p1+80] -+ adc r10, [reg_p1+88] -+ adc r11, [reg_p1+96] -+ adc r12, [reg_p1+104] -+ adc r13, [reg_p1+112] -+ adc r14, [reg_p1+120] -+ mov [reg_p2+8], r8 -+ mov [reg_p2+16], r9 -+ mov [reg_p2+24], r10 -+ mov [reg_p2+32], r11 -+ mov [reg_p2+40], r12 -+ mov [reg_p2+48], r13 -+ mov [reg_p2+56], r14 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ pop rbx -+ ret -+ -+ #else -+ -+//*********************************************************************** -+// Montgomery reduction -+// Based on comba method -+// Operation: c [reg_p2] = a [reg_p1] -+// NOTE: a=c is not allowed -+//*********************************************************************** -+.global rdc503_asm -+rdc503_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ mov r11, [reg_p1] -+ movq rax, p503p1_3 -+ mul r11 -+ xor r8, r8 -+ add rax, [reg_p1+24] -+ mov [reg_p2+24], rax // z3 -+ adc r8, rdx -+ -+ xor r9, r9 -+ movq rax, p503p1_4 -+ mul r11 -+ xor r10, r10 -+ add r8, rax -+ adc r9, rdx -+ -+ mov r12, [reg_p1+8] -+ movq rax, p503p1_3 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+32] -+ mov [reg_p2+32], r8 // z4 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p503p1_5 -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_4 -+ mul r12 -+ add r9, rax -+ adc 
r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p1+16] -+ movq rax, p503p1_3 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+40] -+ mov [reg_p2+40], r9 // z5 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p503p1_6 -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_5 -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_4 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r14, [reg_p2+24] -+ movq rax, p503p1_3 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+48] -+ mov [reg_p2+48], r10 // z6 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p503p1_7 -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_6 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_5 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_4 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r15, [reg_p2+32] -+ movq rax, p503p1_3 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+56] -+ mov [reg_p2+56], r8 // z7 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p503p1_7 -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_6 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_5 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_4 -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rcx, [reg_p2+40] -+ movq rax, p503p1_3 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+64] -+ mov [reg_p2], r9 // z0 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p503p1_7 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_6 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_5 -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_4 -+ mul rcx -+ add r10, rax -+ adc r8, rdx -+ 
adc r9, 0 -+ -+ mov r13, [reg_p2+48] -+ movq rax, p503p1_3 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+72] -+ mov [reg_p2+8], r10 // z1 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p503p1_7 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_6 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_5 -+ mul rcx -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_4 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r14, [reg_p2+56] -+ movq rax, p503p1_3 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+80] -+ mov [reg_p2+16], r8 // z2 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p503p1_7 -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_6 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_5 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p503p1_4 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+88] -+ mov [reg_p2+24], r9 // z3 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p503p1_7 -+ mul rcx -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_6 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p503p1_5 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+96] -+ mov [reg_p2+32], r10 // z4 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p503p1_7 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p503p1_6 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+104] // z5 -+ mov [reg_p2+40], r8 // z5 -+ adc r9, 0 -+ adc r10, 0 -+ -+ movq rax, p503p1_7 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ add r9, [reg_p1+112] // z6 -+ mov [reg_p2+48], r9 // z6 -+ adc r10, 0 -+ add r10, [reg_p1+120] // z7 -+ mov [reg_p2+56], r10 // z7 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+ #endif -+ -+ 
-+//*********************************************************************** -+// 503-bit multiprecision addition -+// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -+//*********************************************************************** -+.global mp_add503_asm -+mp_add503_asm: -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ add r8, [reg_p2] -+ adc r9, [reg_p2+8] -+ adc r10, [reg_p2+16] -+ adc r11, [reg_p2+24] -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ -+ mov r8, [reg_p1+32] -+ mov r9, [reg_p1+40] -+ mov r10, [reg_p1+48] -+ mov r11, [reg_p1+56] -+ adc r8, [reg_p2+32] -+ adc r9, [reg_p2+40] -+ adc r10, [reg_p2+48] -+ adc r11, [reg_p2+56] -+ mov [reg_p3+32], r8 -+ mov [reg_p3+40], r9 -+ mov [reg_p3+48], r10 -+ mov [reg_p3+56], r11 -+ ret -+ -+ -+//*********************************************************************** -+// 2x503-bit multiprecision subtraction -+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
Returns borrow mask -+//*********************************************************************** -+.global mp_sub503x2_asm -+mp_sub503x2_asm: -+ xor rax, rax -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov rcx, [reg_p1+32] -+ sub r8, [reg_p2] -+ sbb r9, [reg_p2+8] -+ sbb r10, [reg_p2+16] -+ sbb r11, [reg_p2+24] -+ sbb rcx, [reg_p2+32] -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], rcx -+ -+ mov r8, [reg_p1+40] -+ mov r9, [reg_p1+48] -+ mov r10, [reg_p1+56] -+ mov r11, [reg_p1+64] -+ mov rcx, [reg_p1+72] -+ sbb r8, [reg_p2+40] -+ sbb r9, [reg_p2+48] -+ sbb r10, [reg_p2+56] -+ sbb r11, [reg_p2+64] -+ sbb rcx, [reg_p2+72] -+ mov [reg_p3+40], r8 -+ mov [reg_p3+48], r9 -+ mov [reg_p3+56], r10 -+ mov [reg_p3+64], r11 -+ mov [reg_p3+72], rcx -+ -+ mov r8, [reg_p1+80] -+ mov r9, [reg_p1+88] -+ mov r10, [reg_p1+96] -+ mov r11, [reg_p1+104] -+ mov rcx, [reg_p1+112] -+ sbb r8, [reg_p2+80] -+ sbb r9, [reg_p2+88] -+ sbb r10, [reg_p2+96] -+ sbb r11, [reg_p2+104] -+ sbb rcx, [reg_p2+112] -+ mov [reg_p3+80], r8 -+ mov [reg_p3+88], r9 -+ mov [reg_p3+96], r10 -+ mov [reg_p3+104], r11 -+ mov [reg_p3+112], rcx -+ -+ mov r8, [reg_p1+120] -+ sbb r8, [reg_p2+120] -+ sbb rax, 0 -+ mov [reg_p3+120], r8 -+ ret -+ -+ -+//*********************************************************************** -+// Double 2x503-bit multiprecision subtraction -+// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -+//*********************************************************************** -+.global mp_dblsub503x2_asm -+mp_dblsub503x2_asm: -+ push r12 -+ push r13 -+ push r14 -+ -+ xor rax, rax -+ mov r8, [reg_p3] -+ mov r9, [reg_p3+8] -+ mov r10, [reg_p3+16] -+ mov r11, [reg_p3+24] -+ mov r12, [reg_p3+32] -+ mov r13, [reg_p3+40] -+ mov r14, [reg_p3+48] -+ mov rcx, [reg_p3+56] -+ sub r8, [reg_p1] -+ sbb r9, [reg_p1+8] -+ sbb r10, [reg_p1+16] -+ sbb r11, [reg_p1+24] -+ sbb r12, [reg_p1+32] -+ 
sbb r13, [reg_p1+40] -+ sbb r14, [reg_p1+48] -+ sbb rcx, [reg_p1+56] -+ adc rax, 0 -+ sub r8, [reg_p2] -+ sbb r9, [reg_p2+8] -+ sbb r10, [reg_p2+16] -+ sbb r11, [reg_p2+24] -+ sbb r12, [reg_p2+32] -+ sbb r13, [reg_p2+40] -+ sbb r14, [reg_p2+48] -+ sbb rcx, [reg_p2+56] -+ adc rax, 0 -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], r12 -+ mov [reg_p3+40], r13 -+ mov [reg_p3+48], r14 -+ mov [reg_p3+56], rcx -+ -+ mov r8, [reg_p3+64] -+ mov r9, [reg_p3+72] -+ mov r10, [reg_p3+80] -+ mov r11, [reg_p3+88] -+ mov r12, [reg_p3+96] -+ mov r13, [reg_p3+104] -+ mov r14, [reg_p3+112] -+ mov rcx, [reg_p3+120] -+ sub r8, rax -+ sbb r8, [reg_p1+64] -+ sbb r9, [reg_p1+72] -+ sbb r10, [reg_p1+80] -+ sbb r11, [reg_p1+88] -+ sbb r12, [reg_p1+96] -+ sbb r13, [reg_p1+104] -+ sbb r14, [reg_p1+112] -+ sbb rcx, [reg_p1+120] -+ sub r8, [reg_p2+64] -+ sbb r9, [reg_p2+72] -+ sbb r10, [reg_p2+80] -+ sbb r11, [reg_p2+88] -+ sbb r12, [reg_p2+96] -+ sbb r13, [reg_p2+104] -+ sbb r14, [reg_p2+112] -+ sbb rcx, [reg_p2+120] -+ mov [reg_p3+64], r8 -+ mov [reg_p3+72], r9 -+ mov [reg_p3+80], r10 -+ mov [reg_p3+88], r11 -+ mov [reg_p3+96], r12 -+ mov [reg_p3+104], r13 -+ mov [reg_p3+112], r14 -+ mov [reg_p3+120], rcx -+ -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -diff --git a/third_party/sidh/src/P503/ARM64/fp_arm64.c b/third_party/sidh/src/P503/ARM64/fp_arm64.c -new file mode 100644 -index 00000000..e92c40d6 ---- /dev/null -+++ b/third_party/sidh/src/P503/ARM64/fp_arm64.c -@@ -0,0 +1,93 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P503 -+*********************************************************************************************/ -+ -+#include "../P503_internal.h" -+ -+// Global constants -+extern const uint64_t p503[NWORDS_FIELD]; -+extern const 
uint64_t p503x2[NWORDS_FIELD]; -+ -+ -+__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular addition, c = a+b mod p503. -+ // Inputs: a, b in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ -+ fpadd503_asm(a, b, c); -+} -+ -+ -+__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular subtraction, c = a-b mod p503. -+ // Inputs: a, b in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ -+ fpsub503_asm(a, b, c); -+} -+ -+ -+__inline void fpneg503(digit_t* a) -+{ // Modular negation, a = -a mod p503. -+ // Input/output: a in [0, 2*p503-1] -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, ((digit_t*)p503x2)[i], a[i], borrow, a[i]); -+ } -+} -+ -+ -+void fpdiv2_503(const digit_t* a, digit_t* c) -+{ // Modular division by two, c = a/2 mod p503. -+ // Input : a in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p521 -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], ((digit_t*)p503)[i] & mask, carry, c[i]); -+ } -+ -+ mp_shiftr1(c, NWORDS_FIELD); -+} -+ -+ -+void fpcorrection503(digit_t* a) -+{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], ((digit_t*)p503)[i], borrow, a[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, a[i], ((digit_t*)p503)[i] & mask, borrow, a[i]); -+ } -+} -+ -+ -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. -+ -+ UNREFERENCED_PARAMETER(nwords); -+ -+ mul503_asm(a, b, c); -+} -+ -+ -+ -+void rdc_mont(const digit_t* ma, digit_t* mc) -+{ // Montgomery reduction exploiting special form of the prime. 
-+ // mc = ma*R^-1 mod p503x2, where R = 2^512. -+ // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. -+ // ma is assumed to be in Montgomery representation. -+ -+ rdc503_asm(ma, mc); -+} -diff --git a/third_party/sidh/src/P503/ARM64/fp_arm64_asm.S b/third_party/sidh/src/P503/ARM64/fp_arm64_asm.S -new file mode 100644 -index 00000000..ada3a40f ---- /dev/null -+++ b/third_party/sidh/src/P503/ARM64/fp_arm64_asm.S -@@ -0,0 +1,829 @@ -+//******************************************************************************************* -+// SIDH: an efficient supersingular isogeny cryptography library -+// -+// Abstract: field arithmetic in 64-bit ARMv8 assembly for P503 on Linux -+//******************************************************************************************* -+ -+.data -+ -+// p503 + 1 -+p503p1: -+.quad 0xAC00000000000000 -+.quad 0x13085BDA2211E7A0 -+.quad 0x1B9BF6C87B7E7DAF -+.quad 0x6045C6BDDA77A4D0 -+.quad 0x004066F541811E1E -+ -+// 2 * p503 -+p503x2: -+.quad 0xFFFFFFFFFFFFFFFE -+.quad 0xFFFFFFFFFFFFFFFF -+.quad 0x57FFFFFFFFFFFFFF -+.quad 0x2610B7B44423CF41 -+.quad 0x3737ED90F6FCFB5E -+.quad 0xC08B8D7BB4EF49A0 -+.quad 0x0080CDEA83023C3C -+ -+p503p1_nz_s8: -+.quad 0x85BDA2211E7A0AC -+.quad 0x9BF6C87B7E7DAF13 -+.quad 0x45C6BDDA77A4D01B -+.quad 0x4066F541811E1E60 -+ -+ -+.text -+//*********************************************************************** -+// Field addition -+// Operation: c [x2] = a [x0] + b [x1] -+//*********************************************************************** -+.global fpadd503_asm -+fpadd503_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ -+ // Add a + b -+ adds x3, x3, x11 -+ adcs x4, x4, x12 -+ adcs x5, x5, x13 -+ adcs x6, x6, x14 -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adc x10, x10, x18 -+ -+ // Subtract 2xp503 -+ ldr x11, p503x2 -+ 
ldr x12, p503x2 + 8 -+ ldr x13, p503x2 + 16 -+ ldr x14, p503x2 + 24 -+ subs x3, x3, x11 -+ sbcs x4, x4, x12 -+ sbcs x5, x5, x12 -+ sbcs x6, x6, x13 -+ sbcs x7, x7, x14 -+ ldr x15, p503x2 + 32 -+ ldr x16, p503x2 + 40 -+ ldr x17, p503x2 + 48 -+ sbcs x8, x8, x15 -+ sbcs x9, x9, x16 -+ sbcs x10, x10, x17 -+ sbc x18, xzr, xzr -+ -+ // Add 2xp503 anded with the mask in x18 -+ and x11, x11, x18 -+ and x12, x12, x18 -+ and x13, x13, x18 -+ and x14, x14, x18 -+ and x15, x15, x18 -+ and x16, x16, x18 -+ and x17, x17, x18 -+ -+ adds x3, x3, x11 -+ adcs x4, x4, x12 -+ adcs x5, x5, x12 -+ adcs x6, x6, x13 -+ adcs x7, x7, x14 -+ adcs x8, x8, x15 -+ adcs x9, x9, x16 -+ adc x10, x10, x17 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ ret -+ -+ -+//*********************************************************************** -+// Field subtraction -+// Operation: c [x2] = a [x0] - b [x1] -+//*********************************************************************** -+.global fpsub503_asm -+fpsub503_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ -+ // Subtract a - b -+ subs x3, x3, x11 -+ sbcs x4, x4, x12 -+ sbcs x5, x5, x13 -+ sbcs x6, x6, x14 -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ sbcs x7, x7, x15 -+ sbcs x8, x8, x16 -+ sbcs x9, x9, x17 -+ sbcs x10, x10, x18 -+ sbc x18, xzr, xzr -+ -+ // Add 2xp503 anded with the mask in x18 -+ ldr x11, p503x2 -+ ldr x12, p503x2 + 8 -+ ldr x13, p503x2 + 16 -+ ldr x14, p503x2 + 24 -+ and x11, x11, x18 -+ and x12, x12, x18 -+ and x13, x13, x18 -+ and x14, x14, x18 -+ ldr x15, p503x2 + 32 -+ ldr x16, p503x2 + 40 -+ ldr x17, p503x2 + 48 -+ and x15, x15, x18 -+ and x16, x16, x18 -+ and x17, x17, x18 -+ -+ adds x3, x3, x11 -+ adcs x4, x4, x12 -+ adcs x5, x5, x12 -+ adcs x6, x6, x13 -+ adcs x7, x7, x14 -+ adcs x8, x8, x15 -+ adcs x9, x9, x16 -+ adc x10, x10, x17 -+ -+ stp x3, x4, [x2,#0] 
-+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ ret -+ -+ -+//////////////////////////////////////////// MACRO -+.macro MUL128_COMBA_CUT A0, A1, B0, B1, C0, C1, C2, C3, T0 -+ mul \A0, \A1, \B0 -+ umulh \B0, \A1, \B0 -+ adds \C1, \C1, \C3 -+ adc \C2, \C2, xzr -+ -+ mul \T0, \A1, \B1 -+ umulh \B1, \A1, \B1 -+ adds \C1, \C1, \A0 -+ adcs \C2, \C2, \B0 -+ adc \C3, xzr, xzr -+ -+ adds \C2, \C2, \T0 -+ adc \C3, \C3, \B1 -+.endm -+ -+ -+//////////////////////////////////////////// MACRO -+.macro MUL256_KARATSUBA_COMBA M,A0,A1,A2,A3,B0,B1,B2,B3,C0,C1,C2,C3,C4,C5,C6,C7,T0,T1 -+ -+ // A0-A1 <- AH + AL, T0 <- mask -+ adds \A0, \A0, \A2 -+ adcs \A1, \A1, \A3 -+ adc \T0, xzr, xzr -+ -+ // C6, T1 <- BH + BL, C7 <- mask -+ adds \C6, \B0, \B2 -+ adcs \T1, \B1, \B3 -+ adc \C7, xzr, xzr -+ -+ // C0-C1 <- masked (BH + BL) -+ sub \C2, xzr, \T0 -+ sub \C3, xzr, \C7 -+ and \C0, \C6, \C2 -+ and \C1, \T1, \C2 -+ -+ // C4-C5 <- masked (AH + AL), T0 <- combined carry -+ and \C4, \A0, \C3 -+ and \C5, \A1, \C3 -+ mul \C2, \A0, \C6 -+ mul \C3, \A0, \T1 -+ and \T0, \T0, \C7 -+ -+ // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 -+ adds \C0, \C4, \C0 -+ umulh \C4, \A0, \T1 -+ adcs \C1, \C5, \C1 -+ umulh \C5, \A0, \C6 -+ adc \T0, \T0, xzr -+ -+ // C2-C5 <- (AH+AL) x (BH+BL), low part -+ MUL128_COMBA_CUT \A0, \A1, \C6, \T1, \C2, \C3, \C4, \C5, \C7 -+ ldp \A0, \A1, [\M,#0] -+ -+ // C2-C5, T0 <- (AH+AL) x (BH+BL), final part -+ adds \C4, \C0, \C4 -+ umulh \C7, \A0, \B0 -+ umulh \T1, \A0, \B1 -+ adcs \C5, \C1, \C5 -+ mul \C0, \A0, \B0 -+ mul \C1, \A0, \B1 -+ adc \T0, \T0, xzr -+ -+ // C0-C1, T1, C7 <- AL x BL -+ MUL128_COMBA_CUT \A0, \A1, \B0, \B1, \C0, \C1, \T1, \C7, \C6 -+ -+ // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL -+ mul \A0, \A2, \B2 -+ umulh \B0, \A2, \B2 -+ subs \C2, \C2, \C0 -+ sbcs \C3, \C3, \C1 -+ sbcs \C4, \C4, \T1 -+ mul \A1, \A2, \B3 -+ umulh \C6, \A2, \B3 -+ sbcs \C5, \C5, \C7 -+ sbc \T0, \T0, xzr -+ -+ // A0, A1, C6, B0 <- AH x BH -+ MUL128_COMBA_CUT \A2, \A3, \B2, 
\B3, \A0, \A1, \C6, \B0, \B1 -+ -+ // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH -+ subs \C2, \C2, \A0 -+ sbcs \C3, \C3, \A1 -+ sbcs \C4, \C4, \C6 -+ sbcs \C5, \C5, \B0 -+ sbc \T0, \T0, xzr -+ -+ adds \C2, \C2, \T1 -+ adcs \C3, \C3, \C7 -+ adcs \C4, \C4, \A0 -+ adcs \C5, \C5, \A1 -+ adcs \C6, \T0, \C6 -+ adc \C7, \B0, xzr -+.endm -+ -+ -+//*********************************************************************************** -+// 512-bit integer multiplication using Karatsuba (two levels), Comba (lower level) -+// Operation: c [x2] = a [x0] * b [x1] -+//*********************************************************************************** -+.global mul503_asm -+mul503_asm: -+ sub sp, sp, #96 -+ stp x19, x20, [sp,#0] -+ stp x21, x22, [sp,#16] -+ stp x23, x24, [sp,#32] -+ stp x25, x26, [sp,#48] -+ stp x27, x28, [sp,#64] -+ str x29, [sp, #80] -+ -+ ldp x3, x4, [x0] -+ ldp x5, x6, [x0,#16] -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ -+ // x26-x29 <- AH + AL, x7 <- mask -+ adds x26, x3, x7 -+ adcs x27, x4, x8 -+ adcs x28, x5, x9 -+ adcs x29, x6, x10 -+ adc x7, xzr, xzr -+ -+ // x11-x14 <- BH + BL, x8 <- mask -+ adds x11, x11, x15 -+ adcs x12, x12, x16 -+ adcs x13, x13, x17 -+ adcs x14, x14, x18 -+ adc x8, xzr, xzr -+ -+ // x15-x18 <- masked (BH + BL) -+ sub x9, xzr, x7 -+ sub x10, xzr, x8 -+ and x15, x11, x9 -+ and x16, x12, x9 -+ and x17, x13, x9 -+ and x18, x14, x9 -+ -+ // x19-x22 <- masked (AH + AL), x7 <- combined carry -+ and x19, x26, x10 -+ and x20, x27, x10 -+ and x21, x28, x10 -+ and x22, x29, x10 -+ and x7, x7, x8 -+ -+ // x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1 -+ adds x15, x15, x19 -+ adcs x16, x16, x20 -+ adcs x17, x17, x21 -+ adcs x18, x18, x22 -+ adc x7, x7, xzr -+ -+ // x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part -+ stp x26, x27, [x2,#0] -+ MUL256_KARATSUBA_COMBA x2, x26, x27, x28, x29, x11, x12, x13, x14, x8, x9, x10, x19, x20, 
x21, x22, x23, x24, x25 -+ -+ // x15-x18, x7 <- (AH+AL) x (BH+BL), final step -+ adds x15, x15, x20 -+ adcs x16, x16, x21 -+ adcs x17, x17, x22 -+ adcs x18, x18, x23 -+ adc x7, x7, xzr -+ -+ // x20-x27 <- AL x BL -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ MUL256_KARATSUBA_COMBA x0, x3, x4, x5, x6, x11, x12, x13, x14, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29 -+ -+ // x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL -+ subs x8, x8, x20 -+ sbcs x9, x9, x21 -+ sbcs x10, x10, x22 -+ sbcs x19, x19, x23 -+ sbcs x15, x15, x24 -+ sbcs x16, x16, x25 -+ sbcs x17, x17, x26 -+ sbcs x18, x18, x27 -+ sbc x7, x7, xzr -+ -+ stp x20, x21, [x2] -+ stp x22, x23, [x2,#16] -+ -+ ldp x3, x4, [x0,#32] -+ ldp x5, x6, [x0,#48] -+ ldp x11, x12, [x1,#32] -+ ldp x13, x14, [x1,#48] -+ -+ adds x8, x8, x24 -+ adcs x9, x9, x25 -+ adcs x10, x10, x26 -+ adcs x19, x19, x27 -+ adc x1, xzr, xzr -+ -+ // x20-x27 <- AH x BH -+ add x0, x0, #32 -+ MUL256_KARATSUBA_COMBA x0, x3, x4, x5, x6, x11, x12, x13, x14, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29 -+ neg x1, x1 -+ -+ // x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH -+ subs x8, x8, x20 -+ sbcs x9, x9, x21 -+ sbcs x10, x10, x22 -+ sbcs x19, x19, x23 -+ sbcs x15, x15, x24 -+ sbcs x16, x16, x25 -+ sbcs x17, x17, x26 -+ sbcs x18, x18, x27 -+ sbc x7, x7, xzr -+ -+ stp x8, x9, [x2,#32] -+ stp x10, x19, [x2,#48] -+ -+ adds x1, x1, #1 -+ adcs x15, x15, x20 -+ adcs x16, x16, x21 -+ adcs x17, x17, x22 -+ adcs x18, x18, x23 -+ adcs x24, x7, x24 -+ adcs x25, x25, xzr -+ adcs x26, x26, xzr -+ adc x27, x27, xzr -+ -+ stp x15, x16, [x2,#64] -+ stp x17, x18, [x2,#80] -+ stp x24, x25, [x2,#96] -+ stp x26, x27, [x2,#112] -+ -+ ldp x19, x20, [sp,#0] -+ ldp x21, x22, [sp,#16] -+ ldp x23, x24, [sp,#32] -+ ldp x25, x26, [sp,#48] -+ ldp x27, x28, [sp,#64] -+ ldr x29, [sp,#80] -+ add sp, sp, #96 -+ ret -+ -+ -+//////////////////////////////////////////// MACRO -+.macro MUL128x256_COMBA_CUT A0, A1, B0, B1, B2, B3, C0, C1, C2, C3, C4, C5, T0, T1, T2, T3 
-+ mul \T0, \A1, \B0 -+ umulh \T1, \A1, \B0 -+ adds \C1, \C1, \C3 -+ adc \C2, \C2, xzr -+ -+ mul \T2, \A0, \B2 -+ umulh \T3, \A0, \B2 -+ adds \C1, \C1, \T0 -+ adcs \C2, \C2, \T1 -+ adc \C3, xzr, xzr -+ -+ mul \T0, \A1, \B1 -+ umulh \T1, \A1, \B1 -+ adds \C2, \C2, \T2 -+ adcs \C3, \C3, \T3 -+ adc \C4, xzr, xzr -+ -+ mul \T2, \A0, \B3 -+ umulh \T3, \A0, \B3 -+ adds \C2, \C2, \T0 -+ adcs \C3, \C3, \T1 -+ adc \C4, \C4, xzr -+ -+ mul \T0, \A1, \B2 -+ umulh \T1, \A1, \B2 -+ adds \C3, \C3, \T2 -+ adcs \C4, \C4, \T3 -+ adc \C5, xzr, xzr -+ -+ mul \T2, \A1, \B3 -+ umulh \T3, \A1, \B3 -+ adds \C3, \C3, \T0 -+ adcs \C4, \C4, \T1 -+ adc \C5, \C5, xzr -+ adds \C4, \C4, \T2 -+ adc \C5, \C5, \T3 -+.endm -+ -+ -+//************************************************************************************** -+// Montgomery reduction -+// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 -+// Operation: mc [x1] = ma [x0] -+// NOTE: ma=mc is not allowed -+//************************************************************************************** -+.global rdc503_asm -+rdc503_asm: -+ sub sp, sp, #96 -+ stp x19, x20, [sp] -+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ stp x25, x26, [sp, #48] -+ stp x27, x28, [sp, #64] -+ stp x29, x30, [sp, #80] -+ -+ ldp x2, x3, [x0,#0] // a[0-1] -+ -+ // Load the prime constant -+ ldr x24, p503p1_nz_s8 + 0 -+ ldr x25, p503p1_nz_s8 + 8 -+ ldr x26, p503p1_nz_s8 + 16 -+ ldr x27, p503p1_nz_s8 + 24 -+ -+ // a[0-1] x p503p1_nz_s8 --> result: x4:x9 -+ mul x4, x2, x24 // a[0] x p503p1_nz_s8[0] -+ umulh x7, x2, x24 -+ mul x5, x2, x25 // a[0] x p503p1_nz_s8[1] -+ umulh x6, x2, x25 -+ MUL128x256_COMBA_CUT x2, x3, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 -+ -+ ldp x3, x11, [x0,#16] // a[2] -+ ldp x12, x13, [x0,#32] -+ ldp x14, x15, [x0,#48] -+ -+ orr x10, xzr, x9, lsr #8 -+ lsl x9, x9, #56 -+ orr x9, x9, x8, lsr #8 -+ lsl x8, x8, #56 -+ orr x8, x8, x7, lsr #8 -+ lsl x7, x7, #56 -+ orr x7, x7, x6, lsr #8 -+ 
lsl x6, x6, #56 -+ orr x6, x6, x5, lsr #8 -+ lsl x5, x5, #56 -+ orr x5, x5, x4, lsr #8 -+ lsl x4, x4, #56 -+ -+ adds x11, x4, x11 // a[3] -+ adcs x12, x5, x12 // a[4] -+ adcs x13, x6, x13 -+ adcs x14, x7, x14 -+ adcs x15, x8, x15 -+ ldp x16, x17, [x0,#64] -+ ldp x18, x19, [x0,#80] -+ mul x4, x3, x24 // a[2] x p503p1_nz_s8[0] -+ umulh x7, x3, x24 -+ adcs x16, x9, x16 -+ adcs x17, x10, x17 -+ adcs x18, xzr, x18 -+ adcs x19, xzr, x19 -+ ldp x20, x21, [x0,#96] -+ ldp x22, x23, [x0,#112] -+ mul x5, x3, x25 // a[2] x p503p1_nz_s8[1] -+ umulh x6, x3, x25 -+ adcs x20, xzr, x20 -+ adcs x21, xzr, x21 -+ adcs x22, xzr, x22 -+ adc x23, xzr, x23 -+ -+ // a[2-3] x p503p1_nz_s8 --> result: x4:x9 -+ MUL128x256_COMBA_CUT x3, x11, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 -+ -+ orr x10, xzr, x9, lsr #8 -+ lsl x9, x9, #56 -+ orr x9, x9, x8, lsr #8 -+ lsl x8, x8, #56 -+ orr x8, x8, x7, lsr #8 -+ lsl x7, x7, #56 -+ orr x7, x7, x6, lsr #8 -+ lsl x6, x6, #56 -+ orr x6, x6, x5, lsr #8 -+ lsl x5, x5, #56 -+ orr x5, x5, x4, lsr #8 -+ lsl x4, x4, #56 -+ -+ adds x13, x4, x13 // a[5] -+ adcs x14, x5, x14 // a[6] -+ adcs x15, x6, x15 -+ adcs x16, x7, x16 -+ mul x4, x12, x24 // a[4] x p503p1_nz_s8[0] -+ umulh x7, x12, x24 -+ adcs x17, x8, x17 -+ adcs x18, x9, x18 -+ adcs x19, x10, x19 -+ adcs x20, xzr, x20 -+ mul x5, x12, x25 // a[4] x p503p1_nz_s8[1] -+ umulh x6, x12, x25 -+ adcs x21, xzr, x21 -+ adcs x22, xzr, x22 -+ adc x23, xzr, x23 -+ -+ // a[4-5] x p503p1_nz_s8 --> result: x4:x9 -+ MUL128x256_COMBA_CUT x12, x13, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 -+ -+ orr x10, xzr, x9, lsr #8 -+ lsl x9, x9, #56 -+ orr x9, x9, x8, lsr #8 -+ lsl x8, x8, #56 -+ orr x8, x8, x7, lsr #8 -+ lsl x7, x7, #56 -+ orr x7, x7, x6, lsr #8 -+ lsl x6, x6, #56 -+ orr x6, x6, x5, lsr #8 -+ lsl x5, x5, #56 -+ orr x5, x5, x4, lsr #8 -+ lsl x4, x4, #56 -+ -+ adds x15, x4, x15 // a[7] -+ adcs x16, x5, x16 // a[8] -+ adcs x17, x6, x17 -+ adcs x18, x7, x18 -+ mul x4, x14, x24 // 
a[6] x p503p1_nz_s8[0] -+ umulh x7, x14, x24 -+ adcs x19, x8, x19 -+ adcs x20, x9, x20 -+ adcs x21, x10, x21 -+ mul x5, x14, x25 // a[6] x p503p1_nz_s8[1] -+ umulh x6, x14, x25 -+ adcs x22, xzr, x22 -+ adc x23, xzr, x23 -+ -+ // a[6-7] x p503p1_nz_s8 --> result: x4:x9 -+ MUL128x256_COMBA_CUT x14, x15, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 -+ -+ orr x10, xzr, x9, lsr #8 -+ lsl x9, x9, #56 -+ orr x9, x9, x8, lsr #8 -+ lsl x8, x8, #56 -+ orr x8, x8, x7, lsr #8 -+ lsl x7, x7, #56 -+ orr x7, x7, x6, lsr #8 -+ lsl x6, x6, #56 -+ orr x6, x6, x5, lsr #8 -+ lsl x5, x5, #56 -+ orr x5, x5, x4, lsr #8 -+ lsl x4, x4, #56 -+ -+ adds x17, x4, x17 -+ adcs x18, x5, x18 -+ adcs x19, x6, x19 -+ adcs x20, x7, x20 -+ stp x16, x17, [x1,#0] // Final result -+ stp x18, x19, [x1,#16] -+ adcs x21, x8, x21 -+ adcs x22, x9, x22 -+ adc x23, x10, x23 -+ stp x20, x21, [x1,#32] -+ stp x22, x23, [x1,#48] -+ -+ ldp x19, x20, [sp] -+ ldp x21, x22, [sp, #16] -+ ldp x23, x24, [sp, #32] -+ ldp x25, x26, [sp, #48] -+ ldp x27, x28, [sp, #64] -+ ldp x29, x30, [sp, #80] -+ add sp, sp, #96 -+ ret -+ -+ -+//*********************************************************************** -+// 503-bit multiprecision addition -+// Operation: c [x2] = a [x0] + b [x1] -+//*********************************************************************** -+.global mp_add503_asm -+mp_add503_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ -+ adds x3, x3, x11 -+ adcs x4, x4, x12 -+ adcs x5, x5, x13 -+ adcs x6, x6, x14 -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adc x10, x10, x18 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ ret -+ -+ -+//*********************************************************************** -+// 2x503-bit multiprecision addition -+// Operation: c [x2] = a [x0] + b 
[x1] -+//*********************************************************************** -+.global mp_add503x2_asm -+mp_add503x2_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ adds x3, x3, x11 -+ adcs x4, x4, x12 -+ adcs x5, x5, x13 -+ adcs x6, x6, x14 -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adcs x10, x10, x18 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ -+ ldp x3, x4, [x0,#64] -+ ldp x5, x6, [x0,#80] -+ ldp x11, x12, [x1,#64] -+ ldp x13, x14, [x1,#80] -+ adcs x3, x3, x11 -+ adcs x4, x4, x12 -+ adcs x5, x5, x13 -+ adcs x6, x6, x14 -+ ldp x7, x8, [x0,#96] -+ ldp x9, x10, [x0,#112] -+ ldp x15, x16, [x1,#96] -+ ldp x17, x18, [x1,#112] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adc x10, x10, x18 -+ -+ stp x3, x4, [x2,#64] -+ stp x5, x6, [x2,#80] -+ stp x7, x8, [x2,#96] -+ stp x9, x10, [x2,#112] -+ ret -+ -+ -+//*********************************************************************** -+// 2x503-bit multiprecision subtraction -+// Operation: c [x2] = a [x0] - b [x1]. 
Returns borrow mask -+//*********************************************************************** -+.global mp_sub503x2_asm -+mp_sub503x2_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x11, x12, [x1,#0] -+ ldp x13, x14, [x1,#16] -+ subs x3, x3, x11 -+ sbcs x4, x4, x12 -+ sbcs x5, x5, x13 -+ sbcs x6, x6, x14 -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ sbcs x7, x7, x15 -+ sbcs x8, x8, x16 -+ sbcs x9, x9, x17 -+ sbcs x10, x10, x18 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ -+ ldp x3, x4, [x0,#64] -+ ldp x5, x6, [x0,#80] -+ ldp x11, x12, [x1,#64] -+ ldp x13, x14, [x1,#80] -+ sbcs x3, x3, x11 -+ sbcs x4, x4, x12 -+ sbcs x5, x5, x13 -+ sbcs x6, x6, x14 -+ ldp x7, x8, [x0,#96] -+ ldp x9, x10, [x0,#112] -+ ldp x15, x16, [x1,#96] -+ ldp x17, x18, [x1,#112] -+ sbcs x7, x7, x15 -+ sbcs x8, x8, x16 -+ sbcs x9, x9, x17 -+ sbcs x10, x10, x18 -+ sbc x0, xzr, xzr -+ -+ stp x3, x4, [x2,#64] -+ stp x5, x6, [x2,#80] -+ stp x7, x8, [x2,#96] -+ stp x9, x10, [x2,#112] -+ ret -+ -+ -+//*********************************************************************** -+// Double 2x503-bit multiprecision subtraction -+// Operation: c [x2] = c [x2] - a [x0] - b [x1] -+//*********************************************************************** -+.global mp_dblsub503x2_asm -+mp_dblsub503x2_asm: -+ sub sp, sp, #32 -+ stp x27, x28, [sp, #0] -+ stp x29, x30, [sp, #16] -+ ldp x3, x4, [x2,#0] -+ ldp x5, x6, [x2,#16] -+ ldp x7, x8, [x2,#32] -+ ldp x9, x10, [x2,#48] -+ ldp x11, x12, [x2,#64] -+ ldp x13, x14, [x2,#80] -+ ldp x15, x16, [x2,#96] -+ ldp x17, x18, [x2,#112] -+ -+ ldp x27, x28, [x0,#0] -+ ldp x29, x30, [x0,#16] -+ subs x3, x3, x27 -+ sbcs x4, x4, x28 -+ sbcs x5, x5, x29 -+ sbcs x6, x6, x30 -+ ldp x27, x28, [x0,#32] -+ ldp x29, x30, [x0,#48] -+ sbcs x7, x7, x27 -+ sbcs x8, x8, x28 -+ sbcs x9, x9, x29 -+ sbcs x10, x10, x30 -+ ldp x27, x28, [x0,#64] -+ ldp x29, x30, [x0,#80] 
-+ sbcs x11, x11, x27 -+ sbcs x12, x12, x28 -+ sbcs x13, x13, x29 -+ sbcs x14, x14, x30 -+ ldp x27, x28, [x0,#96] -+ ldp x29, x30, [x0,#112] -+ sbcs x15, x15, x27 -+ sbcs x16, x16, x28 -+ sbcs x17, x17, x29 -+ sbc x18, x18, x30 -+ -+ ldp x27, x28, [x1,#0] -+ ldp x29, x30, [x1,#16] -+ subs x3, x3, x27 -+ sbcs x4, x4, x28 -+ sbcs x5, x5, x29 -+ sbcs x6, x6, x30 -+ ldp x27, x28, [x1,#32] -+ ldp x29, x30, [x1,#48] -+ sbcs x7, x7, x27 -+ sbcs x8, x8, x28 -+ sbcs x9, x9, x29 -+ sbcs x10, x10, x30 -+ ldp x27, x28, [x1,#64] -+ ldp x29, x30, [x1,#80] -+ sbcs x11, x11, x27 -+ sbcs x12, x12, x28 -+ sbcs x13, x13, x29 -+ sbcs x14, x14, x30 -+ ldp x27, x28, [x1,#96] -+ ldp x29, x30, [x1,#112] -+ sbcs x15, x15, x27 -+ sbcs x16, x16, x28 -+ sbcs x17, x17, x29 -+ sbc x18, x18, x30 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ stp x15, x16, [x2,#96] -+ stp x17, x18, [x2,#112] -+ -+ ldp x27, x28, [sp, #0] -+ ldp x29, x30, [sp, #16] -+ add sp, sp, #32 -+ ret -diff --git a/third_party/sidh/src/P503/P503.c b/third_party/sidh/src/P503/P503.c -new file mode 100644 -index 00000000..dcd7a84c ---- /dev/null -+++ b/third_party/sidh/src/P503/P503.c -@@ -0,0 +1,126 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: supersingular isogeny parameters and generation of functions for P503 -+*********************************************************************************************/ -+ -+#include "P503_api.h" -+#include "P503_internal.h" -+ -+ -+// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: -+// -------------------------------------------------------------------------------------------------- -+// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the 
leftmost position (i.e., little endian format). -+// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. -+// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. -+// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. -+// For example, a 503-bit field element is represented with Ceil(503 / 64) = 8 64-bit digits or Ceil(503 / 32) = 16 32-bit digits. -+ -+// -+// Curve isogeny system "SIDHp503". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p503^2), where A=0, B=1, C=1 and p503 = 2^250*3^159-1 -+// -+ -+const uint64_t p503[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xABFFFFFFFFFFFFFF, -+ 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E }; -+const uint64_t p503p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000, -+ 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E }; -+const uint64_t p503x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x57FFFFFFFFFFFFFF, -+ 0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C }; -+// Order of Alice's subgroup -+const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 }; -+// Order of Bob's subgroup -+const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 }; -+// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p503^2), expressed in Montgomery representation -+const uint64_t A_gen[5*NWORDS64_FIELD] = { 0xE7EF4AA786D855AF, 0xED5758F03EB34D3B, 0x09AE172535A86AA9, 0x237B9CC07D622723, -+ 0xE3A284CBA4E7932D, 0x27481D9176C5E63F, 
0x6A323FF55C6E71BF, 0x002ECC31A6FB8773, // XPA0 -+ 0x64D02E4E90A620B8, 0xDAB8128537D4B9F1, 0x4BADF77B8A228F98, 0x0F5DBDF9D1FB7D1B, -+ 0xBEC4DB288E1A0DCC, 0xE76A8665E80675DB, 0x6D6F252E12929463, 0x003188BD1463FACC, // XPA1 -+ 0xB79D41025DE85D56, 0x0B867DA9DF169686, 0x740E5368021C827D, 0x20615D72157BF25C, -+ 0xFF1590013C9B9F5B, 0xC884DCADE8C16CEA, 0xEBD05E53BF724E01, 0x0032FEF8FDA5748C, // XQA0 -+ 0x12E2E849AA0A8006, 0x41CF47008635A1E8, 0x9CD720A70798AED7, 0x42A820B42FCF04CF, -+ 0x7BF9BAD32AAE88B1, 0xF619127A54090BBE, 0x1CB10D8F56408EAA, 0x001D6B54C3C0EDEB, // XRA0 -+ 0x34DB54931CBAAC36, 0x420A18CB8DD5F0C4, 0x32008C1A48C0F44D, 0x3B3BA772B1CFD44D, -+ 0xA74B058FDAF13515, 0x095FC9CA7EEC17B4, 0x448E829D28F120F8, 0x00261EC3ED16A489 }; // XRA1 -+// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p503^2), expressed in Montgomery representation -+const uint64_t B_gen[5*NWORDS64_FIELD] = { 0x7EDE37F4FA0BC727, 0xF7F8EC5C8598941C, 0xD15519B516B5F5C8, 0xF6D5AC9B87A36282, -+ 0x7B19F105B30E952E, 0x13BD8B2025B4EBEE, 0x7B96D27F4EC579A2, 0x00140850CAB7E5DE, // XPB0 -+ 0x7764909DAE7B7B2D, 0x578ABB16284911AB, 0x76E2BFD146A6BF4D, 0x4824044B23AA02F0, -+ 0x1105048912A321F3, 0xB8A2E482CF0F10C1, 0x42FF7D0BE2152085, 0x0018E599C5223352, // XPB1 -+ 0x4256C520FB388820, 0x744FD7C3BAAF0A13, 0x4B6A2DDDB12CBCB8, 0xE46826E27F427DF8, -+ 0xFE4A663CD505A61B, 0xD6B3A1BAF025C695, 0x7C3BB62B8FCC00BD, 0x003AFDDE4A35746C, // XQB0 -+ 0x75601CD1E6C0DFCB, 0x1A9007239B58F93E, 0xC1F1BE80C62107AC, 0x7F513B898F29FF08, -+ 0xEA0BEDFF43E1F7B2, 0x2C6D94018CBAE6D0, 0x3A430D31BCD84672, 0x000D26892ECCFE83, // XRB0 -+ 0x1119D62AEA3007A1, 0xE3702AA4E04BAE1B, 0x9AB96F7D59F990E7, 0xF58440E8B43319C0, -+ 0xAF8134BEE1489775, 0xE7F7774E905192AA, 0xF54AE09308E98039, 0x001EF7A041A86112 }; // XRB1 -+// Montgomery constant Montgomery_R2 = (2^512)^2 mod p503 -+const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x5289A0CF641D011F, 0x9B88257189FED2B9, 0xA3B365D58DC8F17A, 0x5BC57AB6EFF168EC, -+ 0x9E51998BD84D4423, 
0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771 }; -+// Value one in Montgomery representation -+const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000003F9, 0x0000000000000000, 0x0000000000000000, 0xB400000000000000, -+ 0x63CB1A6EA6DED2B4, 0x51689D8D667EB37D, 0x8ACD77C71AB24142, 0x0026FBAEC60F5953 }; -+// Value (2^256)^2 mod 3^159 -+const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x0C2615CA3C5BAA99, 0x5A4FF3072AB6AA6A, 0xA6AFD4B039AD6AA2, 0x010DA06A26DD05CB }; -+// Value -(3^159)^-1 mod 2^256 -+const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x49C8A87190C0697D, 0x2EB7968EA0F0A558, 0x944257B696777FA2, 0xBAA4DDCD6139D2B3 }; -+// Value order_Bob/3 mod p503 -+const uint64_t Border_div3[NWORDS_ORDER] = { 0xEB5CFCD82C28A2B9, 0x4CFF3B5F9FDFCE96, 0xB07B3A7CDF4DBC02, 0x055DE9C5756D2D32 }; -+ -+ -+// Fixed parameters for isogeny tree computation -+const unsigned int strat_Alice[MAX_Alice-1] = { -+61, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, -+4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, -+1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 29, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, -+1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, -+1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1 }; -+ -+const unsigned int strat_Bob[MAX_Bob-1] = { -+71, 38, 21, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, -+1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 17, 9, -+5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, -+1, 4, 2, 1, 1, 2, 1, 1, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, -+2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2, -+1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 }; -+ -+// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions -+#define fpcopy fpcopy503 -+#define 
fpzero fpzero503 -+#define fpadd fpadd503 -+#define fpsub fpsub503 -+#define fpneg fpneg503 -+#define fpdiv2 fpdiv2_503 -+#define fpcorrection fpcorrection503 -+#define fpmul_mont fpmul503_mont -+#define fpsqr_mont fpsqr503_mont -+#define fpinv_mont fpinv503_mont -+#define fpinv_chain_mont fpinv503_chain_mont -+#define fpinv_mont_bingcd fpinv503_mont_bingcd -+#define fp2copy fp2copy503 -+#define fp2zero fp2zero503 -+#define fp2add fp2add503 -+#define fp2sub fp2sub503 -+#define fp2neg fp2neg503 -+#define fp2div2 fp2div2_503 -+#define fp2correction fp2correction503 -+#define fp2mul_mont fp2mul503_mont -+#define fp2sqr_mont fp2sqr503_mont -+#define fp2inv_mont fp2inv503_mont -+#define fp2inv_mont_bingcd fp2inv503_mont_bingcd -+#define fpequal_non_constant_time fpequal503_non_constant_time -+#define mp_add_asm mp_add503_asm -+#define mp_subx2_asm mp_sub503x2_asm -+#define mp_dblsubx2_asm mp_dblsub503x2_asm -+#define crypto_kem_keypair crypto_kem_keypair_SIKEp503 -+#define crypto_kem_enc crypto_kem_enc_SIKEp503 -+#define crypto_kem_dec crypto_kem_dec_SIKEp503 -+#define random_mod_order_A random_mod_order_A_SIDHp503 -+#define random_mod_order_B random_mod_order_B_SIDHp503 -+#define EphemeralKeyGeneration_A EphemeralKeyGeneration_A_SIDHp503 -+#define EphemeralKeyGeneration_B EphemeralKeyGeneration_B_SIDHp503 -+#define EphemeralSecretAgreement_A EphemeralSecretAgreement_A_SIDHp503 -+#define EphemeralSecretAgreement_B EphemeralSecretAgreement_B_SIDHp503 -+ -+#include "../fpx.c" -+#include "../ec_isogeny.c" -+#include "../sidh.c" -+#include "../sike.c" -diff --git a/third_party/sidh/src/P503/P503_api.h b/third_party/sidh/src/P503/P503_api.h -new file mode 100644 -index 00000000..b595cf40 ---- /dev/null -+++ b/third_party/sidh/src/P503/P503_api.h -@@ -0,0 +1,107 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: API header file for P503 
-+*********************************************************************************************/ -+ -+#ifndef __P503_API_H__ -+#define __P503_API_H__ -+ -+ -+/*********************** Key encapsulation mechanism API ***********************/ -+ -+#define CRYPTO_SECRETKEYBYTES 434 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes -+#define CRYPTO_PUBLICKEYBYTES 378 -+#define CRYPTO_BYTES 16 -+#define CRYPTO_CIPHERTEXTBYTES 402 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes -+ -+// Algorithm name -+#define CRYPTO_ALGNAME "SIKEp503" -+ -+// SIKE's key generation -+// It produces a private key sk and computes the public key pk. -+// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 434 bytes) -+// public key pk (CRYPTO_PUBLICKEYBYTES = 378 bytes) -+int crypto_kem_keypair_SIKEp503(unsigned char *pk, unsigned char *sk); -+ -+// SIKE's encapsulation -+// Input: public key pk (CRYPTO_PUBLICKEYBYTES = 378 bytes) -+// Outputs: shared secret ss (CRYPTO_BYTES = 16 bytes) -+// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 402 bytes) -+int crypto_kem_enc_SIKEp503(unsigned char *ct, unsigned char *ss, const unsigned char *pk); -+ -+// SIKE's decapsulation -+// Input: secret key sk (CRYPTO_SECRETKEYBYTES = 434 bytes) -+// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 402 bytes) -+// Outputs: shared secret ss (CRYPTO_BYTES = 16 bytes) -+int crypto_kem_dec_SIKEp503(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); -+ -+ -+// Encoding of keys for KEM-based isogeny system "SIKEp503" (wire format): -+// ---------------------------------------------------------------------- -+// Elements over GF(p503) are encoded in 63 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). -+// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are encoded as {a, b}, with a in the lowest memory portion. 
-+// -+// Private keys sk consist of the concatenation of a 24-byte random value, a value in the range [0, 2^252-1] and the public key pk. In the SIKE API, -+// private keys are encoded in 434 octets in little endian format. -+// Public keys pk consist of 3 elements in GF(p503^2). In the SIKE API, pk is encoded in 378 octets. -+// Ciphertexts ct consist of the concatenation of a public key value and a 24-byte value. In the SIKE API, ct is encoded in 378 + 24 = 402 octets. -+// Shared keys ss consist of a value of 16 octets. -+ -+ -+/*********************** Ephemeral key exchange API ***********************/ -+ -+#define SIDH_SECRETKEYBYTES 32 -+#define SIDH_PUBLICKEYBYTES 378 -+#define SIDH_BYTES 126 -+ -+// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. -+// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. -+// Extended version available at: http://eprint.iacr.org/2016/859 -+ -+// Generation of Alice's secret key -+// Outputs random value in [0, 2^250 - 1] to be used as Alice's private key -+void random_mod_order_A_SIDHp503(unsigned char* random_digits); -+ -+// Generation of Bob's secret key -+// Outputs random value in [0, 2^Floor(Log(2,3^159)) - 1] to be used as Bob's private key -+void random_mod_order_B_SIDHp503(unsigned char* random_digits); -+ -+// Alice's ephemeral public key generation -+// Input: a private key PrivateKeyA in the range [0, 2^250 - 1], stored in 32 bytes. -+// Output: the public key PublicKeyA consisting of 3 GF(p503^2) elements encoded in 378 bytes. -+int EphemeralKeyGeneration_A_SIDHp503(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); -+ -+// Bob's ephemeral key-pair generation -+// It produces a private key PrivateKeyB and computes the public key PublicKeyB. -+// The private key is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. 
-+// The public key consists of 3 GF(p503^2) elements encoded in 378 bytes. -+int EphemeralKeyGeneration_B_SIDHp503(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); -+ -+// Alice's ephemeral shared secret computation -+// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB -+// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^250 - 1], stored in 32 bytes. -+// Bob's PublicKeyB consists of 3 GF(p503^2) elements encoded in 378 bytes. -+// Output: a shared secret SharedSecretA that consists of one element in GF(p503^2) encoded in 126 bytes. -+int EphemeralSecretAgreement_A_SIDHp503(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); -+ -+// Bob's ephemeral shared secret computation -+// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA -+// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. -+// Alice's PublicKeyA consists of 3 GF(p503^2) elements encoded in 378 bytes. -+// Output: a shared secret SharedSecretB that consists of one element in GF(p503^2) encoded in 126 bytes. -+int EphemeralSecretAgreement_B_SIDHp503(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); -+ -+ -+// Encoding of keys for KEX-based isogeny system "SIDHp503" (wire format): -+// ---------------------------------------------------------------------- -+// Elements over GF(p503) are encoded in 63 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). -+// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are encoded as {a, b}, with a in the lowest memory portion. -+// -+// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^250-1] and [0, 2^252-1], resp. 
In the SIDH API, private keys are encoded -+// in 32 octets in little endian format. -+// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p503^2). In the SIDH API, they are encoded in 378 octets. -+// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p503^2). In the SIDH API, they are encoded in 126 octets. -+ -+ -+#endif -\ No newline at end of file -diff --git a/third_party/sidh/src/P503/P503_internal.h b/third_party/sidh/src/P503/P503_internal.h -new file mode 100644 -index 00000000..33dadaa1 ---- /dev/null -+++ b/third_party/sidh/src/P503/P503_internal.h -@@ -0,0 +1,246 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: internal header file for P503 -+*********************************************************************************************/ -+ -+#ifndef __P503_INTERNAL_H__ -+#define __P503_INTERNAL_H__ -+ -+#include "../config.h" -+ -+ -+#if (TARGET == TARGET_AMD64) -+ #define NWORDS_FIELD 8 // Number of words of a 503-bit field element -+ #define p503_ZERO_WORDS 3 // Number of "0" digits in the least significant part of p503 + 1 -+#elif (TARGET == TARGET_x86) -+ #define NWORDS_FIELD 16 -+ #define p503_ZERO_WORDS 7 -+#elif (TARGET == TARGET_ARM) -+ #define NWORDS_FIELD 16 -+ #define p503_ZERO_WORDS 7 -+#elif (TARGET == TARGET_ARM64) -+ #define NWORDS_FIELD 8 -+ #define p503_ZERO_WORDS 3 -+#endif -+ -+ -+// Basic constants -+ -+#define NBITS_FIELD 503 -+#define MAXBITS_FIELD 512 -+#define MAXWORDS_FIELD ((MAXBITS_FIELD+RADIX-1)/RADIX) // Max. number of words to represent field elements -+#define NWORDS64_FIELD ((NBITS_FIELD+63)/64) // Number of 64-bit words of a 503-bit field element -+#define NBITS_ORDER 256 -+#define NWORDS_ORDER ((NBITS_ORDER+RADIX-1)/RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. 
-+#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) // Number of 64-bit words of a 256-bit element -+#define MAXBITS_ORDER NBITS_ORDER -+#define MAXWORDS_ORDER ((MAXBITS_ORDER+RADIX-1)/RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB]. -+#define ALICE 0 -+#define BOB 1 -+#define OALICE_BITS 250 -+#define OBOB_BITS 253 -+#define OBOB_EXPON 159 -+#define MASK_ALICE 0x03 -+#define MASK_BOB 0x0F -+#define PRIME p503 -+#define PARAM_A 0 -+#define PARAM_C 1 -+// Fixed parameters for isogeny tree computation -+#define MAX_INT_POINTS_ALICE 7 -+#define MAX_INT_POINTS_BOB 8 -+#define MAX_Alice 125 -+#define MAX_Bob 159 -+#define MSG_BYTES 24 -+#define SECRETKEY_A_BYTES (OALICE_BITS + 7) / 8 -+#define SECRETKEY_B_BYTES (OBOB_BITS + 7) / 8 -+#define FP2_ENCODED_BYTES 2*((NBITS_FIELD + 7) / 8) -+ -+ -+// SIDH's basic element definitions and point representations -+ -+typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 503-bit field elements (512-bit max.) -+typedef digit_t dfelm_t[2*NWORDS_FIELD]; // Datatype for representing double-precision 2x503-bit field elements (512-bit max.) -+typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements GF(p503^2) -+ -+typedef struct { f2elm_t X; f2elm_t Z; } point_proj; // Point representation in projective XZ Montgomery coordinates. -+typedef point_proj point_proj_t[1]; -+ -+ -+ -+/**************** Function prototypes ****************/ -+/************* Multiprecision functions **************/ -+ -+// Copy wordsize digits, c = a, where lng(a) = nwords -+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords); -+ -+// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. 
Returns the carry bit -+unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -+ -+// 503-bit multiprecision addition, c = a+b -+void mp_add503(const digit_t* a, const digit_t* b, digit_t* c); -+void mp_add503_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit -+unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -+digit_t mp_sub503x2_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Double 2x503-bit multiprecision subtraction, c = c-a-b, where c > a and c > b -+void mp_dblsub503x2_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Multiprecision left shift -+void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords); -+ -+// Multiprecision right shift by one -+void mp_shiftr1(digit_t* x, const unsigned int nwords); -+ -+// Multiprecision left right shift by one -+void mp_shiftl1(digit_t* x, const unsigned int nwords); -+ -+// Digit multiplication, digit * digit -> 2-digit result -+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c); -+ -+// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -+ -+/************ Field arithmetic functions *************/ -+ -+// Copy of a field element, c = a -+void fpcopy503(const digit_t* a, digit_t* c); -+ -+// Zeroing a field element, a = 0 -+void fpzero503(digit_t* a); -+ -+// Non constant-time comparison of two field elements. 
If a = b return TRUE, otherwise, return FALSE -+bool fpequal503_non_constant_time(const digit_t* a, const digit_t* b); -+ -+// Modular addition, c = a+b mod p503 -+extern void fpadd503(const digit_t* a, const digit_t* b, digit_t* c); -+extern void fpadd503_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Modular subtraction, c = a-b mod p503 -+extern void fpsub503(const digit_t* a, const digit_t* b, digit_t* c); -+extern void fpsub503_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Modular negation, a = -a mod p503 -+extern void fpneg503(digit_t* a); -+ -+// Modular division by two, c = a/2 mod p503. -+void fpdiv2_503(const digit_t* a, digit_t* c); -+ -+// Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. -+void fpcorrection503(digit_t* a); -+ -+// 503-bit Montgomery reduction, c = a mod p -+void rdc_mont(const digit_t* a, digit_t* c); -+ -+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768 -+void fpmul503_mont(const digit_t* a, const digit_t* b, digit_t* c); -+void mul503_asm(const digit_t* a, const digit_t* b, digit_t* c); -+void rdc503_asm(const digit_t* ma, digit_t* mc); -+ -+// Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768 -+void fpsqr503_mont(const digit_t* ma, digit_t* mc); -+ -+// Conversion to Montgomery representation -+void to_mont(const digit_t* a, digit_t* mc); -+ -+// Conversion from Montgomery representation to standard representation -+void from_mont(const digit_t* ma, digit_t* c); -+ -+// Field inversion, a = a^-1 in GF(p503) -+void fpinv503_mont(digit_t* a); -+ -+// Field inversion, a = a^-1 in GF(p503) using the binary GCD -+void fpinv503_mont_bingcd(digit_t* a); -+ -+// Chain to compute (p503-3)/4 using Montgomery arithmetic -+void fpinv503_chain_mont(digit_t* a); -+ -+/************ GF(p^2) arithmetic functions *************/ -+ -+// Copy of a GF(p503^2) element, c = a -+void fp2copy503(const f2elm_t a, f2elm_t c); -+ 
-+// Zeroing a GF(p503^2) element, a = 0 -+void fp2zero503(f2elm_t a); -+ -+// GF(p503^2) negation, a = -a in GF(p503^2) -+void fp2neg503(f2elm_t a); -+ -+// GF(p503^2) addition, c = a+b in GF(p503^2) -+extern void fp2add503(const f2elm_t a, const f2elm_t b, f2elm_t c); -+ -+// GF(p503^2) subtraction, c = a-b in GF(p503^2) -+extern void fp2sub503(const f2elm_t a, const f2elm_t b, f2elm_t c); -+ -+// GF(p503^2) division by two, c = a/2 in GF(p503^2) -+void fp2div2_503(const f2elm_t a, f2elm_t c); -+ -+// Modular correction, a = a in GF(p503^2) -+void fp2correction503(f2elm_t a); -+ -+// GF(p503^2) squaring using Montgomery arithmetic, c = a^2 in GF(p503^2) -+void fp2sqr503_mont(const f2elm_t a, f2elm_t c); -+ -+// GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2) -+void fp2mul503_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); -+ -+// Conversion of a GF(p503^2) element to Montgomery representation -+void to_fp2mont(const f2elm_t a, f2elm_t mc); -+ -+// Conversion of a GF(p503^2) element from Montgomery representation to standard representation -+void from_fp2mont(const f2elm_t ma, f2elm_t c); -+ -+// GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) -+void fp2inv503_mont(f2elm_t a); -+ -+// GF(p503^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p503) inversion done using the binary GCD -+void fp2inv503_mont_bingcd(f2elm_t a); -+ -+// n-way Montgomery inversion -+void mont_n_way_inv(const f2elm_t* vec, const int n, f2elm_t* out); -+ -+/************ Elliptic curve and isogeny functions *************/ -+ -+// Computes the j-invariant of a Montgomery curve with projective constant. -+void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv); -+ -+// Simultaneous doubling and differential addition. -+void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24); -+ -+// Doubling of a Montgomery point in projective coordinates (X:Z). 
-+void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24); -+ -+// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. -+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e); -+ -+// Differential addition. -+void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); -+ -+// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. -+void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff); -+ -+// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. -+void eval_4_isog(point_proj_t P, f2elm_t* coeff); -+ -+// Tripling of a Montgomery point in projective coordinates (X:Z). -+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); -+ -+// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. -+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); -+ -+// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. -+void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff); -+ -+// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. -+void eval_3_isog(point_proj_t Q, const f2elm_t* coeff); -+ -+// 3-way simultaneous inversion -+void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); -+ -+// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. 
-+void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); -+ -+ -+#endif -diff --git a/third_party/sidh/src/P503/generic/fp_generic.c b/third_party/sidh/src/P503/generic/fp_generic.c -new file mode 100644 -index 00000000..d8dab8ac ---- /dev/null -+++ b/third_party/sidh/src/P503/generic/fp_generic.c -@@ -0,0 +1,224 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: portable modular arithmetic for P503 -+*********************************************************************************************/ -+ -+#include "../P503_internal.h" -+ -+ -+// Global constants -+extern const uint64_t p503[NWORDS_FIELD]; -+extern const uint64_t p503p1[NWORDS_FIELD]; -+extern const uint64_t p503x2[NWORDS_FIELD]; -+ -+ -+__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular addition, c = a+b mod p503. -+ // Inputs: a, b in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], b[i], carry, c[i]); -+ } -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(carry, c[i], ((digit_t*)p503x2)[i], carry, c[i]); -+ } -+ mask = 0 - (digit_t)carry; -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, c[i], ((digit_t*)p503x2)[i] & mask, carry, c[i]); -+ } -+} -+ -+ -+__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular subtraction, c = a-b mod p503. 
-+ // Inputs: a, b in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], b[i], borrow, c[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, c[i], ((digit_t*)p503x2)[i] & mask, borrow, c[i]); -+ } -+} -+ -+ -+__inline void fpneg503(digit_t* a) -+{ // Modular negation, a = -a mod p503. -+ // Input/output: a in [0, 2*p503-1] -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, ((digit_t*)p503x2)[i], a[i], borrow, a[i]); -+ } -+} -+ -+ -+void fpdiv2_503(const digit_t* a, digit_t* c) -+{ // Modular division by two, c = a/2 mod p503. -+ // Input : a in [0, 2*p503-1] -+ // Output: c in [0, 2*p503-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], ((digit_t*)p503)[i] & mask, carry, c[i]); -+ } -+ -+ mp_shiftr1(c, NWORDS_FIELD); -+} -+ -+ -+void fpcorrection503(digit_t* a) -+{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. 
-+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], ((digit_t*)p503)[i], borrow, a[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, a[i], ((digit_t*)p503)[i] & mask, borrow, a[i]); -+ } -+} -+ -+ -+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) -+{ // Digit multiplication, digit * digit -> 2-digit result -+ register digit_t al, ah, bl, bh, temp; -+ digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; -+ digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); -+ -+ al = a & mask_low; // Low part -+ ah = a >> (sizeof(digit_t) * 4); // High part -+ bl = b & mask_low; -+ bh = b >> (sizeof(digit_t) * 4); -+ -+ albl = al*bl; -+ albh = al*bh; -+ ahbl = ah*bl; -+ ahbh = ah*bh; -+ c[0] = albl & mask_low; // C00 -+ -+ res1 = albl >> (sizeof(digit_t) * 4); -+ res2 = ahbl & mask_low; -+ res3 = albh & mask_low; -+ temp = res1 + res2 + res3; -+ carry = temp >> (sizeof(digit_t) * 4); -+ c[0] ^= temp << (sizeof(digit_t) * 4); // C01 -+ -+ res1 = ahbl >> (sizeof(digit_t) * 4); -+ res2 = albh >> (sizeof(digit_t) * 4); -+ res3 = ahbh & mask_low; -+ temp = res1 + res2 + res3 + carry; -+ c[1] = temp & mask_low; // C10 -+ carry = temp & mask_high; -+ c[1] ^= (ahbh & mask_high) + carry; // C11 -+} -+ -+ -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
-+ unsigned int i, j; -+ digit_t t = 0, u = 0, v = 0, UV[2]; -+ unsigned int carry = 0; -+ -+ for (i = 0; i < nwords; i++) { -+ for (j = 0; j <= i; j++) { -+ MUL(a[j], b[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ c[i] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ -+ for (i = nwords; i < 2*nwords-1; i++) { -+ for (j = i-nwords+1; j < nwords; j++) { -+ MUL(a[j], b[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ c[i] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ c[2*nwords-1] = v; -+} -+ -+ -+void rdc_mont(const digit_t* ma, digit_t* mc) -+{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p503. -+ // mc = ma*R^-1 mod p503x2, where R = 2^512. -+ // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. -+ // ma is assumed to be in Montgomery representation. -+ unsigned int i, j, carry, count = p503_ZERO_WORDS; -+ digit_t UV[2], t = 0, u = 0, v = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ mc[i] = 0; -+ } -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ for (j = 0; j < i; j++) { -+ if (j < (i-p503_ZERO_WORDS+1)) { -+ MUL(mc[j], ((digit_t*)p503p1)[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ } -+ ADDC(0, v, ma[i], carry, v); -+ ADDC(carry, u, 0, carry, u); -+ t += carry; -+ mc[i] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ -+ for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { -+ if (count > 0) { -+ count -= 1; -+ } -+ for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { -+ if (j < (NWORDS_FIELD-count)) { -+ MUL(mc[j], ((digit_t*)p503p1)[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ } -+ ADDC(0, v, ma[i], carry, v); -+ ADDC(carry, u, 0, carry, u); -+ t += carry; -+ mc[i-NWORDS_FIELD] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, 
v); -+ mc[NWORDS_FIELD-1] = v; -+} -\ No newline at end of file -diff --git a/third_party/sidh/src/P751/AMD64/fp_x64.c b/third_party/sidh/src/P751/AMD64/fp_x64.c -new file mode 100644 -index 00000000..63ff177d ---- /dev/null -+++ b/third_party/sidh/src/P751/AMD64/fp_x64.c -@@ -0,0 +1,861 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: modular arithmetic optimized for x64 platforms for P751 -+*********************************************************************************************/ -+ -+#include "../P751_internal.h" -+ -+ -+// Global constants -+extern const uint64_t p751[NWORDS_FIELD]; -+extern const uint64_t p751p1[NWORDS_FIELD]; -+extern const uint64_t p751x2[NWORDS_FIELD]; -+ -+ -+__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular addition, c = a+b mod p751. -+ // Inputs: a, b in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ -+#if (OS_TARGET == OS_WIN) -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], b[i], carry, c[i]); -+ } -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(carry, c[i], ((digit_t*)p751x2)[i], carry, c[i]); -+ } -+ mask = 0 - (digit_t)carry; -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, c[i], ((digit_t*)p751x2)[i] & mask, carry, c[i]); -+ } -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ fpadd751_asm(a, b, c); -+ -+#endif -+} -+ -+ -+__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular subtraction, c = a-b mod p751. 
-+ // Inputs: a, b in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ -+#if (OS_TARGET == OS_WIN) -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], b[i], borrow, c[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, c[i], ((digit_t*)p751x2)[i] & mask, borrow, c[i]); -+ } -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ fpsub751_asm(a, b, c); -+ -+#endif -+} -+ -+ -+__inline void fpneg751(digit_t* a) -+{ // Modular negation, a = -a mod p751. -+ // Input/output: a in [0, 2*p751-1] -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, ((digit_t*)p751x2)[i], a[i], borrow, a[i]); -+ } -+} -+ -+ -+void fpdiv2_751(const digit_t* a, digit_t* c) -+{ // Modular division by two, c = a/2 mod p751. -+ // Input : a in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p751 -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], ((digit_t*)p751)[i] & mask, carry, c[i]); -+ } -+ -+ mp_shiftr1(c, NWORDS_FIELD); -+} -+ -+ -+void fpcorrection751(digit_t* a) -+{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], ((digit_t*)p751)[i], borrow, a[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, a[i], ((digit_t*)p751)[i] & mask, borrow, a[i]); -+ } -+} -+ -+ -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. 
-+ -+ UNREFERENCED_PARAMETER(nwords); -+ -+#if (OS_TARGET == OS_WIN) -+ digit_t t = 0; -+ uint128_t uv = {0}; -+ unsigned int carry = 0; -+ -+ MULADD128(a[0], b[0], uv, carry, uv); -+ t += carry; -+ c[0] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[0], uv, carry, uv); -+ t += carry; -+ c[1] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[0], uv, carry, uv); -+ t += carry; -+ c[2] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[0], uv, carry, uv); -+ t += carry; -+ c[3] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[0], uv, carry, uv); -+ t += carry; -+ c[4] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[0], uv, carry, uv); -+ t += carry; -+ c[5] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[5], uv, 
carry, uv); -+ t += carry; -+ MULADD128(a[6], b[0], uv, carry, uv); -+ t += carry; -+ c[6] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[0], uv, carry, uv); -+ t += carry; -+ c[7] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[0], uv, carry, uv); -+ t += carry; -+ c[8] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[0], uv, carry, uv); -+ t += carry; -+ c[9] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[2], uv, carry, uv); -+ 
t += carry; -+ MULADD128(a[7], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[0], uv, carry, uv); -+ t += carry; -+ c[10] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[0], b[11], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[1], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[1], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[11], b[0], uv, carry, uv); -+ t += carry; -+ c[11] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[1], b[11], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[2], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[11], b[1], uv, carry, uv); -+ t += carry; -+ c[12] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[2], uv, carry, uv); -+ t += 
carry; -+ MULADD128(a[10], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[2], b[11], uv, carry, uv); -+ t += carry; -+ c[13] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[3], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[3], b[11], uv, carry, uv); -+ t += carry; -+ c[14] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[4], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[5], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[4], b[11], uv, carry, uv); -+ t += carry; -+ c[15] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[5], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[10], uv, carry, uv); -+ t += 
carry; -+ MULADD128(a[5], b[11], uv, carry, uv); -+ t += carry; -+ c[16] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[6], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[6], b[11], uv, carry, uv); -+ t += carry; -+ c[17] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[7], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[7], b[11], uv, carry, uv); -+ t += carry; -+ c[18] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[8], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[8], b[11], uv, carry, uv); -+ t += carry; -+ c[19] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[9], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[9], b[11], uv, carry, uv); -+ t += carry; -+ c[20] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(a[11], b[10], uv, carry, uv); -+ t += carry; -+ MULADD128(a[10], b[11], uv, carry, uv); -+ t += carry; -+ c[21] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ -+ MULADD128(a[11], b[11], uv, carry, uv); -+ c[22] = uv[0]; -+ c[23] = uv[1]; -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ mul751_asm(a, b, c); -+ -+#endif -+} -+ -+ -+void rdc_mont(const digit_t* ma, digit_t* mc) -+{ // Montgomery reduction exploiting special form of the prime. -+ // mc = ma*R^-1 mod p751x2, where R = 2^768. -+ // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1].
-+ // ma is assumed to be in Montgomery representation. The portable (OS_WIN) path reads ma[0..23], writes mc[0..11] and uses only digits 5..11 of p751p1; the OS_LINUX path delegates to rdc751_asm(ma, mc). NOTE(review): there is no #else branch, so for any other OS_TARGET the body is empty and mc is left unwritten - confirm that case cannot occur. -+ -+#if (OS_TARGET == OS_WIN) -+ unsigned int carry; -+ digit_t t = 0; -+ uint128_t uv = {0}; -+ -+ mc[0] = ma[0]; -+ mc[1] = ma[1]; -+ mc[2] = ma[2]; -+ mc[3] = ma[3]; -+ mc[4] = ma[4]; -+ MUL128(mc[0], ((digit_t*)p751p1)[5], uv); -+ ADDC(0, uv[0], ma[5], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ mc[5] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p751p1)[6], uv, carry, uv); -+ MULADD128(mc[1], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[6], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[6] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[7], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[7] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[8], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[8] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[9], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+
t += carry; -+ mc[9] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[10], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[10] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[0], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[1], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[11], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[11] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[1], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[2], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[12], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[0] = uv[0]; -+
uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[2], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[3], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[8], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[13], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[1] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[3], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[4], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[8], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[9], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[14], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[2] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[4], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[5], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[8], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[9], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[10], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[15], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry,
uv[1]); -+ t += carry; -+ mc[3] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[5], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[6], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[8], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[9], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[10], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[11], ((digit_t*)p751p1)[5], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[16], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[4] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[6], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[7], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[8], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[9], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[10], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[11], ((digit_t*)p751p1)[6], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[17], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[5] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[7], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[8], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[9], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[10], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[11], ((digit_t*)p751p1)[7], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[18], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[6] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[8], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+
MULADD128(mc[9], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[10], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[11], ((digit_t*)p751p1)[8], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[19], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[7] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[9], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[10], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[11], ((digit_t*)p751p1)[9], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[20], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[8] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[10], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ MULADD128(mc[11], ((digit_t*)p751p1)[10], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[21], carry, uv[0]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ t += carry; -+ mc[9] = uv[0]; -+ uv[0] = uv[1]; -+ uv[1] = t; -+ t = 0; -+ -+ MULADD128(mc[11], ((digit_t*)p751p1)[11], uv, carry, uv); -+ t += carry; -+ ADDC(0, uv[0], ma[22], carry, mc[10]); -+ ADDC(carry, uv[1], 0, carry, uv[1]); -+ ADDC(0, uv[1], ma[23], carry, mc[11]); -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ rdc751_asm(ma, mc); -+ -+#endif -+} -diff --git a/third_party/sidh/src/P751/AMD64/fp_x64_asm.S b/third_party/sidh/src/P751/AMD64/fp_x64_asm.S -new file mode 100644 -index 00000000..b76c415d ---- /dev/null -+++ b/third_party/sidh/src/P751/AMD64/fp_x64_asm.S -@@ -0,0 +1,3009 @@ -+//******************************************************************************************* -+// SIDH: an efficient supersingular isogeny cryptography library -+// -+// Abstract: field arithmetic in x64 assembly for P751 on Linux -+//******************************************************************************************* -+ -+.intel_syntax noprefix -+ -+// Registers that are used for parameter
passing: -+#define reg_p1 rdi -+#define reg_p2 rsi -+#define reg_p3 rdx -+ -+// p751 + 1 (only 64-bit limbs 5..11 are defined here) -+#define p751p1_5 0xEEB0000000000000 -+#define p751p1_6 0xE3EC968549F878A8 -+#define p751p1_7 0xDA959B1A13F7CC76 -+#define p751p1_8 0x084E9867D6EBE876 -+#define p751p1_9 0x8562B5045CB25748 -+#define p751p1_10 0x0E12909F97BADC66 -+#define p751p1_11 0x00006FE5D541F71C -+// p751 x 2 (limbs 0, 1 and 5..11 are defined; the routines below reuse p751x2_1 for limbs 2..4) -+#define p751x2_0 0xFFFFFFFFFFFFFFFE -+#define p751x2_1 0xFFFFFFFFFFFFFFFF -+#define p751x2_5 0xDD5FFFFFFFFFFFFF -+#define p751x2_6 0xC7D92D0A93F0F151 -+#define p751x2_7 0xB52B363427EF98ED -+#define p751x2_8 0x109D30CFADD7D0ED -+#define p751x2_9 0x0AC56A08B964AE90 -+#define p751x2_10 0x1C25213F2F75B8CD -+#define p751x2_11 0x0000DFCBAA83EE38 -+ -+p751p1_nz: -+.quad 0xEEB0000000000000 -+.quad 0xE3EC968549F878A8 -+.quad 0xDA959B1A13F7CC76 -+.quad 0x084E9867D6EBE876 -+.quad 0x8562B5045CB25748 -+.quad 0x0E12909F97BADC66 -+.quad 0x00006FE5D541F71C -+ -+ -+.text -+//*********************************************************************** -+// Field addition over twelve 64-bit limbs -+// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]; p751x2 is then subtracted and, when that subtraction borrows, added back through an all-ones/zero mask (no branches). Saves and restores r12-r15; overwrites rax, rcx, rsi and r8-r11. -+//*********************************************************************** -+.global fpadd751_asm -+fpadd751_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov r12, [reg_p1+32] -+ mov r13, [reg_p1+40] -+ mov r14, [reg_p1+48] -+ mov r15, [reg_p1+56] -+ mov rcx, [reg_p1+64] -+ add r8, [reg_p2] -+ adc r9, [reg_p2+8] -+ adc r10, [reg_p2+16] -+ adc r11, [reg_p2+24] -+ adc r12, [reg_p2+32] -+ adc r13, [reg_p2+40] -+ adc r14, [reg_p2+48] -+ adc r15, [reg_p2+56] -+ adc rcx, [reg_p2+64] -+ mov rax, [reg_p1+72] -+ adc rax, [reg_p2+72] -+ mov [reg_p3+72], rax -+ mov rax, [reg_p1+80] -+ adc rax, [reg_p2+80] -+ mov [reg_p3+80], rax -+ mov rax, [reg_p1+88] -+ adc rax, [reg_p2+88] -+ mov [reg_p3+88], rax -+ -+ movq rax, p751x2_0 -+ sub r8, rax -+ movq rax, p751x2_1 -+ sbb r9, rax -+ sbb r10, rax -+ sbb r11, rax -+
sbb r12, rax -+ movq rax, p751x2_5 -+ sbb r13, rax -+ movq rax, p751x2_6 -+ sbb r14, rax -+ movq rax, p751x2_7 -+ sbb r15, rax -+ movq rax, p751x2_8 -+ sbb rcx, rax -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], r12 -+ mov [reg_p3+40], r13 -+ mov [reg_p3+48], r14 -+ mov [reg_p3+56], r15 -+ mov [reg_p3+64], rcx -+ mov r8, [reg_p3+72] -+ mov r9, [reg_p3+80] -+ mov r10, [reg_p3+88] -+ movq rax, p751x2_9 -+ sbb r8, rax -+ movq rax, p751x2_10 -+ sbb r9, rax -+ movq rax, p751x2_11 -+ sbb r10, rax -+ mov [reg_p3+72], r8 -+ mov [reg_p3+80], r9 -+ mov [reg_p3+88], r10 -+ movq rax, 0 -+ sbb rax, 0 -+ -+ mov rsi, p751x2_0 -+ and rsi, rax -+ mov r8, p751x2_1 -+ and r8, rax -+ movq r9, p751x2_5 -+ and r9, rax -+ movq r10, p751x2_6 -+ and r10, rax -+ movq r11, p751x2_7 -+ and r11, rax -+ movq r12, p751x2_8 -+ and r12, rax -+ movq r13, p751x2_9 -+ and r13, rax -+ movq r14, p751x2_10 -+ and r14, rax -+ movq r15, p751x2_11 -+ and r15, rax -+ -+ add rsi, [reg_p3] -+ mov [reg_p3], rsi -+ mov rax, [reg_p3+8] -+ adc rax, r8 -+ mov [reg_p3+8], rax -+ mov rax, [reg_p3+16] -+ adc rax, r8 -+ mov [reg_p3+16], rax -+ mov rax, [reg_p3+24] -+ adc rax, r8 -+ mov [reg_p3+24], rax -+ mov rax, [reg_p3+32] -+ adc rax, r8 -+ mov [reg_p3+32], rax -+ adc r9, [reg_p3+40] -+ adc r10, [reg_p3+48] -+ adc r11, [reg_p3+56] -+ adc r12, [reg_p3+64] -+ adc r13, [reg_p3+72] -+ adc r14, [reg_p3+80] -+ adc r15, [reg_p3+88] -+ mov [reg_p3+40], r9 -+ mov [reg_p3+48], r10 -+ mov [reg_p3+56], r11 -+ mov [reg_p3+64], r12 -+ mov [reg_p3+72], r13 -+ mov [reg_p3+80], r14 -+ mov [reg_p3+88], r15 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+ -+//*********************************************************************** -+// Field subtraction over twelve 64-bit limbs -+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]; when the subtraction borrows, p751x2 is added back through an all-ones/zero mask (no branches). Saves and restores r12-r15; overwrites rax, rcx, rsi and r8-r11. -+//*********************************************************************** -+.global fpsub751_asm -+fpsub751_asm: -+ push r12 -+ push r13 -+ push r14 -+ push
r15 -+ -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov r12, [reg_p1+32] -+ mov r13, [reg_p1+40] -+ mov r14, [reg_p1+48] -+ mov r15, [reg_p1+56] -+ mov rcx, [reg_p1+64] -+ sub r8, [reg_p2] -+ sbb r9, [reg_p2+8] -+ sbb r10, [reg_p2+16] -+ sbb r11, [reg_p2+24] -+ sbb r12, [reg_p2+32] -+ sbb r13, [reg_p2+40] -+ sbb r14, [reg_p2+48] -+ sbb r15, [reg_p2+56] -+ sbb rcx, [reg_p2+64] -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], r12 -+ mov [reg_p3+40], r13 -+ mov [reg_p3+48], r14 -+ mov [reg_p3+56], r15 -+ mov [reg_p3+64], rcx -+ mov rax, [reg_p1+72] -+ sbb rax, [reg_p2+72] -+ mov [reg_p3+72], rax -+ mov rax, [reg_p1+80] -+ sbb rax, [reg_p2+80] -+ mov [reg_p3+80], rax -+ mov rax, [reg_p1+88] -+ sbb rax, [reg_p2+88] -+ mov [reg_p3+88], rax -+ movq rax, 0 -+ sbb rax, 0 -+ -+ mov rsi, p751x2_0 -+ and rsi, rax -+ mov r8, p751x2_1 -+ and r8, rax -+ movq r9, p751x2_5 -+ and r9, rax -+ movq r10, p751x2_6 -+ and r10, rax -+ movq r11, p751x2_7 -+ and r11, rax -+ movq r12, p751x2_8 -+ and r12, rax -+ movq r13, p751x2_9 -+ and r13, rax -+ movq r14, p751x2_10 -+ and r14, rax -+ movq r15, p751x2_11 -+ and r15, rax -+ -+ mov rax, [reg_p3] -+ add rax, rsi -+ mov [reg_p3], rax -+ mov rax, [reg_p3+8] -+ adc rax, r8 -+ mov [reg_p3+8], rax -+ mov rax, [reg_p3+16] -+ adc rax, r8 -+ mov [reg_p3+16], rax -+ mov rax, [reg_p3+24] -+ adc rax, r8 -+ mov [reg_p3+24], rax -+ mov rax, [reg_p3+32] -+ adc rax, r8 -+ mov [reg_p3+32], rax -+ adc r9, [reg_p3+40] -+ adc r10, [reg_p3+48] -+ adc r11, [reg_p3+56] -+ adc r12, [reg_p3+64] -+ adc r13, [reg_p3+72] -+ adc r14, [reg_p3+80] -+ adc r15, [reg_p3+88] -+ mov [reg_p3+40], r9 -+ mov [reg_p3+48], r10 -+ mov [reg_p3+56], r11 -+ mov [reg_p3+64], r12 -+ mov [reg_p3+72], r13 -+ mov [reg_p3+80], r14 -+ mov [reg_p3+88], r15 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+ -+#ifdef _MULX_ -+
-+/////////////////////////////////////////////////////////////////////////// MACRO -+// Schoolbook integer multiplication of two 6-limb (384-bit) operands into a 12-limb result -+// Inputs: memory pointers M0 and M1 (limbs at offsets 0..40) -+// Outputs: memory pointer C (limbs at offsets 0..88) -+// Temps: stack space for two 64-bit values (case w/o _ADX_), regs T0:T7; S is that scratch space and is not referenced by the _ADX_ variant; rax and rdx are overwritten -+/////////////////////////////////////////////////////////////////////////// -+#ifdef _ADX_ -+ -+.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 -+ mov rdx, \M0 -+ mulx \T0, \T1, \M1 -+ mulx \T2, \T3, 8\M1 -+ mov \C, \T1 // C0_final -+ xor rax, rax -+ mulx \T4, \T5, 16\M1 -+ adox \T0, \T3 -+ adox \T2, \T5 -+ mulx \T1, \T3, 24\M1 -+ adox \T4, \T3 -+ mulx \T5, \T6, 32\M1 -+ adox \T1, \T6 -+ mulx \T3, \T7, 40\M1 -+ adox \T5, \T7 -+ adox \T3, rax -+ -+ mov rdx, 8\M0 -+ mulx \T6, \T7, \M1 -+ xor rax, rax -+ adcx \T0, \T7 -+ mov 8\C, \T0 // C1_final -+ adcx \T2, \T6 -+ mulx \T6, \T7, 8\M1 -+ adox \T2, \T7 -+ adcx \T4, \T6 -+ mulx \T0, \T6, 16\M1 -+ adox \T4, \T6 -+ adcx \T0, \T1 -+ mulx \T1, \T7, 24\M1 -+ adcx \T1, \T5 -+ mulx \T5, \T6, 32\M1 -+ adcx \T3, \T5 -+ mulx \T5, rdx, 40\M1 -+ adcx \T5, rax -+ -+ adox \T0, \T7 -+ adox \T1, \T6 -+ adox \T3, rdx -+ adox \T5, rax -+ -+ mov rdx, 16\M0 -+ mulx \T6, \T7, \M1 -+ xor rax, rax -+ adcx \T2, \T7 -+ mov 16\C, \T2 // C2_final -+ adcx \T4, \T6 -+ mulx \T6, \T7, 8\M1 -+ adox \T4, \T7 -+ adcx \T0, \T6 -+ mulx \T2, \T6, 16\M1 -+ adox \T0, \T6 -+ adcx \T1, \T2 -+ mulx \T2, \T7, 24\M1 -+ adcx \T3, \T2 -+ mulx \T2, \T6, 32\M1 -+ adcx \T5, \T2 -+ mulx \T2, rdx, 40\M1 -+ adcx \T2, rax -+ -+ adox \T1, \T7 -+ adox \T3, \T6 -+ adox \T5, rdx -+ adox \T2, rax -+ -+ mov rdx, 24\M0 -+ mulx \T6, \T7, \M1 -+ xor rax, rax -+ adcx \T4, \T7 -+ mov 24\C, \T4 // C3_final -+ adcx \T0, \T6 -+ mulx \T6, \T7, 8\M1 -+ adox \T0, \T7 -+ adcx \T1, \T6 -+ mulx \T4, \T6, 16\M1 -+ adox \T1, \T6 -+ adcx \T3, \T4 -+ mulx \T4, \T7, 24\M1 -+ adcx \T5, \T4 -+ mulx \T4, \T6, 32\M1 -+ adcx \T2, \T4 -+ mulx \T4, rdx, 40\M1 -+ adcx \T4, rax -+ -+ adox \T3, \T7 -+ adox \T5, \T6 -+ adox
\T2, rdx -+ adox \T4, rax -+ -+ mov rdx, 32\M0 -+ mulx \T6, \T7, \M1 -+ xor rax, rax -+ adcx \T0, \T7 -+ mov 32\C, \T0 // C4_final -+ adcx \T1, \T6 -+ mulx \T6, \T7, 8\M1 -+ adox \T1, \T7 -+ adcx \T3, \T6 -+ mulx \T0, \T6, 16\M1 -+ adox \T3, \T6 -+ adcx \T5, \T0 -+ mulx \T0, \T7, 24\M1 -+ adcx \T2, \T0 -+ mulx \T0, \T6, 32\M1 -+ adcx \T4, \T0 -+ mulx \T0, rdx, 40\M1 -+ adcx \T0, rax -+ -+ adox \T5, \T7 -+ adox \T2, \T6 -+ adox \T4, rdx -+ adox \T0, rax -+ -+ mov rdx, 40\M0 -+ mulx \T6, \T7, \M1 -+ xor rax, rax -+ adcx \T1, \T7 -+ mov 40\C, \T1 // C5_final -+ adcx \T3, \T6 -+ mulx \T6, \T7, 8\M1 -+ adox \T3, \T7 -+ adcx \T5, \T6 -+ mulx \T1, \T6, 16\M1 -+ adox \T5, \T6 -+ adcx \T2, \T1 -+ mulx \T1, \T7, 24\M1 -+ adcx \T4, \T1 -+ mulx \T1, \T6, 32\M1 -+ adcx \T0, \T1 -+ mulx \T1, rdx, 40\M1 -+ adcx \T1, rax -+ -+ adox \T2, \T7 -+ adox \T4, \T6 -+ adox \T0, rdx -+ adox \T1, rax -+ mov 48\C, \T3 -+ mov 56\C, \T5 -+ mov 64\C, \T2 -+ mov 72\C, \T4 -+ mov 80\C, \T0 -+ mov 88\C, \T1 -+.endm -+ -+#else -+ -+.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 -+ mov rdx, \M0 -+ mulx \T0, \T1, \M1 -+ mulx \T2, \T3, 8\M1 -+ mov \C, \T1 // C0_final -+ xor rax, rax -+ mulx \T4, \T5, 16\M1 -+ add \T0, \T3 -+ adc \T2, \T5 -+ mulx \T1, \T3, 24\M1 -+ adc \T4, \T3 -+ mulx \T5, \T6, 32\M1 -+ adc \T1, \T6 -+ mulx \T3, \T7, 40\M1 -+ adc \T5, \T7 -+ adc \T3, rax -+ -+ mov rdx, 8\M0 -+ mulx \T6, \T7, \M1 -+ add \T0, \T7 -+ mov 8\C, \T0 // C1_final -+ adc \T2, \T6 -+ mulx \T6, \T7, 8\M1 -+ mov \S, \T7 // store T7 -+ adc \T4, \T6 -+ mulx \T0, \T6, 16\M1 -+ mov 8\S, \T6 // store T6 -+ adc \T0, \T1 -+ mulx \T1, \T7, 24\M1 -+ adc \T1, \T5 -+ mulx \T5, \T6, 32\M1 -+ adc \T3, \T5 -+ mulx \T5, rdx, 40\M1 -+ adc \T5, rax -+ -+ xor rax, rax -+ add \T2, \S -+ adc \T4, 8\S -+ adc \T0, \T7 -+ adc \T1, \T6 -+ adc \T3, rdx -+ adc \T5, rax -+ -+ mov rdx, 16\M0 -+ mulx \T6, \T7, \M1 -+ add \T2, \T7 -+ mov 16\C, \T2 // C2_final -+ adc \T4, \T6 -+ mulx \T6, \T7, 8\M1 -+ mov \S, \T7 // store T7
-+ adc \T0, \T6 -+ mulx \T2, \T6, 16\M1 -+ mov 8\S, \T6 // store T6 -+ adc \T1, \T2 -+ mulx \T2, \T7, 24\M1 -+ adc \T3, \T2 -+ mulx \T2, \T6, 32\M1 -+ adc \T5, \T2 -+ mulx \T2, rdx, 40\M1 -+ adc \T2, rax -+ -+ xor rax, rax -+ add \T4, \S -+ adc \T0, 8\S -+ adc \T1, \T7 -+ adc \T3, \T6 -+ adc \T5, rdx -+ adc \T2, rax -+ -+ mov rdx, 24\M0 -+ mulx \T6, \T7, \M1 -+ add \T4, \T7 -+ mov 24\C, \T4 // C3_final -+ adc \T0, \T6 -+ mulx \T6, \T7, 8\M1 -+ mov \S, \T7 // store T7 -+ adc \T1, \T6 -+ mulx \T4, \T6, 16\M1 -+ mov 8\S, \T6 // store T6 -+ adc \T3, \T4 -+ mulx \T4, \T7, 24\M1 -+ adc \T5, \T4 -+ mulx \T4, \T6, 32\M1 -+ adc \T2, \T4 -+ mulx \T4, rdx, 40\M1 -+ adc \T4, rax -+ -+ xor rax, rax -+ add \T0, \S -+ adc \T1, 8\S -+ adc \T3, \T7 -+ adc \T5, \T6 -+ adc \T2, rdx -+ adc \T4, rax -+ -+ mov rdx, 32\M0 -+ mulx \T6, \T7, \M1 -+ add \T0, \T7 -+ mov 32\C, \T0 // C4_final -+ adc \T1, \T6 -+ mulx \T6, \T7, 8\M1 -+ mov \S, \T7 // store T7 -+ adc \T3, \T6 -+ mulx \T0, \T6, 16\M1 -+ mov 8\S, \T6 // store T6 -+ adc \T5, \T0 -+ mulx \T0, \T7, 24\M1 -+ adc \T2, \T0 -+ mulx \T0, \T6, 32\M1 -+ adc \T4, \T0 -+ mulx \T0, rdx, 40\M1 -+ adc \T0, rax -+ -+ xor rax, rax -+ add \T1, \S -+ adc \T3, 8\S -+ adc \T5, \T7 -+ adc \T2, \T6 -+ adc \T4, rdx -+ adc \T0, rax -+ -+ mov rdx, 40\M0 -+ mulx \T6, \T7, \M1 -+ add \T1, \T7 -+ mov 40\C, \T1 // C5_final -+ adc \T3, \T6 -+ mulx \T6, \T7, 8\M1 -+ mov \S, \T7 // store T7 -+ adc \T5, \T6 -+ mulx \T1, \T6, 16\M1 -+ mov 8\S, \T6 // store T6 -+ adc \T2, \T1 -+ mulx \T1, \T7, 24\M1 -+ adc \T4, \T1 -+ mulx \T1, \T6, 32\M1 -+ adc \T0, \T1 -+ mulx \T1, rdx, 40\M1 -+ adc \T1, rax -+ -+ add \T3, \S -+ adc \T5, 8\S -+ adc \T2, \T7 -+ adc \T4, \T6 -+ adc \T0, rdx -+ adc \T1, 0 -+ mov 48\C, \T3 -+ mov 56\C, \T5 -+ mov 64\C, \T2 -+ mov 72\C, \T4 -+ mov 80\C, \T0 -+ mov 88\C, \T1 -+.endm -+ -+#endif -+ -+ -+//***************************************************************************** -+// 751-bit multiplication using Karatsuba (one level), schoolbook (two
levels) -+//***************************************************************************** -+.global mul751_asm -+mul751_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ mov rcx, reg_p3 -+ -+ // [rsp] <- AH + AL, rax <- mask -+ xor rax, rax -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov r12, [reg_p1+32] -+ mov r13, [reg_p1+40] -+ push rbx -+ push rbp -+ sub rsp, 152 -+ add r8, [reg_p1+48] -+ adc r9, [reg_p1+56] -+ adc r10, [reg_p1+64] -+ adc r11, [reg_p1+72] -+ adc r12, [reg_p1+80] -+ adc r13, [reg_p1+88] -+ sbb rax, 0 -+ mov [rsp], r8 -+ mov [rsp+8], r9 -+ mov [rsp+16], r10 -+ mov [rsp+24], r11 -+ mov [rsp+32], r12 -+ mov [rsp+40], r13 -+ -+ // [rsp+48] <- BH + BL, rdx <- mask -+ xor rdx, rdx -+ mov r8, [reg_p2] -+ mov r9, [reg_p2+8] -+ mov rbx, [reg_p2+16] -+ mov rbp, [reg_p2+24] -+ mov r14, [reg_p2+32] -+ mov r15, [reg_p2+40] -+ add r8, [reg_p2+48] -+ adc r9, [reg_p2+56] -+ adc rbx, [reg_p2+64] -+ adc rbp, [reg_p2+72] -+ adc r14, [reg_p2+80] -+ adc r15, [reg_p2+88] -+ sbb rdx, 0 -+ mov [rsp+48], r8 -+ mov [rsp+56], r9 -+ mov [rsp+64], rbx -+ mov [rsp+72], rbp -+ mov [rsp+80], r14 -+ mov [rsp+88], r15 -+ -+ // [rcx] <- masked (BH + BL), using the carry mask of AH + AL; limbs 4 and 5 stay in r14, r15 -+ and r8, rax -+ and r9, rax -+ and rbx, rax -+ and rbp, rax -+ and r14, rax -+ and r15, rax -+ mov [rcx], r8 -+ mov [rcx+8], r9 -+ mov [rcx+16], rbx ///// -+ mov [rcx+24], rbp ///// -+ -+ // r8-r13 <- masked (AH + AL), using the carry mask of BH + BL -+ mov r8, [rsp] -+ mov r9, [rsp+8] -+ and r8, rdx -+ and r9, rdx -+ and r10, rdx -+ and r11, rdx -+ and r12, rdx -+ and r13, rdx -+ -+ // [rsp+96] <- masked (AH + AL) + masked (BH + BL); only the low four limbs are stored, the top two stay in r12, r13 -+ mov rax, [rcx] -+ mov rdx, [rcx+8] -+ add r8, rax -+ adc r9, rdx -+ adc r10, rbx -+ adc r11, rbp -+ adc r12, r14 -+ adc r13, r15 -+ mov [rsp+96], r8 -+ mov [rsp+104], r9 -+ mov [rsp+112], r10 -+ mov [rsp+120], r11 -+ -+ // [rcx] <- AL x BL -+ MUL384_SCHOOL [reg_p1], [reg_p2], [rcx], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 // Result C0-C5 -+ -+ // [rcx+96] <- (AH+AL)
x (BH+BL), low part -+ MUL384_SCHOOL [rsp], [rsp+48], [rcx+96], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 -+ -+ // [rsp] <- AH x BH -+ MUL384_SCHOOL [reg_p1+48], [reg_p2+48], [rsp], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 -+ -+ // r8-r13 <- (AH+AL) x (BH+BL), final step -+ mov r8, [rsp+96] -+ mov r9, [rsp+104] -+ mov r10, [rsp+112] -+ mov r11, [rsp+120] -+ mov rax, [rcx+144] -+ add r8, rax -+ mov rax, [rcx+152] -+ adc r9, rax -+ mov rax, [rcx+160] -+ adc r10, rax -+ mov rax, [rcx+168] -+ adc r11, rax -+ mov rax, [rcx+176] -+ adc r12, rax -+ mov rax, [rcx+184] -+ adc r13, rax -+ -+ // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL -+ mov rdi, [rcx+96] -+ sub rdi, [rcx] -+ mov rdx, [rcx+104] -+ sbb rdx, [rcx+8] -+ mov rbx, [rcx+112] -+ sbb rbx, [rcx+16] -+ mov rbp, [rcx+120] -+ sbb rbp, [rcx+24] -+ mov r14, [rcx+128] -+ sbb r14, [rcx+32] -+ mov r15, [rcx+136] -+ sbb r15, [rcx+40] -+ sbb r8, [rcx+48] -+ sbb r9, [rcx+56] -+ sbb r10, [rcx+64] -+ sbb r11, [rcx+72] -+ sbb r12, [rcx+80] -+ sbb r13, [rcx+88] -+ -+ // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH -+ sub rdi, [rsp] -+ sbb rdx, [rsp+8] -+ sbb rbx, [rsp+16] -+ sbb rbp, [rsp+24] -+ sbb r14, [rsp+32] -+ sbb r15, [rsp+40] -+ sbb r8, [rsp+48] -+ sbb r9, [rsp+56] -+ sbb r10, [rsp+64] -+ sbb r11, [rsp+72] -+ sbb r12, [rsp+80] -+ sbb r13, [rsp+88] -+ -+ mov rax, [rcx+48] -+ add rax, rdi -+ mov [rcx+48], rax // Result C6-C11 -+ mov rax, [rcx+56] -+ adc rax, rdx -+ mov [rcx+56], rax -+ mov rax, [rcx+64] -+ adc rax, rbx -+ mov [rcx+64], rax -+ mov rax, [rcx+72] -+ adc rax, rbp -+ mov [rcx+72], rax -+ mov rax, [rcx+80] -+ adc rax, r14 -+ mov [rcx+80], rax -+ mov rax, [rcx+88] -+ adc rax, r15 -+ mov [rcx+88], rax -+ mov rax, [rsp] -+ adc r8, rax -+ mov [rcx+96], r8 // Result C12-C17 -+ mov rax, [rsp+8] -+ adc r9, rax -+ mov [rcx+104], r9 -+ mov rax, [rsp+16] -+ adc r10, rax -+ mov [rcx+112], r10 -+ mov rax, [rsp+24] -+ adc r11, rax -+ mov [rcx+120], r11 -+ mov rax, [rsp+32]
-+ adc r12, rax -+ mov [rcx+128], r12 -+ mov rax, [rsp+40] -+ adc r13, rax -+ mov [rcx+136], r13 -+ mov r8, [rsp+48] -+ mov r9, [rsp+56] -+ mov r10, [rsp+64] -+ mov r11, [rsp+72] -+ mov r12, [rsp+80] -+ mov r13, [rsp+88] -+ adc r8, 0 -+ adc r9, 0 -+ adc r10, 0 -+ adc r11, 0 -+ adc r12, 0 -+ adc r13, 0 -+ add rsp, 152 -+ mov [rcx+144], r8 -+ mov [rcx+152], r9 -+ mov [rcx+160], r10 -+ mov [rcx+168], r11 -+ mov [rcx+176], r12 -+ mov [rcx+184], r13 -+ -+ pop rbp -+ pop rbx -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+#else -+ -+//*********************************************************************** -+// Integer multiplication -+// Based on Karatsuba method -+// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] -+// NOTE: a=c or b=c are not allowed -+//*********************************************************************** -+.global mul751_asm -+mul751_asm: -+ push r12 -+ push r13 -+ push r14 -+ mov rcx, reg_p3 -+ -+ // rcx[0-5] <- AH+AL -+ xor rax, rax -+ mov r8, [reg_p1+48] -+ mov r9, [reg_p1+56] -+ mov r10, [reg_p1+64] -+ mov r11, [reg_p1+72] -+ mov r12, [reg_p1+80] -+ mov r13, [reg_p1+88] -+ add r8, [reg_p1] -+ adc r9, [reg_p1+8] -+ adc r10, [reg_p1+16] -+ adc r11, [reg_p1+24] -+ adc r12, [reg_p1+32] -+ adc r13, [reg_p1+40] -+ push r15 -+ mov [rcx], r8 -+ mov [rcx+8], r9 -+ mov [rcx+16], r10 -+ mov [rcx+24], r11 -+ mov [rcx+32], r12 -+ mov [rcx+40], r13 -+ sbb rax, 0 -+ sub rsp, 96 // Allocating space in stack -+ -+ // rcx[6-11] <- BH+BL -+ xor rdx, rdx -+ mov r8, [reg_p2+48] -+ mov r9, [reg_p2+56] -+ mov r10, [reg_p2+64] -+ mov r11, [reg_p2+72] -+ mov r12, [reg_p2+80] -+ mov r13, [reg_p2+88] -+ add r8, [reg_p2] -+ adc r9, [reg_p2+8] -+ adc r10, [reg_p2+16] -+ adc r11, [reg_p2+24] -+ adc r12, [reg_p2+32] -+ adc r13, [reg_p2+40] -+ mov [rcx+48], r8 -+ mov [rcx+56], r9 -+ mov [rcx+64], r10 -+ mov [rcx+72], r11 -+ mov [rcx+80], r12 -+ mov [rcx+88], r13 -+ sbb rdx, 0 -+ mov [rsp+80], rax -+ mov [rsp+88], rdx -+ -+ // (rsp[0-8],r10,r8,r9) <- (AH+AL)*(BH+BL) -+
mov r11, [rcx] -+ mov rax, r8 -+ mul r11 -+ mov [rsp], rax // c0 -+ mov r14, rdx -+ -+ xor r15, r15 -+ mov rax, r9 -+ mul r11 -+ xor r9, r9 -+ add r14, rax -+ adc r9, rdx -+ -+ mov r12, [rcx+8] -+ mov rax, r8 -+ mul r12 -+ add r14, rax -+ mov [rsp+8], r14 // c1 -+ adc r9, rdx -+ adc r15, 0 -+ -+ xor r8, r8 -+ mov rax, r10 -+ mul r11 -+ add r9, rax -+ mov r13, [rcx+48] -+ adc r15, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+16] -+ mul r13 -+ add r9, rax -+ adc r15, rdx -+ mov rax, [rcx+56] -+ adc r8, 0 -+ -+ mul r12 -+ add r9, rax -+ mov [rsp+16], r9 // c2 -+ adc r15, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [rcx+72] -+ mul r11 -+ add r15, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+24] -+ mul r13 -+ add r15, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, r10 -+ mul r12 -+ add r15, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r14, [rcx+16] -+ mov rax, [rcx+56] -+ mul r14 -+ add r15, rax -+ mov [rsp+24], r15 // c3 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [rcx+80] -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+64] -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r15, [rcx+48] -+ mov rax, [rcx+32] -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+72] -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r13, [rcx+24] -+ mov rax, [rcx+56] -+ mul r13 -+ add r8, rax -+ mov [rsp+32], r8 // c4 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [rcx+88] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+64] -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+72] -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+40] -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+80] -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r15, [rcx+32] -+ mov rax, [rcx+56] -+ mul r15 -+ add r9, rax -+ mov [rsp+40], r9 // c5 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, 
[rcx+64] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+88] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+80] -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r11, [rcx+40] -+ mov rax, [rcx+56] -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+72] -+ mul r13 -+ add r10, rax -+ mov [rsp+48], r10 // c6 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [rcx+88] -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+64] -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+80] -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [rcx+72] -+ mul r15 -+ add r8, rax -+ mov [rsp+56], r8 // c7 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [rcx+72] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+80] -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [rcx+88] -+ mul r13 -+ add r9, rax -+ mov [rsp+64], r9 // c8 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [rcx+88] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+80] -+ mul r11 -+ add r10, rax // c9 -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [rcx+88] -+ mul r11 -+ add r8, rax // c10 -+ adc r9, rdx // c11 -+ -+ mov rax, [rsp+88] -+ mov rdx, [rcx] -+ and r12, rax -+ and r14, rax -+ and rdx, rax -+ and r13, rax -+ and r15, rax -+ and r11, rax -+ mov rax, [rsp+48] -+ add rdx, rax -+ mov rax, [rsp+56] -+ adc r12, rax -+ mov rax, [rsp+64] -+ adc r14, rax -+ adc r13, r10 -+ adc r15, r8 -+ adc r11, r9 -+ mov rax, [rsp+80] -+ mov [rsp+48], rdx -+ mov [rsp+56], r12 -+ mov [rsp+64], r14 -+ mov [rsp+72], r13 -+ mov [rsp+80], r15 -+ mov [rsp+88], r11 -+ -+ mov r8, [rcx+48] -+ mov r9, [rcx+56] -+ mov r10, [rcx+64] -+ mov r11, [rcx+72] -+ mov r12, [rcx+80] -+ mov r13, [rcx+88] -+ and r8, rax -+ and r9, rax -+ and r10, rax -+ and r11, rax -+ and r12, rax -+ and r13, rax -+ mov rax, 
[rsp+48] -+ add r8, rax -+ mov rax, [rsp+56] -+ adc r9, rax -+ mov rax, [rsp+64] -+ adc r10, rax -+ mov rax, [rsp+72] -+ adc r11, rax -+ mov rax, [rsp+80] -+ adc r12, rax -+ mov rax, [rsp+88] -+ adc r13, rax -+ mov [rsp+48], r8 -+ mov [rsp+56], r9 -+ mov [rsp+72], r11 -+ -+ // rcx[0-11] <- AL*BL -+ mov r11, [reg_p1] -+ mov rax, [reg_p2] -+ mul r11 -+ xor r9, r9 -+ mov [rcx], rax // c0 -+ mov [rsp+64], r10 -+ mov r8, rdx -+ -+ mov rax, [reg_p2+8] -+ mul r11 -+ xor r10, r10 -+ add r8, rax -+ mov [rsp+80], r12 -+ adc r9, rdx -+ -+ mov r12, [reg_p1+8] -+ mov rax, [reg_p2] -+ mul r12 -+ add r8, rax -+ mov [rcx+8], r8 // c1 -+ adc r9, rdx -+ mov [rsp+88], r13 -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+16] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p2] -+ mov rax, [reg_p1+16] -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+8] -+ mul r12 -+ add r9, rax -+ mov [rcx+16], r9 // c2 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [reg_p2+24] -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p1+24] -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r14, [reg_p1+16] -+ mov rax, [reg_p2+8] -+ mul r14 -+ add r10, rax -+ mov [rcx+24], r10 // c3 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [reg_p2+32] -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p1+32] -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+24] -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r13, [reg_p1+24] -+ mov rax, [reg_p2+8] -+ mul r13 -+ add r8, rax -+ mov [rcx+32], r8 // c4 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+40] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r13 -+ add r9, rax 
-+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+24] -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r11, [reg_p1+40] -+ mov rax, [reg_p2] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+32] -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r15, [reg_p1+32] -+ mov rax, [reg_p2+8] -+ mul r15 -+ add r9, rax -+ mov [rcx+40], r9 // c5 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [reg_p2+16] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+40] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+32] -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+8] -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+24] -+ mul r13 -+ add r10, rax -+ mov [rcx+48], r10 // c6 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [reg_p2+40] -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+16] -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+32] -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+24] -+ mul r15 -+ add r8, rax -+ mov [rcx+56], r8 // c7 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+24] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+32] -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+40] -+ mul r13 -+ add r9, rax -+ mov [rcx+64], r9 // c8 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [reg_p2+40] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+32] -+ mul r11 -+ add r10, rax -+ mov [rcx+72], r10 // c9 -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+40] -+ mul r11 -+ add r8, rax -+ mov [rcx+80], r8 // c10 -+ adc r9, rdx -+ mov [rcx+88], r9 // c11 -+ -+ // rcx[12-23] <- AH*BH -+ mov r11, [reg_p1+48] -+ mov rax, [reg_p2+48] -+ mul r11 -+ xor r9, r9 -+ mov [rcx+96], rax // c0 -+ mov r8, 
rdx -+ -+ mov rax, [reg_p2+56] -+ mul r11 -+ xor r10, r10 -+ add r8, rax -+ adc r9, rdx -+ -+ mov r12, [reg_p1+56] -+ mov rax, [reg_p2+48] -+ mul r12 -+ add r8, rax -+ mov [rcx+104], r8 // c1 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+64] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p2+48] -+ mov rax, [reg_p1+64] -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+56] -+ mul r12 -+ add r9, rax -+ mov [rcx+112], r9 // c2 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov rax, [reg_p2+72] -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p1+72] -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+64] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r14, [reg_p1+64] -+ mov rax, [reg_p2+56] -+ mul r14 -+ add r10, rax -+ mov [rcx+120], r10 // c3 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [reg_p2+80] -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+64] -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r15, [reg_p1+80] -+ mov rax, r13 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+72] -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r13, [reg_p1+72] -+ mov rax, [reg_p2+56] -+ mul r13 -+ add r8, rax -+ mov [rcx+128], r8 // c4 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+88] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+64] -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+72] -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r11, [reg_p1+88] -+ mov rax, [reg_p2+48] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+80] -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+56] -+ mul r15 -+ add r9, rax -+ mov [rcx+136], r9 // c5 -+ adc r10, rdx -+ adc r8, 0 -+ -+ xor r9, r9 -+ mov 
rax, [reg_p2+64] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+88] -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+80] -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+56] -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov rax, [reg_p2+72] -+ mul r13 -+ add r10, rax -+ mov [rcx+144], r10 // c6 -+ adc r8, rdx -+ adc r9, 0 -+ -+ xor r10, r10 -+ mov rax, [reg_p2+88] -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+64] -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+80] -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov rax, [reg_p2+72] -+ mul r15 -+ add r8, rax -+ mov [rcx+152], r8 // c7 -+ adc r9, rdx -+ adc r10, 0 -+ -+ xor r8, r8 -+ mov rax, [reg_p2+72] -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+80] -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+88] -+ mul r13 -+ add r9, rax -+ mov [rcx+160], r9 // c8 -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rax, [reg_p2+88] -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ -+ mov rax, [reg_p2+80] -+ mul r11 -+ add r10, rax -+ mov [rcx+168], r10 // c9 -+ adc r8, rdx -+ -+ mov rax, [reg_p2+88] -+ mul r11 -+ add r8, rax -+ mov [rcx+176], r8 // c10 -+ adc rdx, 0 -+ mov [rcx+184], rdx // c11 -+ -+ // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL -+ mov r8, [rsp] -+ sub r8, [rcx] -+ mov r9, [rsp+8] -+ sbb r9, [rcx+8] -+ mov r10, [rsp+16] -+ sbb r10, [rcx+16] -+ mov r11, [rsp+24] -+ sbb r11, [rcx+24] -+ mov r12, [rsp+32] -+ sbb r12, [rcx+32] -+ mov r13, [rsp+40] -+ sbb r13, [rcx+40] -+ mov r14, [rsp+48] -+ sbb r14, [rcx+48] -+ mov r15, [rsp+56] -+ sbb r15, [rcx+56] -+ mov rax, [rsp+64] -+ sbb rax, [rcx+64] -+ mov rdx, [rsp+72] -+ sbb rdx, [rcx+72] -+ mov rdi, [rsp+80] -+ sbb rdi, [rcx+80] -+ mov rsi, [rsp+88] -+ sbb rsi, [rcx+88] -+ mov [rsp], rsi -+ -+ // [r8-r15,rax,rdx,rdi,[rsp]] <- 
(AH+AL)*(BH+BL) - AL*BL - AH*BH -+ mov rsi, [rcx+96] -+ sub r8, rsi -+ mov rsi, [rcx+104] -+ sbb r9, rsi -+ mov rsi, [rcx+112] -+ sbb r10, rsi -+ mov rsi, [rcx+120] -+ sbb r11, rsi -+ mov rsi, [rcx+128] -+ sbb r12, rsi -+ mov rsi, [rcx+136] -+ sbb r13, rsi -+ mov rsi, [rcx+144] -+ sbb r14, rsi -+ mov rsi, [rcx+152] -+ sbb r15, rsi -+ mov rsi, [rcx+160] -+ sbb rax, rsi -+ mov rsi, [rcx+168] -+ sbb rdx, rsi -+ mov rsi, [rcx+176] -+ sbb rdi, rsi -+ mov rsi, [rsp] -+ sbb rsi, [rcx+184] -+ -+ // Final result -+ add r8, [rcx+48] -+ mov [rcx+48], r8 -+ adc r9, [rcx+56] -+ mov [rcx+56], r9 -+ adc r10, [rcx+64] -+ mov [rcx+64], r10 -+ adc r11, [rcx+72] -+ mov [rcx+72], r11 -+ adc r12, [rcx+80] -+ mov [rcx+80], r12 -+ adc r13, [rcx+88] -+ mov [rcx+88], r13 -+ adc r14, [rcx+96] -+ mov [rcx+96], r14 -+ adc r15, [rcx+104] -+ mov [rcx+104], r15 -+ adc rax, [rcx+112] -+ mov [rcx+112], rax -+ adc rdx, [rcx+120] -+ mov [rcx+120], rdx -+ adc rdi, [rcx+128] -+ mov [rcx+128], rdi -+ adc rsi, [rcx+136] -+ mov [rcx+136], rsi -+ mov rax, [rcx+144] -+ adc rax, 0 -+ mov [rcx+144], rax -+ mov rax, [rcx+152] -+ adc rax, 0 -+ mov [rcx+152], rax -+ mov rax, [rcx+160] -+ adc rax, 0 -+ mov [rcx+160], rax -+ mov rax, [rcx+168] -+ adc rax, 0 -+ mov [rcx+168], rax -+ mov rax, [rcx+176] -+ adc rax, 0 -+ mov [rcx+176], rax -+ mov rax, [rcx+184] -+ adc rax, 0 -+ mov [rcx+184], rax -+ -+ add rsp, 96 // Restoring space in stack -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+#endif -+ -+ -+#ifdef _MULX_ -+ -+///////////////////////////////////////////////////////////////// MACRO -+// Schoolbook integer multiplication -+// Inputs: memory pointers M0 and M1 -+// Outputs: memory locations C, C+8, C+16, and regs T0:T7 -+// Temps: memory locations regs T7:T9 -+///////////////////////////////////////////////////////////////// -+#ifdef _ADX_ -+ -+.macro MUL256x448_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10 -+ mov rdx, \M0 -+ mulx \T0, \T1, \M1 -+ mulx \T2, \T3, 8\M1 -+ mov \C, \T1 // 
C0_final -+ xor rax, rax -+ mulx \T4, \T5, 16\M1 -+ adox \T0, \T3 -+ adox \T2, \T5 -+ mulx \T1, \T3, 24\M1 -+ adox \T4, \T3 -+ mulx \T5, \T6, 32\M1 -+ adox \T1, \T6 -+ mulx \T3, \T7, 40\M1 -+ adox \T5, \T7 -+ mulx \T6, \T8, 48\M1 -+ adox \T3, \T8 -+ adox \T6, rax -+ -+ mov rdx, 8\M0 -+ mulx \T8, \T7, \M1 -+ xor rax, rax -+ adcx \T0, \T7 -+ mov 8\C, \T0 // C1_final -+ adcx \T2, \T8 -+ mulx \T7, \T8, 8\M1 -+ adox \T2, \T8 -+ adcx \T4, \T7 -+ mulx \T0, \T8, 16\M1 -+ adox \T4, \T8 -+ adcx \T0, \T1 -+ mulx \T1, \T7, 24\M1 -+ adcx \T1, \T5 -+ mulx \T5, \T8, 32\M1 -+ adcx \T3, \T5 -+ mulx \T5, \T9, 40\M1 -+ adcx \T6, \T5 -+ mulx \T5, rdx, 48\M1 -+ adcx \T5, rax -+ -+ adox \T0, \T7 -+ adox \T1, \T8 -+ adox \T3, \T9 -+ adox \T6, rdx -+ adox \T5, rax -+ -+ mov rdx, 16\M0 -+ mulx \T8, \T7, \M1 -+ xor rax, rax -+ adcx \T2, \T7 -+ mov 16\C, \T2 // C2_final -+ adcx \T4, \T8 -+ mulx \T8, \T7, 8\M1 -+ adox \T4, \T7 -+ adcx \T0, \T8 -+ mulx \T2, \T8, 16\M1 -+ adox \T0, \T8 -+ adcx \T1, \T2 -+ mulx \T2, \T7, 24\M1 -+ adcx \T3, \T2 -+ mulx \T2, \T8, 32\M1 -+ adcx \T6, \T2 -+ mulx \T2, \T9, 40\M1 -+ adcx \T5, \T2 -+ mulx \T2, rdx, 48\M1 -+ adcx \T2, rax -+ -+ adox \T1, \T7 -+ adox \T3, \T8 -+ adox \T6, \T9 -+ adox \T5, rdx -+ adox \T2, rax -+ -+ mov rdx, 24\M0 -+ mulx \T8, \T7, \M1 -+ xor rax, rax -+ adcx \T7, \T4 -+ adcx \T0, \T8 -+ mulx \T8, \T10, 8\M1 -+ adox \T0, \T10 -+ adcx \T1, \T8 -+ mulx \T4, \T8, 16\M1 -+ adox \T1, \T8 -+ adcx \T3, \T4 -+ mulx \T4, \T10, 24\M1 -+ adcx \T6, \T4 -+ mulx \T4, \T8, 32\M1 -+ adcx \T5, \T4 -+ mulx \T4, \T9, 40\M1 -+ adcx \T2, \T4 -+ mulx \T4, rdx, 48\M1 -+ adcx \T4, rax -+ -+ adox \T3, \T10 -+ adox \T6, \T8 -+ adox \T5, \T9 -+ adox \T2, rdx -+ adox \T4, rax -+.endm -+ -+#else -+ -+.macro MUL256x448_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10 -+ mov rdx, \M0 -+ mulx \T0, \T1, \M1 -+ mulx \T2, \T3, 8\M1 -+ mov \C, \T1 // C0_final -+ xor rax, rax -+ mulx \T4, \T5, 16\M1 -+ add \T0, \T3 -+ adc \T2, \T5 -+ mulx \T1, \T3, 24\M1 -+ adc 
\T4, \T3 -+ mulx \T5, \T6, 32\M1 -+ adc \T1, \T6 -+ mulx \T3, \T7, 40\M1 -+ adc \T5, \T7 -+ mulx \T6, \T8, 48\M1 -+ adc \T3, \T8 -+ adc \T6, rax -+ -+ mov rdx, 8\M0 -+ mulx \T8, \T7, \M1 -+ add \T0, \T7 -+ mov 8\C, \T0 // C1_final -+ adc \T2, \T8 -+ mulx \T7, \T8, 8\M1 -+ mov 32\C, \T8 // store -+ adc \T4, \T7 -+ mulx \T0, \T8, 16\M1 -+ mov 40\C, \T8 // store -+ adc \T0, \T1 -+ mulx \T1, \T7, 24\M1 -+ adc \T1, \T5 -+ mulx \T5, \T8, 32\M1 -+ adc \T3, \T5 -+ mulx \T5, \T9, 40\M1 -+ adc \T6, \T5 -+ mulx \T5, rdx, 48\M1 -+ adc \T5, rax -+ -+ xor rax, rax -+ add \T2, 32\C -+ adc \T4, 40\C -+ adc \T0, \T7 -+ adc \T1, \T8 -+ adc \T3, \T9 -+ adc \T6, rdx -+ adc \T5, rax -+ -+ mov rdx, 16\M0 -+ mulx \T8, \T7, \M1 -+ add \T2, \T7 -+ mov 16\C, \T2 // C2_final -+ adc \T4, \T8 -+ mulx \T8, \T7, 8\M1 -+ mov 32\C, \T7 // store -+ adc \T0, \T8 -+ mulx \T2, \T8, 16\M1 -+ mov 40\C, \T8 // store -+ adc \T1, \T2 -+ mulx \T2, \T7, 24\M1 -+ adc \T3, \T2 -+ mulx \T2, \T8, 32\M1 -+ adc \T6, \T2 -+ mulx \T2, \T9, 40\M1 -+ adc \T5, \T2 -+ mulx \T2, rdx, 48\M1 -+ adc \T2, rax -+ -+ xor rax, rax -+ add \T4, 32\C -+ adc \T0, 40\C -+ adc \T1, \T7 -+ adc \T3, \T8 -+ adc \T6, \T9 -+ adc \T5, rdx -+ adc \T2, rax -+ -+ mov rdx, 24\M0 -+ mulx \T8, \T7, \M1 -+ add \T7, \T4 -+ adc \T0, \T8 -+ mulx \T8, \T10, 8\M1 -+ mov 32\C, \T10 // store -+ adc \T1, \T8 -+ mulx \T4, \T8, 16\M1 -+ mov 40\C, \T8 // store -+ adc \T3, \T4 -+ mulx \T4, \T10, 24\M1 -+ adc \T6, \T4 -+ mulx \T4, \T8, 32\M1 -+ adc \T5, \T4 -+ mulx \T4, \T9, 40\M1 -+ adc \T2, \T4 -+ mulx \T4, rdx, 48\M1 -+ adc \T4, rax -+ -+ xor rax, rax -+ add \T0, 32\C -+ adc \T1, 40\C -+ adc \T3, \T10 -+ adc \T6, \T8 -+ adc \T5, \T9 -+ adc \T2, rdx -+ adc \T4, rax -+.endm -+ -+#endif -+ -+ -+//************************************************************************************** -+// Montgomery reduction -+// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 -+// Operation: c [reg_p2] = a [reg_p1] -+// NOTE: a=c is not allowed -+//************************************************************************************** -+.global rdc751_asm -+rdc751_asm: -+ push rbx -+ push rbp -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 -+ MUL256x448_SCHOOL [reg_p1], [p751p1_nz], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 -+ -+ xor r15, r15 -+ mov rax, [reg_p2+48] -+ mov rdx, [reg_p2+56] -+ mov rbx, [reg_p2+64] -+ add rax, [reg_p1+40] -+ adc rdx, [reg_p1+48] -+ adc rbx, [reg_p1+56] -+ mov [reg_p1+40], rax -+ mov [reg_p1+48], rdx -+ mov [reg_p1+56], rbx -+ adc rbp, [reg_p1+64] -+ adc r8, [reg_p1+72] -+ adc r9, [reg_p1+80] -+ adc r10, [reg_p1+88] -+ adc r11, [reg_p1+96] -+ adc r12, [reg_p1+104] -+ adc r13, [reg_p1+112] -+ adc r14, [reg_p1+120] -+ adc r15, [reg_p1+128] -+ mov [reg_p1+64], rbp -+ mov [reg_p1+72], r8 -+ mov [reg_p1+80], r9 -+ mov [reg_p1+88], r10 -+ mov [reg_p1+96], r11 -+ mov [reg_p1+104], r12 -+ mov [reg_p1+112], r13 -+ mov [reg_p1+120], r14 -+ mov [reg_p1+128], r15 -+ mov r8, [reg_p1+136] -+ mov r9, [reg_p1+144] -+ mov r10, [reg_p1+152] -+ mov r11, [reg_p1+160] -+ mov r12, [reg_p1+168] -+ mov r13, [reg_p1+176] -+ mov r14, [reg_p1+184] -+ adc r8, 0 -+ adc r9, 0 -+ adc r10, 0 -+ adc r11, 0 -+ adc r12, 0 -+ adc r13, 0 -+ adc r14, 0 -+ mov [reg_p1+136], r8 -+ mov [reg_p1+144], r9 -+ mov [reg_p1+152], r10 -+ mov [reg_p1+160], r11 -+ mov [reg_p1+168], r12 -+ mov [reg_p1+176], r13 -+ mov [reg_p1+184], r14 -+ -+ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 -+ MUL256x448_SCHOOL [reg_p1+32], [p751p1_nz], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 -+ -+ xor r15, r15 -+ mov rax, [reg_p2+48] -+ mov rdx, [reg_p2+56] -+ mov rbx, [reg_p2+64] -+ add rax, [reg_p1+72] -+ adc rdx, [reg_p1+80] -+ adc rbx, [reg_p1+88] -+ mov 
[reg_p1+72], rax -+ mov [reg_p1+80], rdx -+ mov [reg_p1+88], rbx -+ adc rbp, [reg_p1+96] -+ adc r8, [reg_p1+104] -+ adc r9, [reg_p1+112] -+ adc r10, [reg_p1+120] -+ adc r11, [reg_p1+128] -+ adc r12, [reg_p1+136] -+ adc r13, [reg_p1+144] -+ adc r14, [reg_p1+152] -+ adc r15, [reg_p1+160] -+ mov [reg_p2], rbp // Final result c0 -+ mov [reg_p1+104], r8 -+ mov [reg_p1+112], r9 -+ mov [reg_p1+120], r10 -+ mov [reg_p1+128], r11 -+ mov [reg_p1+136], r12 -+ mov [reg_p1+144], r13 -+ mov [reg_p1+152], r14 -+ mov [reg_p1+160], r15 -+ mov r12, [reg_p1+168] -+ mov r13, [reg_p1+176] -+ mov r14, [reg_p1+184] -+ adc r12, 0 -+ adc r13, 0 -+ adc r14, 0 -+ mov [reg_p1+168], r12 -+ mov [reg_p1+176], r13 -+ mov [reg_p1+184], r14 -+ -+ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 -+ MUL256x448_SCHOOL [reg_p1+64], [p751p1_nz], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 -+ -+ // Final result c1:c11 -+ mov rax, [reg_p2+48] -+ mov rdx, [reg_p2+56] -+ mov rbx, [reg_p2+64] -+ add rax, [reg_p1+104] -+ adc rdx, [reg_p1+112] -+ adc rbx, [reg_p1+120] -+ mov [reg_p2+8], rax -+ mov [reg_p2+16], rdx -+ mov [reg_p2+24], rbx -+ adc rbp, [reg_p1+128] -+ adc r8, [reg_p1+136] -+ adc r9, [reg_p1+144] -+ adc r10, [reg_p1+152] -+ adc r11, [reg_p1+160] -+ adc r12, [reg_p1+168] -+ adc r13, [reg_p1+176] -+ adc r14, [reg_p1+184] -+ mov [reg_p2+32], rbp -+ mov [reg_p2+40], r8 -+ mov [reg_p2+48], r9 -+ mov [reg_p2+56], r10 -+ mov [reg_p2+64], r11 -+ mov [reg_p2+72], r12 -+ mov [reg_p2+80], r13 -+ mov [reg_p2+88], r14 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ pop rbp -+ pop rbx -+ ret -+ -+ #else -+ -+//*********************************************************************** -+// Montgomery reduction -+// Based on comba method -+// Operation: c [reg_p2] = a [reg_p1] -+// NOTE: a=c is not allowed -+//*********************************************************************** -+.global rdc751_asm -+rdc751_asm: -+ push r12 -+ push r13 -+ push r14 -+ 
push r15 -+ -+ mov r11, [reg_p1] -+ movq rax, p751p1_5 -+ mul r11 -+ xor r8, r8 -+ add rax, [reg_p1+40] -+ mov [reg_p2+40], rax // z5 -+ adc r8, rdx -+ -+ xor r9, r9 -+ movq rax, p751p1_6 -+ mul r11 -+ xor r10, r10 -+ add r8, rax -+ adc r9, rdx -+ -+ mov r12, [reg_p1+8] -+ movq rax, p751p1_5 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+48] -+ mov [reg_p2+48], r8 // z6 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p751p1_7 -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_6 -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p1+16] -+ movq rax, p751p1_5 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+56] -+ mov [reg_p2+56], r9 // z7 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p751p1_8 -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_7 -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_6 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r14, [reg_p1+24] -+ movq rax, p751p1_5 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+64] -+ mov [reg_p2+64], r10 // z8 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p751p1_9 -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_8 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_7 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_6 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r15, [reg_p1+32] -+ movq rax, p751p1_5 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+72] -+ mov [reg_p2+72], r8 // z9 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p751p1_10 -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_9 -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_8 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ 
-+ movq rax, p751p1_7 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_6 -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rcx, [reg_p2+40] -+ movq rax, p751p1_5 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+80] -+ mov [reg_p2+80], r9 // z10 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p751p1_11 -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_10 -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_9 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_8 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_7 -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_6 -+ mul rcx -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r11, [reg_p2+48] -+ movq rax, p751p1_5 -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+88] -+ mov [reg_p2+88], r10 // z11 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p751p1_11 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_10 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_9 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_8 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_7 -+ mul rcx -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_6 -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r12, [reg_p2+56] -+ movq rax, p751p1_5 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+96] -+ mov [reg_p2], r8 // z0 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p751p1_11 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_10 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_9 -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_8 -+ mul rcx -+ add 
r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_7 -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_6 -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov r13, [reg_p2+64] -+ movq rax, p751p1_5 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+104] -+ mov [reg_p2+8], r9 // z1 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p751p1_11 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_10 -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_9 -+ mul rcx -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_8 -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_7 -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_6 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ mov r14, [reg_p2+72] -+ movq rax, p751p1_5 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+112] -+ mov [reg_p2+16], r10 // z2 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p751p1_11 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_10 -+ mul rcx -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_9 -+ mul r11 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_8 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_7 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_6 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ mov r15, [reg_p2+80] -+ movq rax, p751p1_5 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+120] -+ mov [reg_p2+24], r8 // z3 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p751p1_11 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_10 -+ mul r11 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_9 -+ mul r12 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 
-+ -+ movq rax, p751p1_8 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_7 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_6 -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ mov rcx, [reg_p2+88] -+ movq rax, p751p1_5 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+128] -+ mov [reg_p2+32], r9 // z4 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p751p1_11 -+ mul r11 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_10 -+ mul r12 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_9 -+ mul r13 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_8 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_7 -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_6 -+ mul rcx -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+136] -+ mov [reg_p2+40], r10 // z5 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p751p1_11 -+ mul r12 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_10 -+ mul r13 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_9 -+ mul r14 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_8 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_7 -+ mul rcx -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+144] -+ mov [reg_p2+48], r8 // z6 -+ adc r9, 0 -+ adc r10, 0 -+ -+ xor r8, r8 -+ movq rax, p751p1_11 -+ mul r13 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_10 -+ mul r14 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_9 -+ mul r15 -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ -+ movq rax, p751p1_8 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ adc r8, 0 -+ add r9, [reg_p1+152] -+ mov [reg_p2+56], r9 // z7 -+ adc r10, 0 -+ adc r8, 0 -+ -+ xor r9, r9 -+ movq rax, p751p1_11 -+ mul r14 -+ add r10, rax -+ adc r8, rdx -+ 
adc r9, 0 -+ -+ movq rax, p751p1_10 -+ mul r15 -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ -+ movq rax, p751p1_9 -+ mul rcx -+ add r10, rax -+ adc r8, rdx -+ adc r9, 0 -+ add r10, [reg_p1+160] -+ mov [reg_p2+64], r10 // z8 -+ adc r8, 0 -+ adc r9, 0 -+ -+ xor r10, r10 -+ movq rax, p751p1_11 -+ mul r15 -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ -+ movq rax, p751p1_10 -+ mul rcx -+ add r8, rax -+ adc r9, rdx -+ adc r10, 0 -+ add r8, [reg_p1+168] // z9 -+ mov [reg_p2+72], r8 // z9 -+ adc r9, 0 -+ adc r10, 0 -+ -+ movq rax, p751p1_11 -+ mul rcx -+ add r9, rax -+ adc r10, rdx -+ add r9, [reg_p1+176] // z10 -+ mov [reg_p2+80], r9 // z10 -+ adc r10, 0 -+ add r10, [reg_p1+184] // z11 -+ mov [reg_p2+88], r10 // z11 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -+ -+ #endif -+ -+ -+//*********************************************************************** -+// 751-bit multiprecision addition -+// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] -+//*********************************************************************** -+.global mp_add751_asm -+mp_add751_asm: -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov rax, [reg_p1+32] -+ mov rcx, [reg_p1+40] -+ add r8, [reg_p2] -+ adc r9, [reg_p2+8] -+ adc r10, [reg_p2+16] -+ adc r11, [reg_p2+24] -+ adc rax, [reg_p2+32] -+ adc rcx, [reg_p2+40] -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], rax -+ mov [reg_p3+40], rcx -+ -+ mov r8, [reg_p1+48] -+ mov r9, [reg_p1+56] -+ mov r10, [reg_p1+64] -+ mov r11, [reg_p1+72] -+ mov rax, [reg_p1+80] -+ mov rcx, [reg_p1+88] -+ adc r8, [reg_p2+48] -+ adc r9, [reg_p2+56] -+ adc r10, [reg_p2+64] -+ adc r11, [reg_p2+72] -+ adc rax, [reg_p2+80] -+ adc rcx, [reg_p2+88] -+ mov [reg_p3+48], r8 -+ mov [reg_p3+56], r9 -+ mov [reg_p3+64], r10 -+ mov [reg_p3+72], r11 -+ mov [reg_p3+80], rax -+ mov [reg_p3+88], rcx -+ ret -+ -+ 
-+//*********************************************************************** -+// 2x751-bit multiprecision subtraction -+// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. Returns borrow mask -+//*********************************************************************** -+.global mp_sub751x2_asm -+mp_sub751x2_asm: -+ xor rax, rax -+ mov r8, [reg_p1] -+ mov r9, [reg_p1+8] -+ mov r10, [reg_p1+16] -+ mov r11, [reg_p1+24] -+ mov rcx, [reg_p1+32] -+ sub r8, [reg_p2] -+ sbb r9, [reg_p2+8] -+ sbb r10, [reg_p2+16] -+ sbb r11, [reg_p2+24] -+ sbb rcx, [reg_p2+32] -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], rcx -+ -+ mov r8, [reg_p1+40] -+ mov r9, [reg_p1+48] -+ mov r10, [reg_p1+56] -+ mov r11, [reg_p1+64] -+ mov rcx, [reg_p1+72] -+ sbb r8, [reg_p2+40] -+ sbb r9, [reg_p2+48] -+ sbb r10, [reg_p2+56] -+ sbb r11, [reg_p2+64] -+ sbb rcx, [reg_p2+72] -+ mov [reg_p3+40], r8 -+ mov [reg_p3+48], r9 -+ mov [reg_p3+56], r10 -+ mov [reg_p3+64], r11 -+ mov [reg_p3+72], rcx -+ -+ mov r8, [reg_p1+80] -+ mov r9, [reg_p1+88] -+ mov r10, [reg_p1+96] -+ mov r11, [reg_p1+104] -+ mov rcx, [reg_p1+112] -+ sbb r8, [reg_p2+80] -+ sbb r9, [reg_p2+88] -+ sbb r10, [reg_p2+96] -+ sbb r11, [reg_p2+104] -+ sbb rcx, [reg_p2+112] -+ mov [reg_p3+80], r8 -+ mov [reg_p3+88], r9 -+ mov [reg_p3+96], r10 -+ mov [reg_p3+104], r11 -+ mov [reg_p3+112], rcx -+ -+ mov r8, [reg_p1+120] -+ mov r9, [reg_p1+128] -+ mov r10, [reg_p1+136] -+ mov r11, [reg_p1+144] -+ mov rcx, [reg_p1+152] -+ sbb r8, [reg_p2+120] -+ sbb r9, [reg_p2+128] -+ sbb r10, [reg_p2+136] -+ sbb r11, [reg_p2+144] -+ sbb rcx, [reg_p2+152] -+ mov [reg_p3+120], r8 -+ mov [reg_p3+128], r9 -+ mov [reg_p3+136], r10 -+ mov [reg_p3+144], r11 -+ mov [reg_p3+152], rcx -+ -+ mov r8, [reg_p1+160] -+ mov r9, [reg_p1+168] -+ mov r10, [reg_p1+176] -+ mov r11, [reg_p1+184] -+ sbb r8, [reg_p2+160] -+ sbb r9, [reg_p2+168] -+ sbb r10, [reg_p2+176] -+ sbb r11, [reg_p2+184] -+ sbb rax, 0 -+ mov [reg_p3+160], r8 
-+ mov [reg_p3+168], r9 -+ mov [reg_p3+176], r10 -+ mov [reg_p3+184], r11 -+ ret -+ -+ -+//*********************************************************************** -+// Double 2x751-bit multiprecision subtraction -+// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] -+//*********************************************************************** -+.global mp_dblsub751x2_asm -+mp_dblsub751x2_asm: -+ push r12 -+ push r13 -+ push r14 -+ push r15 -+ -+ xor rax, rax -+ mov r8, [reg_p3] -+ mov r9, [reg_p3+8] -+ mov r10, [reg_p3+16] -+ mov r11, [reg_p3+24] -+ mov r12, [reg_p3+32] -+ mov r13, [reg_p3+40] -+ mov r14, [reg_p3+48] -+ mov r15, [reg_p3+56] -+ sub r8, [reg_p1] -+ sbb r9, [reg_p1+8] -+ sbb r10, [reg_p1+16] -+ sbb r11, [reg_p1+24] -+ sbb r12, [reg_p1+32] -+ sbb r13, [reg_p1+40] -+ sbb r14, [reg_p1+48] -+ sbb r15, [reg_p1+56] -+ adc rax, 0 -+ sub r8, [reg_p2] -+ sbb r9, [reg_p2+8] -+ sbb r10, [reg_p2+16] -+ sbb r11, [reg_p2+24] -+ sbb r12, [reg_p2+32] -+ sbb r13, [reg_p2+40] -+ sbb r14, [reg_p2+48] -+ sbb r15, [reg_p2+56] -+ adc rax, 0 -+ mov [reg_p3], r8 -+ mov [reg_p3+8], r9 -+ mov [reg_p3+16], r10 -+ mov [reg_p3+24], r11 -+ mov [reg_p3+32], r12 -+ mov [reg_p3+40], r13 -+ mov [reg_p3+48], r14 -+ mov [reg_p3+56], r15 -+ -+ xor rcx, rcx -+ mov r8, [reg_p3+64] -+ mov r9, [reg_p3+72] -+ mov r10, [reg_p3+80] -+ mov r11, [reg_p3+88] -+ mov r12, [reg_p3+96] -+ mov r13, [reg_p3+104] -+ mov r14, [reg_p3+112] -+ mov r15, [reg_p3+120] -+ sub r8, rax -+ sbb r8, [reg_p1+64] -+ sbb r9, [reg_p1+72] -+ sbb r10, [reg_p1+80] -+ sbb r11, [reg_p1+88] -+ sbb r12, [reg_p1+96] -+ sbb r13, [reg_p1+104] -+ sbb r14, [reg_p1+112] -+ sbb r15, [reg_p1+120] -+ adc rcx, 0 -+ sub r8, [reg_p2+64] -+ sbb r9, [reg_p2+72] -+ sbb r10, [reg_p2+80] -+ sbb r11, [reg_p2+88] -+ sbb r12, [reg_p2+96] -+ sbb r13, [reg_p2+104] -+ sbb r14, [reg_p2+112] -+ sbb r15, [reg_p2+120] -+ adc rcx, 0 -+ mov [reg_p3+64], r8 -+ mov [reg_p3+72], r9 -+ mov [reg_p3+80], r10 -+ mov [reg_p3+88], r11 -+ mov [reg_p3+96], 
r12 -+ mov [reg_p3+104], r13 -+ mov [reg_p3+112], r14 -+ mov [reg_p3+120], r15 -+ -+ mov r8, [reg_p3+128] -+ mov r9, [reg_p3+136] -+ mov r10, [reg_p3+144] -+ mov r11, [reg_p3+152] -+ mov r12, [reg_p3+160] -+ mov r13, [reg_p3+168] -+ mov r14, [reg_p3+176] -+ mov r15, [reg_p3+184] -+ sub r8, rcx -+ sbb r8, [reg_p1+128] -+ sbb r9, [reg_p1+136] -+ sbb r10, [reg_p1+144] -+ sbb r11, [reg_p1+152] -+ sbb r12, [reg_p1+160] -+ sbb r13, [reg_p1+168] -+ sbb r14, [reg_p1+176] -+ sbb r15, [reg_p1+184] -+ sub r8, [reg_p2+128] -+ sbb r9, [reg_p2+136] -+ sbb r10, [reg_p2+144] -+ sbb r11, [reg_p2+152] -+ sbb r12, [reg_p2+160] -+ sbb r13, [reg_p2+168] -+ sbb r14, [reg_p2+176] -+ sbb r15, [reg_p2+184] -+ mov [reg_p3+128], r8 -+ mov [reg_p3+136], r9 -+ mov [reg_p3+144], r10 -+ mov [reg_p3+152], r11 -+ mov [reg_p3+160], r12 -+ mov [reg_p3+168], r13 -+ mov [reg_p3+176], r14 -+ mov [reg_p3+184], r15 -+ -+ pop r15 -+ pop r14 -+ pop r13 -+ pop r12 -+ ret -diff --git a/third_party/sidh/src/P751/ARM64/fp_arm64.c b/third_party/sidh/src/P751/ARM64/fp_arm64.c -new file mode 100644 -index 00000000..096e11ed ---- /dev/null -+++ b/third_party/sidh/src/P751/ARM64/fp_arm64.c -@@ -0,0 +1,93 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P751 -+*********************************************************************************************/ -+ -+#include "../P751_internal.h" -+ -+// Global constants -+extern const uint64_t p751[NWORDS_FIELD]; -+extern const uint64_t p751x2[NWORDS_FIELD]; -+ -+ -+__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular addition, c = a+b mod p751. 
-+ // Inputs: a, b in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ -+ fpadd751_asm(a, b, c); -+} -+ -+ -+__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular subtraction, c = a-b mod p751. -+ // Inputs: a, b in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ -+ fpsub751_asm(a, b, c); -+} -+ -+ -+__inline void fpneg751(digit_t* a) -+{ // Modular negation, a = -a mod p751. -+ // Input/output: a in [0, 2*p751-1] -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, ((digit_t*)p751x2)[i], a[i], borrow, a[i]); -+ } -+} -+ -+ -+void fpdiv2_751(const digit_t* a, digit_t* c) -+{ // Modular division by two, c = a/2 mod p751. -+ // Input : a in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p521 -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], ((digit_t*)p751)[i] & mask, carry, c[i]); -+ } -+ -+ mp_shiftr1(c, NWORDS_FIELD); -+} -+ -+ -+void fpcorrection751(digit_t* a) -+{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], ((digit_t*)p751)[i], borrow, a[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, a[i], ((digit_t*)p751)[i] & mask, borrow, a[i]); -+ } -+} -+ -+ -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. -+ -+ UNREFERENCED_PARAMETER(nwords); -+ -+ mul751_asm(a, b, c); -+} -+ -+ -+ -+void rdc_mont(const digit_t* ma, digit_t* mc) -+{ // Montgomery reduction exploiting special form of the prime. -+ // mc = ma*R^-1 mod p751x2, where R = 2^768. -+ // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1]. 
-+ // ma is assumed to be in Montgomery representation. -+ -+ rdc751_asm(ma, mc); -+} -diff --git a/third_party/sidh/src/P751/ARM64/fp_arm64_asm.S b/third_party/sidh/src/P751/ARM64/fp_arm64_asm.S -new file mode 100644 -index 00000000..995cb45a ---- /dev/null -+++ b/third_party/sidh/src/P751/ARM64/fp_arm64_asm.S -@@ -0,0 +1,2511 @@ -+//******************************************************************************************* -+// SIDH: an efficient supersingular isogeny cryptography library -+// -+// Author: David Urbanik; dburbani@uwaterloo.ca -+// -+// Abstract: Assembly optimizations for finite field arithmetic over P751 on 64-bit ARM. -+// -+// File was modified to allow inputs in [0, 2*p751-1]. -+//******************************************************************************************* -+ -+.data -+ -+// p751 + 1 -+p751p1: -+.quad 0xEEB0000000000000 -+.quad 0xE3EC968549F878A8 -+.quad 0xDA959B1A13F7CC76 -+.quad 0x084E9867D6EBE876 -+.quad 0x8562B5045CB25748 -+.quad 0x0E12909F97BADC66 -+.quad 0x00006FE5D541F71C -+ -+// p751 -+p751: -+.quad 0xFFFFFFFFFFFFFFFF -+.quad 0xEEAFFFFFFFFFFFFF -+.quad 0xE3EC968549F878A8 -+.quad 0xDA959B1A13F7CC76 -+.quad 0x084E9867D6EBE876 -+.quad 0x8562B5045CB25748 -+.quad 0x0E12909F97BADC66 -+.quad 0x00006FE5D541F71C -+ -+// 2 * p751 -+p751x2: -+.quad 0xFFFFFFFFFFFFFFFE -+.quad 0xFFFFFFFFFFFFFFFF -+.quad 0xDD5FFFFFFFFFFFFF -+.quad 0xC7D92D0A93F0F151 -+.quad 0xB52B363427EF98ED -+.quad 0x109D30CFADD7D0ED -+.quad 0x0AC56A08B964AE90 -+.quad 0x1C25213F2F75B8CD -+.quad 0x0000DFCBAA83EE38 -+ -+ -+.text -+//*********************************************************************** -+// Field addition -+// Operation: c [x2] = a [x0] + b [x1] -+//*********************************************************************** -+.global fpadd751_asm -+fpadd751_asm: -+ // Arguments are 3 pointers of type digit_t*, where the first two arguments are summands and the third is the result register. -+ // These arguments are stored in x0, x1, and x2 respectively. 
-+ -+ // load first summand into x3 - x14 -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x11, x12, [x0,#64] -+ ldp x13, x14, [x0,#80] -+ -+ // add first summand and second summand and store result in x3 - x14 -+ ldp x15, x16, [x1,#0] -+ ldp x17, x18, [x1,#16] -+ adds x3, x3, x15 -+ adcs x4, x4, x16 -+ adcs x5, x5, x17 -+ adcs x6, x6, x18 -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adcs x10, x10, x18 -+ ldp x15, x16, [x1,#64] -+ ldp x17, x18, [x1,#80] -+ adcs x11, x11, x15 -+ adcs x12, x12, x16 -+ adcs x13, x13, x17 -+ adcs x14, x14, x18 -+ -+ // subtract 2xp751 to the resut in x3 - x14 -+ ldr x16, p751x2 -+ subs x3, x3, x16 -+ ldr x15, p751x2 + 8 -+ sbcs x4, x4, x15 -+ sbcs x5, x5, x15 -+ sbcs x6, x6, x15 -+ sbcs x7, x7, x15 -+ ldr x16, p751x2 + 16 -+ ldr x17, p751x2 + 24 -+ sbcs x8, x8, x16 -+ ldr x18, p751x2 + 32 -+ sbcs x9, x9, x17 -+ ldr x16, p751x2 + 40 -+ sbcs x10, x10, x18 -+ ldr x17, p751x2 + 48 -+ sbcs x11, x11, x16 -+ ldr x18, p751x2 + 56 -+ sbcs x12, x12, x17 -+ ldr x15, p751x2 + 64 -+ sbcs x13, x13, x18 -+ sbcs x14, x14, x15 -+ sbc x15, xzr, xzr -+ -+ // add 2xp751 back but anded with the mask in x15 -+ ldr x16, p751x2 -+ and x16, x16, x15 -+ ldr x17, p751x2 + 8 -+ and x17, x17, x15 -+ ldr x18, p751x2 + 16 -+ and x18, x18, x15 -+ -+ adds x3, x3, x16 -+ adcs x4, x4, x17 -+ adcs x5, x5, x17 -+ adcs x6, x6, x17 -+ adcs x7, x7, x17 -+ adcs x8, x8, x18 -+ -+ ldr x16, p751x2 + 24 -+ and x16, x16, x15 -+ adcs x9, x9, x16 -+ -+ ldr x16, p751x2 + 32 -+ and x16, x16, x15 -+ ldr x17, p751x2 + 40 -+ and x17, x17, x15 -+ ldr x18, p751x2 + 48 -+ and x18, x18, x15 -+ -+ adcs x10, x10, x16 -+ adcs x11, x11, x17 -+ adcs x12, x12, x18 -+ -+ ldr x16, p751x2 + 56 -+ and x16, x16, x15 -+ ldr x17, p751x2 + 64 -+ and x17, x17, x15 -+ -+ adcs x13, x13, x16 -+ adcs x14, x14, x17 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] 
-+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ ret -+ -+ -+//*********************************************************************** -+// Field subtraction -+// Operation: c [x2] = a [x0] - b [x1] -+//*********************************************************************** -+.global fpsub751_asm -+fpsub751_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x11, x12, [x0,#64] -+ ldp x13, x14, [x0,#80] -+ -+ ldp x15, x16, [x1, #0] -+ subs x3, x3, x15 -+ sbcs x4, x4, x16 -+ ldp x15, x16, [x1, #16] -+ sbcs x5, x5, x15 -+ sbcs x6, x6, x16 -+ ldp x15, x16, [x1, #32] -+ sbcs x7, x7, x15 -+ sbcs x8, x8, x16 -+ ldp x15, x16, [x1, #48] -+ sbcs x9, x9, x15 -+ sbcs x10, x10, x16 -+ ldp x15, x16, [x1, #64] -+ sbcs x11, x11, x15 -+ sbcs x12, x12, x16 -+ ldp x15, x16, [x1, #80] -+ sbcs x13, x13, x15 -+ sbcs x14, x14, x16 -+ sbc x17, xzr, xzr -+ -+ ldr x15, p751x2 -+ and x15, x15, x17 -+ ldr x16, p751x2 + 8 -+ and x16, x16, x17 -+ ldr x18, p751x2 + 16 -+ and x18, x18, x17 -+ -+ adds x3, x3, x15 -+ adcs x4, x4, x16 -+ adcs x5, x5, x16 -+ adcs x6, x6, x16 -+ adcs x7, x7, x16 -+ adcs x8, x8, x18 -+ -+ ldr x15, p751x2 + 24 -+ and x15, x15, x17 -+ ldr x16, p751x2 + 32 -+ and x16, x16, x17 -+ -+ adcs x9, x9, x15 -+ adcs x10, x10, x16 -+ -+ ldr x15, p751x2 + 40 -+ and x15, x15, x17 -+ ldr x16, p751x2 + 48 -+ and x16, x16, x17 -+ -+ adcs x11, x11, x15 -+ adcs x12, x12, x16 -+ -+ ldr x15, p751x2 + 56 -+ and x15, x15, x17 -+ ldr x16, p751x2 + 64 -+ and x16, x16, x17 -+ -+ adcs x13, x13, x15 -+ adcs x14, x14, x16 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ ret -+ -+ -+//*********************************************************************** -+// Integer multiplication using Comba method -+// Operation: c [x2] = a [x0] * b [x1] 
-+//*********************************************************************** -+.global mul751_asm -+mul751_asm: -+ sub sp, sp, #80 -+ stp x19, x20, [sp] -+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ stp x25, x26, [sp, #48] -+ stp x27, x28, [sp, #64] -+ -+ ldp x3, x4, [x0, #0] -+ ldp x5, x6, [x1, #0] -+ mul x18, x3, x5 -+ umulh x17, x3, x5 -+ // c0 is now in x18 -+ -+ // a0 * b1 -+ mul x13, x3, x6 -+ umulh x14, x3, x6 -+ -+ adds x17, x17, x13 -+ adcs x16, x14, xzr -+ adcs x15, xzr, xzr -+ -+ // b0 * a1 -+ mul x13, x4, x5 -+ umulh x14, x4, x5 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // store c0 and c1 -+ stp x18, x17, [x2, #0] -+ -+ // load a2, a3, b2, b3 -+ ldp x7, x8, [x0, #16] -+ ldp x9, x10, [x1, #16] -+ -+ // a0 * b2 -+ mul x13, x3, x9 -+ umulh x14, x3, x9 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, xzr, xzr -+ -+ // a1 * b1 -+ mul x13, x4, x6 -+ umulh x14, x4, x6 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a2 * b0 -+ mul x13, x7, x5 -+ umulh x14, x7, x5 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // c2 is now in x16 -+ -+ // a0 * b3 -+ mul x13, x3, x10 -+ umulh x14, x3, x10 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, xzr, xzr -+ -+ // a1 * b2 -+ mul x13, x4, x9 -+ umulh x14, x4, x9 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a2 * b1 -+ mul x13, x7, x6 -+ umulh x14, x7, x6 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a3 * b0 -+ mul x13, x8, x5 -+ umulh x14, x8, x5 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // store c2 and c3 -+ stp x16, x15, [x2, #16] -+ -+ // a1 * b3 -+ mul x13, x4, x10 -+ umulh x14, x4, x10 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, xzr, xzr -+ -+ // a2 * b2 -+ mul x13, x7, x9 -+ umulh x14, x7, x9 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a3 * b1 -+ mul 
x13, x8, x6 -+ umulh x14, x8, x6 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // load a4, a5 -+ ldp x11, x12, [x0, #32] -+ -+ // a4 * b0 -+ mul x13, x11, x5 -+ umulh x14, x11, x5 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // load b4, b5 -+ ldp x19, x20, [x1, #32] -+ -+ // a0 * b4 -+ mul x13, x3, x19 -+ umulh x14, x3, x19 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // c4 is now in x18 -+ -+ // a0 * b5 -+ mul x13, x3, x20 -+ umulh x14, x3, x20 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, xzr, xzr -+ -+ // a1 * b4 -+ mul x13, x4, x19 -+ umulh x14, x4, x19 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a2 * b3 -+ mul x13, x7, x10 -+ umulh x14, x7, x10 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a3 * b2 -+ mul x13, x8, x9 -+ umulh x14, x8, x9 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a4 * b1 -+ mul x13, x11, x6 -+ umulh x14, x11, x6 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a5 * b0 -+ mul x13, x12, x5 -+ umulh x14, x12, x5 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // store c4 and c5 -+ stp x18, x17, [x2, #32] -+ -+ // load a6, a7 -+ ldp x21, x22, [x0, #48] -+ -+ // a6 * b0 -+ mul x13, x21, x5 -+ umulh x14, x21, x5 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, xzr, xzr -+ -+ // a5 * b1 -+ mul x13, x12, x6 -+ umulh x14, x12, x6 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a4 * b2 -+ mul x13, x11, x9 -+ umulh x14, x11, x9 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a3 * b3 -+ mul x13, x8, x10 -+ umulh x14, x8, x10 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a2 * b4 -+ mul x13, x7, x19 -+ umulh x14, x7, x19 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a1 * 
b5 -+ mul x13, x4, x20 -+ umulh x14, x4, x20 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // load b6, b7 -+ ldp x23, x24, [x1, #48] -+ -+ // a0 * b6 -+ mul x13, x3, x23 -+ umulh x14, x3, x23 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // c6 is now in x16 -+ -+ // a0 * b7 -+ mul x13, x3, x24 -+ umulh x14, x3, x24 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, xzr, xzr -+ -+ // a1 * b6 -+ mul x13, x4, x23 -+ umulh x14, x4, x23 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a2 * b5 -+ mul x13, x7, x20 -+ umulh x14, x7, x20 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a3 * b4 -+ mul x13, x8, x19 -+ umulh x14, x8, x19 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a4 * b3 -+ mul x13, x11, x10 -+ umulh x14, x11, x10 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a5 * b2 -+ mul x13, x12, x9 -+ umulh x14, x12, x9 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a6 * b1 -+ mul x13, x21, x6 -+ umulh x14, x21, x6 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a7 * b0 -+ mul x13, x22, x5 -+ umulh x14, x22, x5 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // store c6 and c7 -+ stp x16, x15, [x2, #48] -+ -+ // load a8, a9 -+ ldp x25, x26, [x0, #64] -+ -+ // a8 * b0 -+ mul x13, x25, x5 -+ umulh x14, x25, x5 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, xzr, xzr -+ -+ // a7 * b1 -+ mul x13, x22, x6 -+ umulh x14, x22, x6 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a6 * b2 -+ mul x13, x21, x9 -+ umulh x14, x21, x9 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a5 * b3 -+ mul x13, x12, x10 -+ umulh x14, x12, x10 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a4 * b4 -+ mul x13, x11, x19 -+ 
umulh x14, x11, x19 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a3 * b5 -+ mul x13, x8, x20 -+ umulh x14, x8, x20 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a2 * b6 -+ mul x13, x7, x23 -+ umulh x14, x7, x23 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a1 * b7 -+ mul x13, x4, x24 -+ umulh x14, x4, x24 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // load b8, b9 -+ ldp x27, x28, [x1, #64] -+ -+ // a0 * b8 -+ mul x13, x3, x27 -+ umulh x14, x3, x27 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // c8 is now in x18 -+ -+ // a0 * b9 -+ mul x13, x3, x28 -+ umulh x14, x3, x28 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, xzr, xzr -+ -+ // a1 * b8 -+ mul x13, x4, x27 -+ umulh x14, x4, x27 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a2 * b7 -+ mul x13, x7, x24 -+ umulh x14, x7, x24 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a3 * b6 -+ mul x13, x8, x23 -+ umulh x14, x8, x23 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a4 * b5 -+ mul x13, x11, x20 -+ umulh x14, x11, x20 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a5 * b4 -+ mul x13, x12, x19 -+ umulh x14, x12, x19 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a6 * b3 -+ mul x13, x21, x10 -+ umulh x14, x21, x10 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a7 * b2 -+ mul x13, x22, x9 -+ umulh x14, x22, x9 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a8 * b1 -+ mul x13, x25, x6 -+ umulh x14, x25, x6 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a9 * b0 -+ mul x13, x26, x5 -+ umulh x14, x26, x5 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // store c8 and c9 -+ stp x18, 
x17, [x2, #64] -+ -+ // load a10, a11; a0 and a1 unloaded -+ ldp x3, x4, [x0, #80] -+ -+ // a10 * b0 -+ mul x13, x3, x5 -+ umulh x14, x3, x5 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, xzr, xzr -+ -+ // a9 * b1 -+ mul x13, x26, x6 -+ umulh x14, x26, x6 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a8 * b2 -+ mul x13, x25, x9 -+ umulh x14, x25, x9 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a7 * b3 -+ mul x13, x22, x10 -+ umulh x14, x22, x10 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a6 * b4 -+ mul x13, x21, x19 -+ umulh x14, x21, x19 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a5 * b5 -+ mul x13, x12, x20 -+ umulh x14, x12, x20 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a4 * b6 -+ mul x13, x11, x23 -+ umulh x14, x11, x23 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a3 * b7 -+ mul x13, x8, x24 -+ umulh x14, x8, x24 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a2 * b8 -+ mul x13, x7, x27 -+ umulh x14, x7, x27 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // load a0, a1; b0 and b1 unloaded -+ ldp x5, x6, [x0, #0] -+ -+ // a1 * b9 -+ mul x13, x6, x28 -+ umulh x14, x6, x28 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // load b10, b11; a10 and a11 unloaded -+ ldp x3, x4, [x1, #80] -+ -+ // a0 * b10 -+ mul x13, x3, x5 -+ umulh x14, x3, x5 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // c10 now in x16 -+ -+ // a0 * b11 -+ mul x13, x4, x5 -+ umulh x14, x4, x5 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, xzr, xzr -+ -+ // a1 * b10 -+ mul x13, x3, x6 -+ umulh x14, x3, x6 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a2 * b9 -+ mul x13, x7, x28 -+ umulh x14, x7, x28 -+ -+ adds x15, x15, x13 -+ 
adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a3 * b8 -+ mul x13, x8, x27 -+ umulh x14, x8, x27 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a4 * b7 -+ mul x13, x11, x24 -+ umulh x14, x11, x24 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a5 * b6 -+ mul x13, x12, x23 -+ umulh x14, x12, x23 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a6 * b5 -+ mul x13, x21, x20 -+ umulh x14, x21, x20 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a7 * b4 -+ mul x13, x22, x19 -+ umulh x14, x22, x19 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a8 * b3 -+ mul x13, x25, x10 -+ umulh x14, x25, x10 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a9 * b2 -+ mul x13, x26, x9 -+ umulh x14, x26, x9 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // load a10, a11; b10 and b11 unloaded -+ ldp x3, x4, [x0, #80] -+ // load b0, b1; a0 and a1 unloaded -+ ldp x5, x6, [x1, #0] -+ -+ // a10 * b1 -+ mul x13, x3, x6 -+ umulh x14, x3, x6 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a11 * b0 -+ mul x13, x4, x5 -+ umulh x14, x4, x5 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // store c10 and c11 -+ stp x16, x15, [x2, #80] -+ -+ // a11 * b1 -+ mul x13, x4, x6 -+ umulh x14, x4, x6 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, xzr, xzr -+ -+ // a10 * b2 -+ mul x13, x9, x3 -+ umulh x14, x9, x3 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a9 * b3 -+ mul x13, x26, x10 -+ umulh x14, x26, x10 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a8 * b4 -+ mul x13, x25, x19 -+ umulh x14, x25, x19 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a7 * b5 -+ mul x13, x22, x20 -+ umulh x14, x22, x20 -+ -+ adds x18, x18, x13 -+ adcs x17, 
x17, x14 -+ adcs x16, x16, xzr -+ -+ // a6 * b6 -+ mul x13, x21, x23 -+ umulh x14, x21, x23 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a5 * b7 -+ mul x13, x12, x24 -+ umulh x14, x12, x24 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a4 * b8 -+ mul x13, x11, x27 -+ umulh x14, x11, x27 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a3 * b9 -+ mul x13, x8, x28 -+ umulh x14, x8, x28 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // load b10, b11; a10 and a11 unloaded -+ ldp x3, x4, [x1, #80] -+ // load a0, a1; b0 and b1 unloaded -+ ldp x5, x6, [x0, #0] -+ -+ // a2 * b10 -+ mul x13, x7, x3 -+ umulh x14, x7, x3 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a1 * b11 -+ mul x13, x6, x4 -+ umulh x14, x6, x4 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // c12 now in x18 -+ -+ // a2 * b11 -+ mul x13, x7, x4 -+ umulh x14, x7, x4 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, xzr, xzr -+ -+ // a3 * b10 -+ mul x13, x8, x3 -+ umulh x14, x8, x3 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a4 * b9 -+ mul x13, x11, x28 -+ umulh x14, x11, x28 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a5 * b8 -+ mul x13, x12, x27 -+ umulh x14, x12, x27 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a6 * b7 -+ mul x13, x21, x24 -+ umulh x14, x21, x24 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a7 * b6 -+ mul x13, x22, x23 -+ umulh x14, x22, x23 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a8 * b5 -+ mul x13, x25, x20 -+ umulh x14, x25, x20 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a9 * b4 -+ mul x13, x26, x19 -+ umulh x14, x26, x19 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ 
// load a10, a11; a0 and a1 unloaded -+ ldp x5, x6, [x0, #80] -+ -+ // a10 * b3 -+ mul x13, x5, x10 -+ umulh x14, x5, x10 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a11 * b2 -+ mul x13, x6, x9 -+ umulh x14, x6, x9 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // store c12 and c13 -+ stp x18, x17, [x2, #96] -+ -+ // a11 * b3 -+ mul x13, x6, x10 -+ umulh x14, x6, x10 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, xzr, xzr -+ -+ // a10 * b4 -+ mul x13, x5, x19 -+ umulh x14, x5, x19 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a9 * b5 -+ mul x13, x26, x20 -+ umulh x14, x26, x20 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a8 * b6 -+ mul x13, x25, x23 -+ umulh x14, x25, x23 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a7 * b7 -+ mul x13, x22, x24 -+ umulh x14, x22, x24 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a6 * b8 -+ mul x13, x21, x27 -+ umulh x14, x21, x27 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a5 * b9 -+ mul x13, x12, x28 -+ umulh x14, x12, x28 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a4 * b10 -+ mul x13, x11, x3 -+ umulh x14, x11, x3 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a3 * b11 -+ mul x13, x8, x4 -+ umulh x14, x8, x4 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // c14 is now in x16 -+ -+ // a4 * b11 -+ mul x13, x11, x4 -+ umulh x14, x11, x4 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, xzr, xzr -+ -+ // a5 * b10 -+ mul x13, x12, x3 -+ umulh x14, x12, x3 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a6 * b9 -+ mul x13, x21, x28 -+ umulh x14, x21, x28 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a7 * b8 -+ mul x13, x22, x27 -+ umulh 
x14, x22, x27 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a8 * b7 -+ mul x13, x25, x24 -+ umulh x14, x25, x24 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a9 * b6 -+ mul x13, x26, x23 -+ umulh x14, x26, x23 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a10 * b5 -+ mul x13, x5, x20 -+ umulh x14, x5, x20 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a11 * b4 -+ mul x13, x6, x19 -+ umulh x14, x6, x19 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // c15 is now in x15 -+ -+ // store c14 and c15 -+ stp x16, x15, [x2, #112] -+ -+ // a11 * b5 -+ mul x13, x6, x20 -+ umulh x14, x6, x20 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, xzr, xzr -+ -+ // a10 * b6 -+ mul x13, x5, x23 -+ umulh x14, x5, x23 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a9 * b7 -+ mul x13, x26, x24 -+ umulh x14, x26, x24 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a8 * b8 -+ mul x13, x25, x27 -+ umulh x14, x25, x27 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a7 * b9 -+ mul x13, x22, x28 -+ umulh x14, x22, x28 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a6 * b10 -+ mul x13, x21, x3 -+ umulh x14, x21, x3 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a5 * b11 -+ mul x13, x12, x4 -+ umulh x14, x12, x4 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // c16 is now in x18 -+ -+ // a6 * b11 -+ mul x13, x21, x4 -+ umulh x14, x21, x4 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, xzr, xzr -+ -+ // a7 * b10 -+ mul x13, x22, x3 -+ umulh x14, x22, x3 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a8 * b9 -+ mul x13, x25, x28 -+ umulh x14, x25, x28 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, 
xzr -+ -+ // a9 * b8 -+ mul x13, x26, x27 -+ umulh x14, x26, x27 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a10 * b7 -+ mul x13, x5, x24 -+ umulh x14, x5, x24 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // a11 * b6 -+ mul x13, x6, x23 -+ umulh x14, x6, x23 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // store c16 and c17 -+ stp x18, x17, [x2, #128] -+ -+ // a11 * b7 -+ mul x13, x6, x24 -+ umulh x14, x6, x24 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, xzr, xzr -+ -+ // a10 * b8 -+ mul x13, x5, x27 -+ umulh x14, x5, x27 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a9 * b9 -+ mul x13, x26, x28 -+ umulh x14, x26, x28 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a8 * b10 -+ mul x13, x25, x3 -+ umulh x14, x25, x3 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // a7 * b11 -+ mul x13, x22, x4 -+ umulh x14, x22, x4 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ adcs x18, x18, xzr -+ -+ // c18 is now in x16 -+ -+ // a8 * b11 -+ mul x13, x25, x4 -+ umulh x14, x25, x4 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, xzr, xzr -+ -+ // a9 * b10 -+ mul x13, x26, x3 -+ umulh x14, x26, x3 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a10 * b9 -+ mul x13, x5, x28 -+ umulh x14, x5, x28 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // a11 * b8 -+ mul x13, x6, x27 -+ umulh x14, x6, x27 -+ -+ adds x15, x15, x13 -+ adcs x18, x18, x14 -+ adcs x17, x17, xzr -+ -+ // store c18 and c19 -+ stp x16, x15, [x2, #144] -+ -+ // a11 * b9 -+ mul x13, x6, x28 -+ umulh x14, x6, x28 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, xzr, xzr -+ -+ // a10 * b10 -+ mul x13, x5, x3 -+ umulh x14, x5, x3 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // a9 * b11 -+ mul x13, x26, x4 -+ umulh x14, 
x26, x4 -+ -+ adds x18, x18, x13 -+ adcs x17, x17, x14 -+ adcs x16, x16, xzr -+ -+ // c20 is now in x18 -+ -+ // a10 * b11 -+ mul x13, x5, x4 -+ umulh x14, x5, x4 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, xzr, xzr -+ -+ // a11 * b10 -+ mul x13, x6, x3 -+ umulh x14, x6, x3 -+ -+ adds x17, x17, x13 -+ adcs x16, x16, x14 -+ adcs x15, x15, xzr -+ -+ // store c20 and c21 -+ stp x18, x17, [x2, #160] -+ -+ // a11 * b11 -+ mul x13, x4, x6 -+ umulh x14, x4, x6 -+ -+ adds x16, x16, x13 -+ adcs x15, x15, x14 -+ -+ // store c22 and c23 -+ stp x16, x15, [x2, #176] -+ -+ ldp x19, x20, [sp] -+ ldp x21, x22, [sp, #16] -+ ldp x23, x24, [sp, #32] -+ ldp x25, x26, [sp, #48] -+ ldp x27, x28, [sp, #64] -+ add sp, sp, #80 -+ ret -+ -+ -+//*********************************************************************** -+// Montgomery reduction -+// Based on comba method -+// Operation: mc [x1] = ma [x0] -+// NOTE: ma=mc is not allowed -+//*********************************************************************** -+.global rdc751_asm -+rdc751_asm: -+ // ma is in x0 -+ // mc is in x1 -+ -+ sub sp, sp, #80 -+ stp x19, x20, [sp] -+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ stp x25, x26, [sp, #48] -+ stp x27, x28, [sp, #64] -+ -+ // load the prime values into x14 through x20 -+ ldr x14, p751p1 + 0 -+ ldr x15, p751p1 + 8 -+ ldr x16, p751p1 + 16 -+ ldr x17, p751p1 + 24 -+ ldr x18, p751p1 + 32 -+ ldr x19, p751p1 + 40 -+ ldr x20, p751p1 + 48 -+ -+ // the values mc[0] through mc[11] will be held in x2 through x13 -+ // until the very end when they will be stored -+ -+ // load mc[0] through mc[4] and ma[5] -+ ldp x2, x3, [x0, #0] -+ ldp x4, x5, [x0, #16] -+ ldp x6, x21, [x0, #32] -+ -+ // ma[5] iteration -+ mul x22, x2, x14 -+ umulh x23, x2, x14 -+ adds x24, x22, x21 -+ adcs x25, x23, xzr -+ add x7, x24, xzr // set mc[5] -+ -+ // ma[6] iteration -+ -+ ldr x21, [x0, #48] -+ -+ mul x22, x2, x15 -+ umulh x23, x2, x15 -+ adds x25, x25, x22 -+ adcs x26, x23, xzr -+ -+ mul x22, x3, x14 
-+ umulh x23, x3, x14 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, xzr, xzr -+ -+ adds x25, x25, x21 -+ adcs x26, x26, xzr -+ adcs x24, x24, xzr -+ add x8, x25, xzr // set mc[6] -+ -+ // ma[7] iteration -+ -+ ldr x21, [x0, #56] -+ mul x22, x2, x16 -+ umulh x23, x2, x16 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, xzr, xzr -+ -+ mul x22, x3, x15 -+ umulh x23, x3, x15 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x4, x14 -+ umulh x23, x4, x14 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ adds x26, x26, x21 -+ adcs x24, x24, xzr -+ adcs x25, x25, xzr -+ add x9, x26, xzr // set mc[7] -+ -+ // ma[8] iteration -+ -+ ldr x21, [x0, #64] -+ mul x22, x2, x17 -+ umulh x23, x2, x17 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, xzr, xzr -+ -+ mul x22, x3, x16 -+ umulh x23, x3, x16 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x4, x15 -+ umulh x23, x4, x15 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x5, x14 -+ umulh x23, x5, x14 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ adds x24, x24, x21 -+ adcs x25, x25, xzr -+ adcs x26, x26, xzr -+ add x10, x24, xzr // set mc[8] -+ -+ // ma[9] iteration -+ -+ ldr x21, [x0, #72] -+ mul x22, x2, x18 -+ umulh x23, x2, x18 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, xzr, xzr -+ -+ mul x22, x3, x17 -+ umulh x23, x3, x17 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x4, x16 -+ umulh x23, x4, x16 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x5, x15 -+ umulh x23, x5, x15 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x6, x14 -+ umulh x23, x6, x14 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ adds x25, x25, x21 -+ adcs x26, x26, xzr -+ adcs x24, x24, xzr -+ add x11, x25, xzr // set mc[9] -+ -+ // ma[10] iteration -+ 
-+ ldr x21, [x0, #80] -+ mul x22, x2, x19 -+ umulh x23, x2, x19 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, xzr, xzr -+ -+ mul x22, x3, x18 -+ umulh x23, x3, x18 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x4, x17 -+ umulh x23, x4, x17 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x5, x16 -+ umulh x23, x5, x16 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x6, x15 -+ umulh x23, x6, x15 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x7, x14 -+ umulh x23, x7, x14 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ adds x26, x26, x21 -+ adcs x24, x24, xzr -+ adcs x25, x25, xzr -+ add x12, x26, xzr // set mc[10] -+ -+ // ma[11] iteration -+ ldr x21, [x0, #88] -+ -+ mul x22, x2, x20 -+ umulh x23, x2, x20 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, xzr, xzr -+ -+ mul x22, x3, x19 -+ umulh x23, x3, x19 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x4, x18 -+ umulh x23, x4, x18 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x5, x17 -+ umulh x23, x5, x17 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x6, x16 -+ umulh x23, x6, x16 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x7, x15 -+ umulh x23, x7, x15 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x8, x14 -+ umulh x23, x8, x14 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ adds x24, x24, x21 -+ adcs x25, x25, xzr -+ adcs x26, x26, xzr -+ add x13, x24, xzr // set mc[11] -+ -+ // ma[12] iteration -+ -+ ldr x21, [x0, #96] -+ mul x22, x3, x20 -+ umulh x23, x3, x20 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, xzr, xzr -+ -+ mul x22, x4, x19 -+ umulh x23, x4, x19 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul 
x22, x5, x18 -+ umulh x23, x5, x18 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x6, x17 -+ umulh x23, x6, x17 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x7, x16 -+ umulh x23, x7, x16 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x8, x15 -+ umulh x23, x8, x15 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x9, x14 -+ umulh x23, x9, x14 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ adds x25, x25, x21 -+ adcs x26, x26, xzr -+ adcs x24, x24, xzr -+ add x2, x25, xzr // set mc[0] -+ -+ // ma[13] iteration -+ -+ ldr x21, [x0, #104] -+ mul x22, x4, x20 -+ umulh x23, x4, x20 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, xzr, xzr -+ -+ mul x22, x5, x19 -+ umulh x23, x5, x19 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x6, x18 -+ umulh x23, x6, x18 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x7, x17 -+ umulh x23, x7, x17 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x8, x16 -+ umulh x23, x8, x16 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x9, x15 -+ umulh x23, x9, x15 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x10, x14 -+ umulh x23, x10, x14 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ adds x26, x26, x21 -+ adcs x24, x24, xzr -+ adcs x25, x25, xzr -+ add x3, x26, xzr // set mc[1] -+ -+ // ma[14] iteration -+ -+ ldr x21, [x0, #112] -+ mul x22, x5, x20 -+ umulh x23, x5, x20 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, xzr, xzr -+ -+ mul x22, x6, x19 -+ umulh x23, x6, x19 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x7, x18 -+ umulh x23, x7, x18 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x8, x17 -+ umulh x23, x8, 
x17 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x9, x16 -+ umulh x23, x9, x16 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x10, x15 -+ umulh x23, x10, x15 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x11, x14 -+ umulh x23, x11, x14 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ adds x24, x24, x21 -+ adcs x25, x25, xzr -+ adcs x26, x26, xzr -+ add x4, x24, xzr // set mc[2] -+ -+ // ma[15] iteration -+ -+ ldr x21, [x0, #120] -+ mul x22, x6, x20 -+ umulh x23, x6, x20 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, xzr, xzr -+ -+ mul x22, x7, x19 -+ umulh x23, x7, x19 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x8, x18 -+ umulh x23, x8, x18 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x9, x17 -+ umulh x23, x9, x17 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x10, x16 -+ umulh x23, x10, x16 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x11, x15 -+ umulh x23, x11, x15 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x12, x14 -+ umulh x23, x12, x14 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ adds x25, x25, x21 -+ adcs x26, x26, xzr -+ adcs x24, x24, xzr -+ add x5, x25, xzr // set mc[3] -+ -+ // ma[16] iteration -+ -+ ldr x21, [x0, #128] -+ mul x22, x7, x20 -+ umulh x23, x7, x20 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, xzr, xzr -+ -+ mul x22, x8, x19 -+ umulh x23, x8, x19 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x9, x18 -+ umulh x23, x9, x18 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x10, x17 -+ umulh x23, x10, x17 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x11, x16 -+ umulh x23, x11, x16 -+ adds x26, 
x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x12, x15 -+ umulh x23, x12, x15 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x13, x14 -+ umulh x23, x13, x14 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ adds x26, x26, x21 -+ adcs x24, x24, xzr -+ adcs x25, x25, xzr -+ add x6, x26, xzr // set mc[4] -+ -+ // ma[17] iteration -+ -+ ldr x21, [x0, #136] -+ mul x22, x8, x20 -+ umulh x23, x8, x20 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, xzr, xzr -+ -+ mul x22, x9, x19 -+ umulh x23, x9, x19 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x10, x18 -+ umulh x23, x10, x18 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x11, x17 -+ umulh x23, x11, x17 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x12, x16 -+ umulh x23, x12, x16 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x13, x15 -+ umulh x23, x13, x15 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ adds x24, x24, x21 -+ adcs x25, x25, xzr -+ adcs x26, x26, xzr -+ add x7, x24, xzr // set mc[5] -+ -+ // ma[18] iteration -+ -+ ldr x21, [x0, #144] -+ mul x22, x9, x20 -+ umulh x23, x9, x20 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, xzr, xzr -+ -+ mul x22, x10, x19 -+ umulh x23, x10, x19 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x11, x18 -+ umulh x23, x11, x18 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x12, x17 -+ umulh x23, x12, x17 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ mul x22, x13, x16 -+ umulh x23, x13, x16 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ adds x25, x25, x21 -+ adcs x26, x26, xzr -+ adcs x24, x24, xzr -+ add x8, x25, xzr // set mc[6] -+ -+ // ma[19] iteration -+ -+ ldr x21, [x0, #152] -+ mul x22, x10, x20 -+ 
umulh x23, x10, x20 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, xzr, xzr -+ -+ mul x22, x11, x19 -+ umulh x23, x11, x19 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x12, x18 -+ umulh x23, x12, x18 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ mul x22, x13, x17 -+ umulh x23, x13, x17 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adcs x25, x25, xzr -+ -+ adds x26, x26, x21 -+ adcs x24, x24, xzr -+ adcs x25, x25, xzr -+ add x9, x26, xzr // set mc[7] -+ -+ // ma[20] iteration -+ ldr x21, [x0, #160] -+ -+ mul x22, x11, x20 -+ umulh x23, x11, x20 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, xzr, xzr -+ -+ mul x22, x12, x19 -+ umulh x23, x12, x19 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ mul x22, x13, x18 -+ umulh x23, x13, x18 -+ adds x24, x24, x22 -+ adcs x25, x25, x23 -+ adcs x26, x26, xzr -+ -+ adds x24, x24, x21 -+ adcs x25, x25, xzr -+ adcs x26, x26, xzr -+ add x10, x24, xzr // set mc[8] -+ -+ // ma[21] iteration -+ -+ ldr x21, [x0, #168] -+ mul x22, x12, x20 -+ umulh x23, x12, x20 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, xzr, xzr -+ -+ mul x22, x13, x19 -+ umulh x23, x13, x19 -+ adds x25, x25, x22 -+ adcs x26, x26, x23 -+ adcs x24, x24, xzr -+ -+ adds x25, x25, x21 -+ adcs x26, x26, xzr -+ adcs x24, x24, xzr -+ add x11, x25, xzr // set mc[9] -+ -+ // ma[22] iteration -+ -+ ldr x21, [x0, #176] -+ mul x22, x13, x20 -+ umulh x23, x13, x20 -+ adds x26, x26, x22 -+ adcs x24, x24, x23 -+ adds x26, x26, x21 -+ -+ ldr x21, [x0, #184] -+ adcs x24, x24, x21 -+ add x12, x26, xzr // set mc[10] -+ add x13, x24, xzr // set mc[11] -+ -+ stp x2, x3, [x1, #0] -+ stp x4, x5, [x1, #16] -+ stp x6, x7, [x1, #32] -+ stp x8, x9, [x1, #48] -+ stp x10, x11, [x1, #64] -+ stp x12, x13, [x1, #80] -+ -+ ldp x19, x20, [sp] -+ ldp x21, x22, [sp, #16] -+ ldp x23, x24, [sp, #32] -+ ldp x25, x26, [sp, #48] -+ ldp x27, x28, [sp, #64] -+ add sp, sp, #80 -+ ret -+ -+ 
-+//*********************************************************************** -+// 751-bit multiprecision addition -+// Operation: c [x2] = a [x0] + b [x1] -+//*********************************************************************** -+.global mp_add751_asm -+mp_add751_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x11, x12, [x0,#64] -+ ldp x13, x14, [x0,#80] -+ -+ ldp x15, x16, [x1,#0] -+ ldp x17, x18, [x1,#16] -+ adds x3, x3, x15 -+ adcs x4, x4, x16 -+ adcs x5, x5, x17 -+ adcs x6, x6, x18 -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adcs x10, x10, x18 -+ ldp x15, x16, [x1,#64] -+ ldp x17, x18, [x1,#80] -+ adcs x11, x11, x15 -+ adcs x12, x12, x16 -+ adcs x13, x13, x17 -+ adc x14, x14, x18 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ ret -+ -+ -+//*********************************************************************** -+// 2x751-bit multiprecision addition -+// Operation: c [x2] = a [x0] + b [x1] -+//*********************************************************************** -+.global mp_add751x2_asm -+mp_add751x2_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x11, x12, [x0,#64] -+ ldp x13, x14, [x0,#80] -+ -+ ldp x15, x16, [x1,#0] -+ ldp x17, x18, [x1,#16] -+ adds x3, x3, x15 -+ adcs x4, x4, x16 -+ adcs x5, x5, x17 -+ adcs x6, x6, x18 -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adcs x10, x10, x18 -+ ldp x15, x16, [x1,#64] -+ ldp x17, x18, [x1,#80] -+ adcs x11, x11, x15 -+ adcs x12, x12, x16 -+ adcs x13, x13, x17 -+ adcs x14, x14, x18 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ -+ ldp x3, x4, 
[x0,#96] -+ ldp x5, x6, [x0,#112] -+ ldp x7, x8, [x0,#128] -+ ldp x9, x10, [x0,#144] -+ ldp x11, x12, [x0,#160] -+ ldp x13, x14, [x0,#176] -+ -+ ldp x15, x16, [x1,#96] -+ ldp x17, x18, [x1,#112] -+ adcs x3, x3, x15 -+ adcs x4, x4, x16 -+ adcs x5, x5, x17 -+ adcs x6, x6, x18 -+ ldp x15, x16, [x1,#128] -+ ldp x17, x18, [x1,#144] -+ adcs x7, x7, x15 -+ adcs x8, x8, x16 -+ adcs x9, x9, x17 -+ adcs x10, x10, x18 -+ ldp x15, x16, [x1,#160] -+ ldp x17, x18, [x1,#176] -+ adcs x11, x11, x15 -+ adcs x12, x12, x16 -+ adcs x13, x13, x17 -+ adc x14, x14, x18 -+ -+ stp x3, x4, [x2,#96] -+ stp x5, x6, [x2,#112] -+ stp x7, x8, [x2,#128] -+ stp x9, x10, [x2,#144] -+ stp x11, x12, [x2,#160] -+ stp x13, x14, [x2,#176] -+ ret -+ -+ -+//*********************************************************************** -+// 2x751-bit multiprecision subtraction -+// Operation: c [x2] = a [x0] - b [x1]. Returns borrow mask -+//*********************************************************************** -+.global mp_sub751x2_asm -+mp_sub751x2_asm: -+ ldp x3, x4, [x0,#0] -+ ldp x5, x6, [x0,#16] -+ ldp x7, x8, [x0,#32] -+ ldp x9, x10, [x0,#48] -+ ldp x11, x12, [x0,#64] -+ ldp x13, x14, [x0,#80] -+ -+ ldp x15, x16, [x1,#0] -+ ldp x17, x18, [x1,#16] -+ subs x3, x3, x15 -+ sbcs x4, x4, x16 -+ sbcs x5, x5, x17 -+ sbcs x6, x6, x18 -+ ldp x15, x16, [x1,#32] -+ ldp x17, x18, [x1,#48] -+ sbcs x7, x7, x15 -+ sbcs x8, x8, x16 -+ sbcs x9, x9, x17 -+ sbcs x10, x10, x18 -+ ldp x15, x16, [x1,#64] -+ ldp x17, x18, [x1,#80] -+ sbcs x11, x11, x15 -+ sbcs x12, x12, x16 -+ sbcs x13, x13, x17 -+ sbcs x14, x14, x18 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ -+ ldp x3, x4, [x0,#96] -+ ldp x5, x6, [x0,#112] -+ ldp x7, x8, [x0,#128] -+ ldp x9, x10, [x0,#144] -+ ldp x11, x12, [x0,#160] -+ ldp x13, x14, [x0,#176] -+ -+ ldp x15, x16, [x1,#96] -+ ldp x17, x18, [x1,#112] -+ sbcs x3, x3, x15 -+ sbcs x4, x4, x16 -+ sbcs x5, x5, 
x17 -+ sbcs x6, x6, x18 -+ ldp x15, x16, [x1,#128] -+ ldp x17, x18, [x1,#144] -+ sbcs x7, x7, x15 -+ sbcs x8, x8, x16 -+ sbcs x9, x9, x17 -+ sbcs x10, x10, x18 -+ ldp x15, x16, [x1,#160] -+ ldp x17, x18, [x1,#176] -+ sbcs x11, x11, x15 -+ sbcs x12, x12, x16 -+ sbcs x13, x13, x17 -+ sbcs x14, x14, x18 -+ sbc x0, xzr, xzr -+ -+ stp x3, x4, [x2,#96] -+ stp x5, x6, [x2,#112] -+ stp x7, x8, [x2,#128] -+ stp x9, x10, [x2,#144] -+ stp x11, x12, [x2,#160] -+ stp x13, x14, [x2,#176] -+ ret -+ -+ -+//*********************************************************************** -+// Double 2x751-bit multiprecision subtraction -+// Operation: c [x2] = c [x2] - a [x0] - b [x1] -+//*********************************************************************** -+.global mp_dblsub751x2_asm -+mp_dblsub751x2_asm: -+ sub sp, sp, #96 -+ stp x19, x20, [sp] -+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ stp x25, x26, [sp, #48] -+ stp x27, x28, [sp, #64] -+ stp x29, x30, [sp, #80] -+ ldp x3, x4, [x2,#0] -+ ldp x5, x6, [x2,#16] -+ ldp x7, x8, [x2,#32] -+ ldp x9, x10, [x2,#48] -+ ldp x11, x12, [x2,#64] -+ ldp x13, x14, [x2,#80] -+ ldp x15, x16, [x2,#96] -+ ldp x17, x18, [x2,#112] -+ ldp x19, x20, [x2,#128] -+ ldp x21, x22, [x2,#144] -+ ldp x23, x24, [x2,#160] -+ ldp x25, x26, [x2,#176] -+ -+ ldp x27, x28, [x0,#0] -+ ldp x29, x30, [x0,#16] -+ subs x3, x3, x27 -+ sbcs x4, x4, x28 -+ sbcs x5, x5, x29 -+ sbcs x6, x6, x30 -+ ldp x27, x28, [x0,#32] -+ ldp x29, x30, [x0,#48] -+ sbcs x7, x7, x27 -+ sbcs x8, x8, x28 -+ sbcs x9, x9, x29 -+ sbcs x10, x10, x30 -+ ldp x27, x28, [x0,#64] -+ ldp x29, x30, [x0,#80] -+ sbcs x11, x11, x27 -+ sbcs x12, x12, x28 -+ sbcs x13, x13, x29 -+ sbcs x14, x14, x30 -+ ldp x27, x28, [x0,#96] -+ ldp x29, x30, [x0,#112] -+ sbcs x15, x15, x27 -+ sbcs x16, x16, x28 -+ sbcs x17, x17, x29 -+ sbcs x18, x18, x30 -+ ldp x27, x28, [x0,#128] -+ ldp x29, x30, [x0,#144] -+ sbcs x19, x19, x27 -+ sbcs x20, x20, x28 -+ sbcs x21, x21, x29 -+ sbcs x22, x22, x30 -+ ldp x27, x28, [x0,#160] -+ 
ldp x29, x30, [x0,#176] -+ sbcs x23, x23, x27 -+ sbcs x24, x24, x28 -+ sbcs x25, x25, x29 -+ sbc x26, x26, x30 -+ -+ ldp x27, x28, [x1,#0] -+ ldp x29, x30, [x1,#16] -+ subs x3, x3, x27 -+ sbcs x4, x4, x28 -+ sbcs x5, x5, x29 -+ sbcs x6, x6, x30 -+ ldp x27, x28, [x1,#32] -+ ldp x29, x30, [x1,#48] -+ sbcs x7, x7, x27 -+ sbcs x8, x8, x28 -+ sbcs x9, x9, x29 -+ sbcs x10, x10, x30 -+ ldp x27, x28, [x1,#64] -+ ldp x29, x30, [x1,#80] -+ sbcs x11, x11, x27 -+ sbcs x12, x12, x28 -+ sbcs x13, x13, x29 -+ sbcs x14, x14, x30 -+ ldp x27, x28, [x1,#96] -+ ldp x29, x30, [x1,#112] -+ sbcs x15, x15, x27 -+ sbcs x16, x16, x28 -+ sbcs x17, x17, x29 -+ sbcs x18, x18, x30 -+ ldp x27, x28, [x1,#128] -+ ldp x29, x30, [x1,#144] -+ sbcs x19, x19, x27 -+ sbcs x20, x20, x28 -+ sbcs x21, x21, x29 -+ sbcs x22, x22, x30 -+ ldp x27, x28, [x1,#160] -+ ldp x29, x30, [x1,#176] -+ sbcs x23, x23, x27 -+ sbcs x24, x24, x28 -+ sbcs x25, x25, x29 -+ sbc x26, x26, x30 -+ -+ stp x3, x4, [x2,#0] -+ stp x5, x6, [x2,#16] -+ stp x7, x8, [x2,#32] -+ stp x9, x10, [x2,#48] -+ stp x11, x12, [x2,#64] -+ stp x13, x14, [x2,#80] -+ stp x15, x16, [x2,#96] -+ stp x17, x18, [x2,#112] -+ stp x19, x20, [x2,#128] -+ stp x21, x22, [x2,#144] -+ stp x23, x24, [x2,#160] -+ stp x25, x26, [x2,#176] -+ -+ ldp x19, x20, [sp] -+ ldp x21, x22, [sp, #16] -+ ldp x23, x24, [sp, #32] -+ ldp x25, x26, [sp, #48] -+ ldp x27, x28, [sp, #64] -+ ldp x29, x30, [sp, #80] -+ add sp, sp, #96 -+ ret -diff --git a/third_party/sidh/src/P751/P751.c b/third_party/sidh/src/P751/P751.c -new file mode 100644 -index 00000000..ea7bcb78 ---- /dev/null -+++ b/third_party/sidh/src/P751/P751.c -@@ -0,0 +1,131 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: supersingular isogeny parameters and generation of functions for P751 -+*********************************************************************************************/ -+ 
-+#include "P751_api.h" -+#include "P751_internal.h" -+ -+ -+// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: -+// -------------------------------------------------------------------------------------------------- -+// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). -+// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. -+// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. -+// Internally, the number of digits used to represent all these elements is obtained by rounding the number of bits up to the next multiple of 32. -+// For example, a 751-bit field element is represented with Ceil(751 / 64) = 12 64-bit digits or Ceil(751 / 32) = 24 32-bit digits. -+ -+// -+// Curve isogeny system "SIDHp751". 
Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p751^2), where A=0, B=1, C=1 and p751 = 2^372*3^239-1 -+// -+ -+const uint64_t p751[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEEAFFFFFFFFFFFFF, -+ 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; -+const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000, -+ 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; -+const uint64_t p751x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF, -+ 0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, 0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38 }; -+// Order of Alice's subgroup -+const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; -+// Order of Bob's subgroup -+const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC968549F878A8EEB, 0x59B1A13F7CC76E3E, 0xE9867D6EBE876DA9, 0x2B5045CB25748084, 0x2909F97BADC66856, 0x06FE5D541F71C0E1 }; -+// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p751^2), expressed in Montgomery representation -+const uint64_t A_gen[5 * NWORDS64_FIELD] = { 0xC2FC08CEAB50AD8B, 0x1D7D710F55E457B1, 0xE8738D92953DCD6E, 0xBAA7EBEE8A3418AA, 0xC9A288345F03F46F, 0xC8D18D167CFE2616, -+ 0x02043761F6B1C045, 0xAA1975E13180E7E9, 0x9E13D3FDC6690DE6, 0x3A024640A3A3BB4F, 0x4E5AD44E6ACBBDAE, 0x0000544BEB561DAD, // XPA0 -+ 0xE6CC41D21582E411, 0x07C2ECB7C5DF400A, 0xE8E34B521432AEC4, 0x50761E2AB085167D, 0x032CFBCAA6094B3C, 0x6C522F5FDF9DDD71, -+ 0x1319217DC3A1887D, 0xDC4FB25803353A86, 
0x362C8D7B63A6AB09, 0x39DCDFBCE47EA488, 0x4C27C99A2C28D409, 0x00003CB0075527C4, // XPA1 -+ 0xD56FE52627914862, 0x1FAD60DC96B5BAEA, 0x01E137D0BF07AB91, 0x404D3E9252161964, 0x3C5385E4CD09A337, 0x4476426769E4AF73, -+ 0x9790C6DB989DFE33, 0xE06E1C04D2AA8B5E, 0x38C08185EDEA73B9, 0xAA41F678A4396CA6, 0x92B9259B2229E9A0, 0x00002F9326818BE0, // XQA0 -+ 0x0BB84441DFFD19B3, 0x84B4DEA99B48C18E, 0x692DE648AD313805, 0xE6D72761B6DFAEE0, 0x223975C672C3058D, 0xA0FDE0C3CBA26FDC, -+ 0xA5326132A922A3CA, 0xCA5E7F5D5EA96FA4, 0x127C7EFE33FFA8C6, 0x4749B1567E2A23C4, 0x2B7DF5B4AF413BFA, 0x0000656595B9623C, // XRA0 -+ 0xED78C17F1EC71BE8, 0xF824D6DF753859B1, 0x33A10839B2A8529F, 0xFC03E9E25FDEA796, 0xC4708A8054DF1762, 0x4034F2EC034C6467, -+ 0xABFB70FBF06ECC79, 0xDABE96636EC108B7, 0x49CBCFB090605FD3, 0x20B89711819A45A7, 0xFB8E1590B2B0F63E, 0x0000556A5F964AB2 }; // XRA1 -+// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p751^2), expressed in Montgomery representation -+const uint64_t B_gen[5 * NWORDS64_FIELD] = { 0xCFB6D71EF867AB0B, 0x4A5FDD76E9A45C76, 0x38B1EE69194B1F03, 0xF6E7B18A7761F3F0, 0xFCF01A486A52C84C, 0xCBE2F63F5AA75466, -+ 0x6487BCE837B5E4D6, 0x7747F5A8C622E9B8, 0x4CBFE1E4EE6AEBBA, 0x8A8616A13FA91512, 0x53DB980E1579E0A5, 0x000058FEBFF3BE69, // XPB0 -+ 0xA492034E7C075CC3, 0x677BAF00B04AA430, 0x3AAE0C9A755C94C8, 0x1DC4B064E9EBB08B, 0x3684EDD04E826C66, 0x9BAA6CB661F01B22, -+ 0x20285A00AD2EFE35, 0xDCE95ABD0497065F, 0x16C7FBB3778E3794, 0x26B3AC29CEF25AAF, 0xFB3C28A31A30AC1D, 0x000046ED190624EE, // XPB1 -+ 0xF1A8C9ED7B96C4AB, 0x299429DA5178486E, 0xEF4926F20CD5C2F4, 0x683B2E2858B4716A, 0xDDA2FBCC3CAC3EEB, 0xEC055F9F3A600460, -+ 0xD5A5A17A58C3848B, 0x4652D836F42EAED5, 0x2F2E71ED78B3A3B3, 0xA771C057180ADD1D, 0xC780A5D2D835F512, 0x0000114EA3B55AC1, // XQB0 -+ 0x1C0D6733769D0F31, 0xF084C3086E2659D1, 0xE23D5DA27BCBD133, 0xF38EC9A8D5864025, 0x6426DC781B3B645B, 0x4B24E8E3C9FB03EE, -+ 0x6432792F9D2CEA30, 0x7CC8E8B1AE76E857, 0x7F32BFB626BB8963, 0xB9F05995B48D7B74, 
0x4D71200A7D67E042, 0x0000228457AF0637, // XRB0 -+ 0x4AE37E7D8F72BD95, 0xDD2D504B3E993488, 0x5D14E7FA1ECB3C3E, 0x127610CEB75D6350, 0x255B4B4CAC446B11, 0x9EA12336C1F70CAF, -+ 0x79FA68A2147BC2F8, 0x11E895CFDADBBC49, 0xE4B9D3C4D6356C18, 0x44B25856A67F951C, 0x5851541F61308D0B, 0x00002FFD994F7E4C }; // XRB1 -+// Montgomery constant Montgomery_R2 = (2^768)^2 mod p751 -+const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x233046449DAD4058, 0xDB010161A696452A, 0x5E36941472E3FD8E, 0xF40BFE2082A2E706, 0x4932CCA8904F8751 ,0x1F735F1F1EE7FC81, -+ 0xA24F4D80C1048E18, 0xB56C383CCDB607C5, 0x441DD47B735F9C90, 0x5673ED2C6A6AC82A, 0x06C905261132294B, 0x000041AD830F1F35 }; -+// Value one in Montgomery representation -+const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000249ad, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8310000000000000, -+ 0x5527b1e4375c6c66, 0x697797bf3f4f24d0, 0xc89db7b2ac5c4e2e, 0x4ca4b439d2076956, 0x10f7926c7512c7e9, 0x00002d5b24bce5e2 }; -+// Value (2^384)^2 mod 3^239 -+const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x1A55482318541298, 0x070A6370DFA12A03, 0xCB1658E0E3823A40, 0xB3B7384EB5DEF3F9, 0xCBCA952F7006EA33, 0x00569EF8EC94864C }; -+// Value -(3^239)^-1 mod 2^384 -+const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x48062A91D3AB563D, 0x6CE572751303C2F5, 0x5D1319F3F160EC9D, 0xE35554E8C2D5623A, 0xCA29300232BC79A5, 0x8AAD843D646D78C5 }; -+// Value order_Bob/3 mod p751 -+const uint64_t Border_div3[NWORDS_ORDER] = { 0xEDCD718A828384F9, 0x733B35BFD4427A14, 0xF88229CF94D7CF38, 0x63C56C990C7C2AD6, 0xB858A87E8F4222C7, 0x0254C9C6B525EAF5 }; -+ -+ -+// Fixed parameters for isogeny tree computation -+const unsigned int strat_Alice[MAX_Alice-1] = { -+80, 48, 27, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, -+1, 3, 2, 1, 1, 1, 1, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, -+1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, -+1, 1, 1, 2, 1, 1, 1, 
9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, -+33, 20, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, -+1, 1, 8, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, -+1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 }; -+ -+const unsigned int strat_Bob[MAX_Bob-1] = { -+112, 63, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, -+1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, -+1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 31, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, -+1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, -+2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 49, 31, 16, 8, 4, 2, -+1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, -+15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, -+1, 1, 1, 21, 12, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 3, 2, 1, 1, 1, 1, -+2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 }; -+ -+// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions -+#define fpcopy fpcopy751 -+#define fpzero fpzero751 -+#define fpadd fpadd751 -+#define fpsub fpsub751 -+#define fpneg fpneg751 -+#define fpdiv2 fpdiv2_751 -+#define fpcorrection fpcorrection751 -+#define fpmul_mont fpmul751_mont -+#define fpsqr_mont fpsqr751_mont -+#define fpinv_mont fpinv751_mont -+#define fpinv_chain_mont fpinv751_chain_mont -+#define fpinv_mont_bingcd fpinv751_mont_bingcd -+#define fp2copy fp2copy751 -+#define fp2zero fp2zero751 -+#define fp2add fp2add751 -+#define fp2sub fp2sub751 -+#define fp2neg fp2neg751 -+#define fp2div2 fp2div2_751 -+#define fp2correction fp2correction751 -+#define fp2mul_mont fp2mul751_mont -+#define fp2sqr_mont fp2sqr751_mont -+#define fp2inv_mont fp2inv751_mont -+#define fp2inv_mont_bingcd fp2inv751_mont_bingcd -+#define 
fpequal_non_constant_time fpequal751_non_constant_time -+#define mp_add_asm mp_add751_asm -+#define mp_subx2_asm mp_sub751x2_asm -+#define mp_dblsubx2_asm mp_dblsub751x2_asm -+#define crypto_kem_keypair crypto_kem_keypair_SIKEp751 -+#define crypto_kem_enc crypto_kem_enc_SIKEp751 -+#define crypto_kem_dec crypto_kem_dec_SIKEp751 -+#define random_mod_order_A random_mod_order_A_SIDHp751 -+#define random_mod_order_B random_mod_order_B_SIDHp751 -+#define EphemeralKeyGeneration_A EphemeralKeyGeneration_A_SIDHp751 -+#define EphemeralKeyGeneration_B EphemeralKeyGeneration_B_SIDHp751 -+#define EphemeralSecretAgreement_A EphemeralSecretAgreement_A_SIDHp751 -+#define EphemeralSecretAgreement_B EphemeralSecretAgreement_B_SIDHp751 -+ -+#include "../fpx.c" -+#include "../ec_isogeny.c" -+#include "../sidh.c" -+#include "../sike.c" -diff --git a/third_party/sidh/src/P751/P751_api.h b/third_party/sidh/src/P751/P751_api.h -new file mode 100644 -index 00000000..269decda ---- /dev/null -+++ b/third_party/sidh/src/P751/P751_api.h -@@ -0,0 +1,107 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: API header file for P751 -+*********************************************************************************************/ -+ -+#ifndef __P751_API_H__ -+#define __P751_API_H__ -+ -+ -+/*********************** Key encapsulation mechanism API ***********************/ -+ -+#define CRYPTO_SECRETKEYBYTES 644 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes -+#define CRYPTO_PUBLICKEYBYTES 564 -+#define CRYPTO_BYTES 24 -+#define CRYPTO_CIPHERTEXTBYTES 596 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes -+ -+// Algorithm name -+#define CRYPTO_ALGNAME "SIKEp751" -+ -+// SIKE's key generation -+// It produces a private key sk and computes the public key pk. 
-+// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes) -+// public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes) -+int crypto_kem_keypair_SIKEp751(unsigned char *pk, unsigned char *sk); -+ -+// SIKE's encapsulation -+// Input: public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes) -+// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes) -+// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes) -+int crypto_kem_enc_SIKEp751(unsigned char *ct, unsigned char *ss, const unsigned char *pk); -+ -+// SIKE's decapsulation -+// Input: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes) -+// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes) -+// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes) -+int crypto_kem_dec_SIKEp751(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); -+ -+ -+// Encoding of keys for KEM-based isogeny system "SIKEp751" (wire format): -+// ---------------------------------------------------------------------- -+// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). -+// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. -+// -+// Private keys sk consist of the concatenation of a 32-byte random value, a value in the range [0, 2^378-1] and the public key pk. In the SIKE API, -+// private keys are encoded in 644 octets in little endian format. -+// Public keys pk consist of 3 elements in GF(p751^2). In the SIKE API, pk is encoded in 564 octets. -+// Ciphertexts ct consist of the concatenation of a public key value and a 32-byte value. In the SIKE API, ct is encoded in 564 + 32 = 596 octets. -+// Shared keys ss consist of a value of 24 octets. 
-+ -+ -+/*********************** Ephemeral key exchange API ***********************/ -+ -+#define SIDH_SECRETKEYBYTES 48 -+#define SIDH_PUBLICKEYBYTES 564 -+#define SIDH_BYTES 188 -+ -+// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. -+// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. -+// Extended version available at: http://eprint.iacr.org/2016/859 -+ -+// Generation of Alice's secret key -+// Outputs random value in [0, 2^372 - 1] to be used as Alice's private key -+void random_mod_order_A_SIDHp751(unsigned char* random_digits); -+ -+// Generation of Bob's secret key -+// Outputs random value in [0, 2^Floor(Log(2,3^239)) - 1] to be used as Bob's private key -+void random_mod_order_B_SIDHp751(unsigned char* random_digits); -+ -+// Alice's ephemeral public key generation -+// Input: a private key PrivateKeyA in the range [0, 2^372 - 1], stored in 47 bytes. -+// Output: the public key PublicKeyA consisting of 3 GF(p751^2) elements encoded in 564 bytes. -+int EphemeralKeyGeneration_A_SIDHp751(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); -+ -+// Bob's ephemeral key-pair generation -+// It produces a private key PrivateKeyB and computes the public key PublicKeyB. -+// The private key is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes. -+// The public key consists of 3 GF(p751^2) elements encoded in 564 bytes. -+int EphemeralKeyGeneration_B_SIDHp751(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); -+ -+// Alice's ephemeral shared secret computation -+// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB -+// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^372 - 1], stored in 47 bytes. -+// Bob's PublicKeyB consists of 3 GF(p751^2) elements encoded in 564 bytes. 
-+// Output: a shared secret SharedSecretA that consists of one element in GF(p751^2) encoded in 188 bytes. -+int EphemeralSecretAgreement_A_SIDHp751(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); -+ -+// Bob's ephemeral shared secret computation -+// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA -+// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes. -+// Alice's PublicKeyA consists of 3 GF(p751^2) elements encoded in 564 bytes. -+// Output: a shared secret SharedSecretB that consists of one element in GF(p751^2) encoded in 188 bytes. -+int EphemeralSecretAgreement_B_SIDHp751(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); -+ -+ -+// Encoding of keys for KEX-based isogeny system "SIDHp751" (wire format): -+// ---------------------------------------------------------------------- -+// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). -+// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. -+// -+// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^372-1] and [0, 2^378-1], resp. In the SIDH API, private keys are encoded -+// in 48 octets in little endian format. -+// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p751^2). In the SIDH API, they are encoded in 564 octets. -+// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p751^2). In the SIDH API, they are encoded in 188 octets. 
-+ -+ -+#endif -\ No newline at end of file -diff --git a/third_party/sidh/src/P751/P751_internal.h b/third_party/sidh/src/P751/P751_internal.h -new file mode 100644 -index 00000000..ffa52530 ---- /dev/null -+++ b/third_party/sidh/src/P751/P751_internal.h -@@ -0,0 +1,245 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: internal header file for P751 -+*********************************************************************************************/ -+ -+#ifndef __P751_INTERNAL_H__ -+#define __P751_INTERNAL_H__ -+ -+#include "../config.h" -+ -+ -+#if (TARGET == TARGET_AMD64) -+ #define NWORDS_FIELD 12 // Number of words of a 751-bit field element -+ #define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1 -+#elif (TARGET == TARGET_x86) -+ #define NWORDS_FIELD 24 -+ #define p751_ZERO_WORDS 11 -+#elif (TARGET == TARGET_ARM) -+ #define NWORDS_FIELD 24 -+ #define p751_ZERO_WORDS 11 -+#elif (TARGET == TARGET_ARM64) -+ #define NWORDS_FIELD 12 -+ #define p751_ZERO_WORDS 5 -+#endif -+ -+ -+// Basic constants -+ -+#define NBITS_FIELD 751 -+#define MAXBITS_FIELD 768 -+#define MAXWORDS_FIELD ((MAXBITS_FIELD+RADIX-1)/RADIX) // Max. number of words to represent field elements -+#define NWORDS64_FIELD ((NBITS_FIELD+63)/64) // Number of 64-bit words of a 751-bit field element -+#define NBITS_ORDER 384 -+#define NWORDS_ORDER ((NBITS_ORDER+RADIX-1)/RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. -+#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) // Number of 64-bit words of a 384-bit element -+#define MAXBITS_ORDER NBITS_ORDER -+#define MAXWORDS_ORDER ((MAXBITS_ORDER+RADIX-1)/RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB]. 
-+#define ALICE 0 -+#define BOB 1 -+#define OALICE_BITS 372 -+#define OBOB_BITS 379 -+#define OBOB_EXPON 239 -+#define MASK_ALICE 0x0F -+#define MASK_BOB 0x03 -+#define PRIME p751 -+#define PARAM_A 0 -+#define PARAM_C 1 -+// Fixed parameters for isogeny tree computation -+#define MAX_INT_POINTS_ALICE 8 -+#define MAX_INT_POINTS_BOB 10 -+#define MAX_Alice 186 -+#define MAX_Bob 239 -+#define MSG_BYTES 32 -+#define SECRETKEY_A_BYTES (OALICE_BITS + 7) / 8 -+#define SECRETKEY_B_BYTES (OBOB_BITS + 7) / 8 -+#define FP2_ENCODED_BYTES 2*((NBITS_FIELD + 7) / 8) -+ -+// SIDH's basic element definitions and point representations -+ -+typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 751-bit field elements (768-bit max.) -+typedef digit_t dfelm_t[2*NWORDS_FIELD]; // Datatype for representing double-precision 2x751-bit field elements (2x768-bit max.) -+typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements GF(p751^2) -+ -+typedef struct { f2elm_t X; f2elm_t Z; } point_proj; // Point representation in projective XZ Montgomery coordinates. -+typedef point_proj point_proj_t[1]; -+ -+ -+ -+/**************** Function prototypes ****************/ -+/************* Multiprecision functions **************/ -+ -+// Copy wordsize digits, c = a, where lng(a) = nwords -+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords); -+ -+// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit -+unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -+ -+// 751-bit multiprecision addition, c = a+b -+void mp_add751(const digit_t* a, const digit_t* b, digit_t* c); -+void mp_add751_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. 
Returns the borrow bit -+unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -+digit_t mp_sub751x2_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Double 2x751-bit multiprecision subtraction, c = c-a-b, where c > a and c > b -+void mp_dblsub751x2_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Multiprecision left shift -+void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords); -+ -+// Multiprecision right shift by one -+void mp_shiftr1(digit_t* x, const unsigned int nwords); -+ -+// Multiprecision left right shift by one -+void mp_shiftl1(digit_t* x, const unsigned int nwords); -+ -+// Digit multiplication, digit * digit -> 2-digit result -+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c); -+ -+// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -+ -+/************ Field arithmetic functions *************/ -+ -+// Copy of a field element, c = a -+void fpcopy751(const digit_t* a, digit_t* c); -+ -+// Zeroing a field element, a = 0 -+void fpzero751(digit_t* a); -+ -+// Non constant-time comparison of two field elements. If a = b return TRUE, otherwise, return FALSE -+bool fpequal751_non_constant_time(const digit_t* a, const digit_t* b); -+ -+// Modular addition, c = a+b mod p751 -+extern void fpadd751(const digit_t* a, const digit_t* b, digit_t* c); -+extern void fpadd751_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Modular subtraction, c = a-b mod p751 -+extern void fpsub751(const digit_t* a, const digit_t* b, digit_t* c); -+extern void fpsub751_asm(const digit_t* a, const digit_t* b, digit_t* c); -+ -+// Modular negation, a = -a mod p751 -+extern void fpneg751(digit_t* a); -+ -+// Modular division by two, c = a/2 mod p751. 
-+void fpdiv2_751(const digit_t* a, digit_t* c); -+ -+// Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. -+void fpcorrection751(digit_t* a); -+ -+// 751-bit Montgomery reduction, c = a mod p -+void rdc_mont(const digit_t* a, digit_t* c); -+ -+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768 -+void fpmul751_mont(const digit_t* a, const digit_t* b, digit_t* c); -+void mul751_asm(const digit_t* a, const digit_t* b, digit_t* c); -+void rdc751_asm(const digit_t* ma, digit_t* mc); -+ -+// Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768 -+void fpsqr751_mont(const digit_t* ma, digit_t* mc); -+ -+// Conversion to Montgomery representation -+void to_mont(const digit_t* a, digit_t* mc); -+ -+// Conversion from Montgomery representation to standard representation -+void from_mont(const digit_t* ma, digit_t* c); -+ -+// Field inversion, a = a^-1 in GF(p751) -+void fpinv751_mont(digit_t* a); -+ -+// Field inversion, a = a^-1 in GF(p751) using the binary GCD -+void fpinv751_mont_bingcd(digit_t* a); -+ -+// Chain to compute (p751-3)/4 using Montgomery arithmetic -+void fpinv751_chain_mont(digit_t* a); -+ -+/************ GF(p^2) arithmetic functions *************/ -+ -+// Copy of a GF(p751^2) element, c = a -+void fp2copy751(const f2elm_t a, f2elm_t c); -+ -+// Zeroing a GF(p751^2) element, a = 0 -+void fp2zero751(f2elm_t a); -+ -+// GF(p751^2) negation, a = -a in GF(p751^2) -+void fp2neg751(f2elm_t a); -+ -+// GF(p751^2) addition, c = a+b in GF(p751^2) -+extern void fp2add751(const f2elm_t a, const f2elm_t b, f2elm_t c); -+ -+// GF(p751^2) subtraction, c = a-b in GF(p751^2) -+extern void fp2sub751(const f2elm_t a, const f2elm_t b, f2elm_t c); -+ -+// GF(p751^2) division by two, c = a/2 in GF(p751^2) -+void fp2div2_751(const f2elm_t a, f2elm_t c); -+ -+// Modular correction, a = a in GF(p751^2) -+void fp2correction751(f2elm_t a); -+ -+// GF(p751^2) squaring using Montgomery 
arithmetic, c = a^2 in GF(p751^2) -+void fp2sqr751_mont(const f2elm_t a, f2elm_t c); -+ -+// GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2) -+void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); -+ -+// Conversion of a GF(p751^2) element to Montgomery representation -+void to_fp2mont(const f2elm_t a, f2elm_t mc); -+ -+// Conversion of a GF(p751^2) element from Montgomery representation to standard representation -+void from_fp2mont(const f2elm_t ma, f2elm_t c); -+ -+// GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) -+void fp2inv751_mont(f2elm_t a); -+ -+// GF(p751^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p751) inversion done using the binary GCD -+void fp2inv751_mont_bingcd(f2elm_t a); -+ -+// n-way Montgomery inversion -+void mont_n_way_inv(const f2elm_t* vec, const int n, f2elm_t* out); -+ -+/************ Elliptic curve and isogeny functions *************/ -+ -+// Computes the j-invariant of a Montgomery curve with projective constant. -+void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv); -+ -+// Simultaneous doubling and differential addition. -+void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24); -+ -+// Doubling of a Montgomery point in projective coordinates (X:Z). -+void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24); -+ -+// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. -+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e); -+ -+// Differential addition. -+void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); -+ -+// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. -+void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff); -+ -+// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. 
-+void eval_4_isog(point_proj_t P, f2elm_t* coeff); -+ -+// Tripling of a Montgomery point in projective coordinates (X:Z). -+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); -+ -+// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. -+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); -+ -+// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. -+void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff); -+ -+// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. -+void eval_3_isog(point_proj_t Q, const f2elm_t* coeff); -+ -+// 3-way simultaneous inversion -+void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); -+ -+// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. 
-+void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); -+ -+ -+#endif -diff --git a/third_party/sidh/src/P751/generic/fp_generic.c b/third_party/sidh/src/P751/generic/fp_generic.c -new file mode 100644 -index 00000000..ec47384a ---- /dev/null -+++ b/third_party/sidh/src/P751/generic/fp_generic.c -@@ -0,0 +1,224 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: portable modular arithmetic for P751 -+*********************************************************************************************/ -+ -+#include "../P751_internal.h" -+ -+ -+// Global constants -+extern const uint64_t p751[NWORDS_FIELD]; -+extern const uint64_t p751p1[NWORDS_FIELD]; -+extern const uint64_t p751x2[NWORDS_FIELD]; -+ -+ -+__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular addition, c = a+b mod p751. -+ // Inputs: a, b in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], b[i], carry, c[i]); -+ } -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(carry, c[i], ((digit_t*)p751x2)[i], carry, c[i]); -+ } -+ mask = 0 - (digit_t)carry; -+ -+ carry = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, c[i], ((digit_t*)p751x2)[i] & mask, carry, c[i]); -+ } -+} -+ -+ -+__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Modular subtraction, c = a-b mod p751. 
-+ // Inputs: a, b in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], b[i], borrow, c[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, c[i], ((digit_t*)p751x2)[i] & mask, borrow, c[i]); -+ } -+} -+ -+ -+__inline void fpneg751(digit_t* a) -+{ // Modular negation, a = -a mod p751. -+ // Input/output: a in [0, 2*p751-1] -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, ((digit_t*)p751x2)[i], a[i], borrow, a[i]); -+ } -+} -+ -+ -+void fpdiv2_751(const digit_t* a, digit_t* c) -+{ // Modular division by two, c = a/2 mod p751. -+ // Input : a in [0, 2*p751-1] -+ // Output: c in [0, 2*p751-1] -+ unsigned int i, carry = 0; -+ digit_t mask; -+ -+ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p751 -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(carry, a[i], ((digit_t*)p751)[i] & mask, carry, c[i]); -+ } -+ -+ mp_shiftr1(c, NWORDS_FIELD); -+} -+ -+ -+void fpcorrection751(digit_t* a) -+{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. 
-+ unsigned int i, borrow = 0; -+ digit_t mask; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ SUBC(borrow, a[i], ((digit_t*)p751)[i], borrow, a[i]); -+ } -+ mask = 0 - (digit_t)borrow; -+ -+ borrow = 0; -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ ADDC(borrow, a[i], ((digit_t*)p751)[i] & mask, borrow, a[i]); -+ } -+} -+ -+ -+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) -+{ // Digit multiplication, digit * digit -> 2-digit result -+ register digit_t al, ah, bl, bh, temp; -+ digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; -+ digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); -+ -+ al = a & mask_low; // Low part -+ ah = a >> (sizeof(digit_t) * 4); // High part -+ bl = b & mask_low; -+ bh = b >> (sizeof(digit_t) * 4); -+ -+ albl = al*bl; -+ albh = al*bh; -+ ahbl = ah*bl; -+ ahbh = ah*bh; -+ c[0] = albl & mask_low; // C00 -+ -+ res1 = albl >> (sizeof(digit_t) * 4); -+ res2 = ahbl & mask_low; -+ res3 = albh & mask_low; -+ temp = res1 + res2 + res3; -+ carry = temp >> (sizeof(digit_t) * 4); -+ c[0] ^= temp << (sizeof(digit_t) * 4); // C01 -+ -+ res1 = ahbl >> (sizeof(digit_t) * 4); -+ res2 = albh >> (sizeof(digit_t) * 4); -+ res3 = ahbh & mask_low; -+ temp = res1 + res2 + res3 + carry; -+ c[1] = temp & mask_low; // C10 -+ carry = temp & mask_high; -+ c[1] ^= (ahbh & mask_high) + carry; // C11 -+} -+ -+ -+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
-+ unsigned int i, j; -+ digit_t t = 0, u = 0, v = 0, UV[2]; -+ unsigned int carry = 0; -+ -+ for (i = 0; i < nwords; i++) { -+ for (j = 0; j <= i; j++) { -+ MUL(a[j], b[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ c[i] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ -+ for (i = nwords; i < 2*nwords-1; i++) { -+ for (j = i-nwords+1; j < nwords; j++) { -+ MUL(a[j], b[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ c[i] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ c[2*nwords-1] = v; -+} -+ -+ -+void rdc_mont(const digit_t* ma, digit_t* mc) -+{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p751. -+ // mc = ma*R^-1 mod p751x2, where R = 2^768. -+ // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1]. -+ // ma is assumed to be in Montgomery representation. -+ unsigned int i, j, carry, count = p751_ZERO_WORDS; -+ digit_t UV[2], t = 0, u = 0, v = 0; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ mc[i] = 0; -+ } -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ for (j = 0; j < i; j++) { -+ if (j < (i-p751_ZERO_WORDS+1)) { -+ MUL(mc[j], ((digit_t*)p751p1)[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ } -+ ADDC(0, v, ma[i], carry, v); -+ ADDC(carry, u, 0, carry, u); -+ t += carry; -+ mc[i] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ -+ for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { -+ if (count > 0) { -+ count -= 1; -+ } -+ for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { -+ if (j < (NWORDS_FIELD-count)) { -+ MUL(mc[j], ((digit_t*)p751p1)[i-j], UV+1, UV[0]); -+ ADDC(0, UV[0], v, carry, v); -+ ADDC(carry, UV[1], u, carry, u); -+ t += carry; -+ } -+ } -+ ADDC(0, v, ma[i], carry, v); -+ ADDC(carry, u, 0, carry, u); -+ t += carry; -+ mc[i-NWORDS_FIELD] = v; -+ v = u; -+ u = t; -+ t = 0; -+ } -+ ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, 
v); -+ mc[NWORDS_FIELD-1] = v; -+} -\ No newline at end of file -diff --git a/third_party/sidh/src/config.h b/third_party/sidh/src/config.h -new file mode 100644 -index 00000000..08dd9295 ---- /dev/null -+++ b/third_party/sidh/src/config.h -@@ -0,0 +1,265 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: configuration file and platform-dependent macros -+*********************************************************************************************/ -+ -+#ifndef __CONFIG_H__ -+#define __CONFIG_H__ -+ -+#include -+#include -+#include -+ -+ -+// Definition of operating system -+ -+#define OS_WIN 1 -+#define OS_LINUX 2 -+ -+#if defined(__WINDOWS__) // Microsoft Windows OS -+ #define OS_TARGET OS_WIN -+#elif defined(__LINUX__) // Linux OS -+ #define OS_TARGET OS_LINUX -+#else -+ #error -- "Unsupported OS" -+#endif -+ -+ -+// Definition of compiler -+ -+#define COMPILER_VC 1 -+#define COMPILER_GCC 2 -+#define COMPILER_CLANG 3 -+ -+#if defined(_MSC_VER) // Microsoft Visual C compiler -+ #define COMPILER COMPILER_VC -+#elif defined(__GNUC__) // GNU GCC compiler -+ #define COMPILER COMPILER_GCC -+#elif defined(__clang__) // Clang compiler -+ #define COMPILER COMPILER_CLANG -+#else -+ #error -- "Unsupported COMPILER" -+#endif -+ -+ -+// Definition of the targeted architecture and basic data types -+ -+#define TARGET_AMD64 1 -+#define TARGET_x86 2 -+#define TARGET_ARM 3 -+#define TARGET_ARM64 4 -+ -+#if defined(_AMD64_) -+ #define TARGET TARGET_AMD64 -+ #define RADIX 64 -+ #define LOG2RADIX 6 -+ typedef uint64_t digit_t; // Unsigned 64-bit digit -+#elif defined(_X86_) -+ #define TARGET TARGET_x86 -+ #define RADIX 32 -+ #define LOG2RADIX 5 -+ typedef uint32_t digit_t; // Unsigned 32-bit digit -+#elif defined(_ARM_) -+ #define TARGET TARGET_ARM -+ #define RADIX 32 -+ #define LOG2RADIX 5 -+ typedef uint32_t digit_t; // Unsigned 32-bit digit -+#elif 
defined(_ARM64_) -+ #define TARGET TARGET_ARM64 -+ #define RADIX 64 -+ #define LOG2RADIX 6 -+ typedef uint64_t digit_t; // Unsigned 64-bit digit -+#else -+ #error -- "Unsupported ARCHITECTURE" -+#endif -+ -+#define RADIX64 64 -+ -+ -+// Selection of generic, portable implementation -+ -+#if defined(_GENERIC_) -+ #define GENERIC_IMPLEMENTATION -+#elif defined(_FAST_) -+ #define FAST_IMPLEMENTATION -+#endif -+ -+ -+// Extended datatype support -+ -+#if defined(GENERIC_IMPLEMENTATION) -+ typedef uint64_t uint128_t[2]; -+#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) -+ #define UINT128_SUPPORT -+ typedef unsigned uint128_t __attribute__((mode(TI))); -+#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) -+ #define UINT128_SUPPORT -+ typedef unsigned uint128_t __attribute__((mode(TI))); -+#elif (TARGET == TARGET_AMD64) && (OS_TARGET == OS_WIN && COMPILER == COMPILER_VC) -+ #define SCALAR_INTRIN_SUPPORT -+ typedef uint64_t uint128_t[2]; -+#else -+ #error -- "Unsupported configuration" -+#endif -+ -+ -+// Macro definitions -+ -+#define NBITS_TO_NBYTES(nbits) (((nbits)+7)/8) // Conversion macro from number of bits to number of bytes -+#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words -+#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words -+ -+// Macro to avoid compiler warnings when detecting unreferenced parameters -+#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR)) -+ -+ -+/********************** Constant-time unsigned comparisons ***********************/ -+ -+// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise -+ -+static __inline unsigned int is_digit_nonzero_ct(digit_t x) -+{ // Is x != 0? 
-+ return (unsigned int)((x | (0-x)) >> (RADIX-1)); -+} -+ -+static __inline unsigned int is_digit_zero_ct(digit_t x) -+{ // Is x = 0? -+ return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); -+} -+ -+static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) -+{ // Is x < y? -+ return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX-1)); -+} -+ -+ -+/********************** Macros for platform-dependent operations **********************/ -+ -+#if defined(GENERIC_IMPLEMENTATION) -+ -+// Digit multiplication -+#define MUL(multiplier, multiplicand, hi, lo) \ -+ digit_x_digit((multiplier), (multiplicand), &(lo)); -+ -+// Digit addition with carry -+#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ -+ { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ -+ (sumOut) = (addend2) + tempReg; \ -+ (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } -+ -+// Digit subtraction with borrow -+#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ -+ { digit_t tempReg = (minuend) - (subtrahend); \ -+ unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ -+ (differenceOut) = tempReg - (digit_t)(borrowIn); \ -+ (borrowOut) = borrowReg; } -+ -+// Shift right with flexible datatype -+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ -+ (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); -+ -+// Shift left with flexible datatype -+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ -+ (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift))); -+ -+// 64x64-bit multiplication -+#define MUL128(multiplier, multiplicand, product) \ -+ mp_mul((digit_t*)&(multiplier), (digit_t*)&(multiplicand), (digit_t*)&(product), NWORDS_FIELD/2); -+ -+// 128-bit addition, inputs < 2^127 -+#define ADD128(addend1, addend2, addition) \ -+ mp_add((digit_t*)(addend1), (digit_t*)(addend2), 
(digit_t*)(addition), NWORDS_FIELD); -+ -+// 128-bit addition with output carry -+#define ADC128(addend1, addend2, carry, addition) \ -+ (carry) = mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); -+ -+#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) -+ -+// Digit multiplication -+#define MUL(multiplier, multiplicand, hi, lo) \ -+ (lo) = _umul128((multiplier), (multiplicand), (hi)); -+ -+// Digit addition with carry -+#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ -+ (carryOut) = _addcarry_u64((carryIn), (addend1), (addend2), &(sumOut)); -+ -+// Digit subtraction with borrow -+#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ -+ (borrowOut) = _subborrow_u64((borrowIn), (minuend), (subtrahend), &(differenceOut)); -+ -+// Digit shift right -+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ -+ (shiftOut) = __shiftright128((lowIn), (highIn), (shift)); -+ -+// Digit shift left -+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ -+ (shiftOut) = __shiftleft128((lowIn), (highIn), (shift)); -+ -+// 64x64-bit multiplication -+#define MUL128(multiplier, multiplicand, product) \ -+ (product)[0] = _umul128((multiplier), (multiplicand), &(product)[1]); -+ -+// 128-bit addition, inputs < 2^127 -+#define ADD128(addend1, addend2, addition) \ -+ { unsigned char carry = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ -+ _addcarry_u64(carry, (addend1)[1], (addend2)[1], &(addition)[1]); } -+ -+// 128-bit addition with output carry -+#define ADC128(addend1, addend2, carry, addition) \ -+ (carry) = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ -+ (carry) = _addcarry_u64((carry), (addend1)[1], (addend2)[1], &(addition)[1]); -+ -+// 128-bit subtraction, subtrahend < 2^127 -+#define SUB128(minuend, subtrahend, difference) \ -+ { unsigned char borrow = _subborrow_u64(0, (minuend)[0], (subtrahend)[0], &(difference)[0]); \ -+ _subborrow_u64(borrow, (minuend)[1], 
(subtrahend)[1], &(difference)[1]); } -+ -+// 128-bit right shift, max. shift value is 64 -+#define SHIFTR128(Input, shift, shiftOut) \ -+ (shiftOut)[0] = __shiftright128((Input)[0], (Input)[1], (shift)); \ -+ (shiftOut)[1] = (Input)[1] >> (shift); -+ -+// 128-bit left shift, max. shift value is 64 -+#define SHIFTL128(Input, shift, shiftOut) \ -+ (shiftOut)[1] = __shiftleft128((Input)[0], (Input)[1], (shift)); \ -+ (shiftOut)[0] = (Input)[0] << (shift); -+ -+#define MULADD128(multiplier, multiplicand, addend, carry, result); \ -+ { uint128_t product; \ -+ MUL128(multiplier, multiplicand, product); \ -+ ADC128(addend, product, carry, result); } -+ -+#elif ((TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && OS_TARGET == OS_LINUX) -+ -+// Digit multiplication -+#define MUL(multiplier, multiplicand, hi, lo) \ -+ { uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ -+ *(hi) = (digit_t)(tempReg >> RADIX); \ -+ (lo) = (digit_t)tempReg; } -+ -+// Digit addition with carry -+#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ -+ { uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ -+ (carryOut) = (digit_t)(tempReg >> RADIX); \ -+ (sumOut) = (digit_t)tempReg; } -+ -+// Digit subtraction with borrow -+#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ -+ { uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ -+ (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1)); \ -+ (differenceOut) = (digit_t)tempReg; } -+ -+// Digit shift right -+#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ -+ (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); -+ -+// Digit shift left -+#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ -+ (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); -+ -+#endif -+ -+ -+#endif -diff --git a/third_party/sidh/src/ec_isogeny.c b/third_party/sidh/src/ec_isogeny.c -new 
file mode 100644 -index 00000000..fefbaaa7 ---- /dev/null -+++ b/third_party/sidh/src/ec_isogeny.c -@@ -0,0 +1,333 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: elliptic curve and isogeny functions -+*********************************************************************************************/ -+ -+ -+void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24) -+{ // Doubling of a Montgomery point in projective coordinates (X:Z). -+ // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. -+ // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). -+ f2elm_t t0, t1; -+ -+ fp2sub(P->X, P->Z, t0); // t0 = X1-Z1 -+ fp2add(P->X, P->Z, t1); // t1 = X1+Z1 -+ fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 -+ fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 -+ fp2mul_mont(C24, t0, Q->Z); // Z2 = C24*(X1-Z1)^2 -+ fp2mul_mont(t1, Q->Z, Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 -+ fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2 -+ fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] -+ fp2add(Q->Z, t0, Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 -+ fp2mul_mont(Q->Z, t1, Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] -+} -+ -+ -+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e) -+{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. -+ // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. -+ // Output: projective Montgomery x-coordinates Q <- (2^e)*P. 
-+ int i; -+ -+ copy_words((digit_t*)P, (digit_t*)Q, 2*2*NWORDS_FIELD); -+ -+ for (i = 0; i < e; i++) { -+ xDBL(Q, Q, A24plus, C24); -+ } -+} -+ -+ -+void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff) -+{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. -+ // Input: projective point of order four P = (X4:Z4). -+ // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients -+ // that are used to evaluate the isogeny at a point in eval_4_isog(). -+ -+ fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4 -+ fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4 -+ fp2sqr_mont(P->Z, coeff[0]); // coeff[0] = Z4^2 -+ fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2 -+ fp2sqr_mont(coeff[0], C24); // C24 = 4*Z4^4 -+ fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2 -+ fp2sqr_mont(P->X, A24plus); // A24plus = X4^2 -+ fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 -+ fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 -+} -+ -+ -+void eval_4_isog(point_proj_t P, f2elm_t* coeff) -+{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined -+ // by the 3 coefficients in coeff (computed in the function get_4_isog()). -+ // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). -+ // Output: the projective point P = phi(P) = (X:Z) in the codomain. 
-+ f2elm_t t0, t1; -+ -+ fp2add(P->X, P->Z, t0); // t0 = X+Z -+ fp2sub(P->X, P->Z, t1); // t1 = X-Z -+ fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1] -+ fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2] -+ fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) -+ fp2mul_mont(t0, coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) -+ fp2add(P->X, P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] -+ fp2sub(P->X, P->Z, P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] -+ fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 -+ fp2sqr_mont(P->Z, P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 -+ fp2add(t1, t0, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 -+ fp2sub(P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) -+ fp2mul_mont(P->X, t1, P->X); // Xfinal -+ fp2mul_mont(P->Z, t0, P->Z); // Zfinal -+} -+ -+ -+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus) -+{ // Tripling of a Montgomery point in projective coordinates (X:Z). -+ // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. -+ // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). 
-+ f2elm_t t0, t1, t2, t3, t4, t5, t6; -+ -+ fp2sub(P->X, P->Z, t0); // t0 = X-Z -+ fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 -+ fp2add(P->X, P->Z, t1); // t1 = X+Z -+ fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 -+ fp2add(t0, t1, t4); // t4 = 2*X -+ fp2sub(t1, t0, t0); // t0 = 2*Z -+ fp2sqr_mont(t4, t1); // t1 = 4*X^2 -+ fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 -+ fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 -+ fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 -+ fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3 -+ fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 -+ fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3 -+ fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 -+ fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 -+ fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] -+ fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 -+ fp2sqr_mont(t2, t2); // t2 = t2^2 -+ fp2mul_mont(t4, t2, Q->X); // X3 = 2*X*t2 -+ fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] -+ fp2sqr_mont(t1, t1); // t1 = t1^2 -+ fp2mul_mont(t0, t1, Q->Z); // Z3 = 2*Z*t1 -+} -+ -+ -+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e) -+{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. -+ // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. -+ // Output: projective Montgomery x-coordinates Q <- (3^e)*P. 
-+ int i; -+ -+ copy_words((digit_t*)P, (digit_t*)Q, 2*2*NWORDS_FIELD); -+ -+ for (i = 0; i < e; i++) { -+ xTPL(Q, Q, A24minus, A24plus); -+ } -+} -+ -+ -+void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff) -+{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. -+ // Input: projective point of order three P = (X3:Z3). -+ // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. -+ f2elm_t t0, t1, t2, t3, t4; -+ -+ fp2sub(P->X, P->Z, coeff[0]); // coeff0 = X-Z -+ fp2sqr_mont(coeff[0], t0); // t0 = (X-Z)^2 -+ fp2add(P->X, P->Z, coeff[1]); // coeff1 = X+Z -+ fp2sqr_mont(coeff[1], t1); // t1 = (X+Z)^2 -+ fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 -+ fp2add(coeff[0], coeff[1], t3); // t3 = 2*X -+ fp2sqr_mont(t3, t3); // t3 = 4*X^2 -+ fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 -+ fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 -+ fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 -+ fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 -+ fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) -+ fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 -+ fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] -+ fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 -+ fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) -+ fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 -+ fp2mul_mont(t3, t4, t4); // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] -+ fp2sub(t4, A24minus, t0); // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] -+ fp2add(A24minus, t0, A24plus); // A24plus = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 -+} -+ -+ -+void eval_3_isog(point_proj_t Q, const f2elm_t* coeff) -+{ // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and -+ // a point P with 2 coefficients in coeff (computed in the function get_3_isog()). 
-+ // Inputs: projective points P = (X3:Z3) and Q = (X:Z). -+ // Output: the projective point Q <- phi(Q) = (X3:Z3). -+ f2elm_t t0, t1, t2; -+ -+ fp2add(Q->X, Q->Z, t0); // t0 = X+Z -+ fp2sub(Q->X, Q->Z, t1); // t1 = X-Z -+ fp2mul_mont(t0, coeff[0], t0); // t0 = coeff0*(X+Z) -+ fp2mul_mont(t1, coeff[1], t1); // t1 = coeff1*(X-Z) -+ fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z) -+ fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z) -+ fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 -+ fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 -+ fp2mul_mont(Q->X, t2, Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 -+ fp2mul_mont(Q->Z, t0, Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 -+} -+ -+ -+void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3) -+{ // 3-way simultaneous inversion -+ // Input: z1,z2,z3 -+ // Output: 1/z1,1/z2,1/z3 (override inputs). -+ f2elm_t t0, t1, t2, t3; -+ -+ fp2mul_mont(z1, z2, t0); // t0 = z1*z2 -+ fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 -+ fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) -+ fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) -+ fp2mul_mont(t2, z2, t3); // t3 = 1/z1 -+ fp2mul_mont(t2, z1, z2); // z2 = 1/z2 -+ fp2mul_mont(t0, t1, z3); // z3 = 1/z3 -+ fp2copy(t3, z1); // z1 = 1/z1 -+} -+ -+ -+void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A) -+{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. -+ // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. -+ // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
-+ f2elm_t t0, t1, one = {0}; -+ -+ fpcopy((digit_t*)&Montgomery_one, one[0]); -+ fp2add(xP, xQ, t1); // t1 = xP+xQ -+ fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ -+ fp2mul_mont(xR, t1, A); // A = xR*t1 -+ fp2add(t0, A, A); // A = A+t0 -+ fp2mul_mont(t0, xR, t0); // t0 = t0*xR -+ fp2sub(A, one, A); // A = A-1 -+ fp2add(t0, t0, t0); // t0 = t0+t0 -+ fp2add(t1, xR, t1); // t1 = t1+xR -+ fp2add(t0, t0, t0); // t0 = t0+t0 -+ fp2sqr_mont(A, A); // A = A^2 -+ fp2inv_mont(t0); // t0 = 1/t0 -+ fp2mul_mont(A, t0, A); // A = A*t0 -+ fp2sub(A, t1, A); // Afinal = A-t1 -+} -+ -+ -+void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv) -+{ // Computes the j-invariant of a Montgomery curve with projective constant. -+ // Input: A,C in GF(p^2). -+ // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. -+ f2elm_t t0, t1; -+ -+ fp2sqr_mont(A, jinv); // jinv = A^2 -+ fp2sqr_mont(C, t1); // t1 = C^2 -+ fp2add(t1, t1, t0); // t0 = t1+t1 -+ fp2sub(jinv, t0, t0); // t0 = jinv-t0 -+ fp2sub(t0, t1, t0); // t0 = t0-t1 -+ fp2sub(t0, t1, jinv); // jinv = t0-t1 -+ fp2sqr_mont(t1, t1); // t1 = t1^2 -+ fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 -+ fp2add(t0, t0, t0); // t0 = t0+t0 -+ fp2add(t0, t0, t0); // t0 = t0+t0 -+ fp2sqr_mont(t0, t1); // t1 = t0^2 -+ fp2mul_mont(t0, t1, t0); // t0 = t0*t1 -+ fp2add(t0, t0, t0); // t0 = t0+t0 -+ fp2add(t0, t0, t0); // t0 = t0+t0 -+ fp2inv_mont(jinv); // jinv = 1/jinv -+ fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv -+} -+ -+ -+void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24) -+{ // Simultaneous doubling and differential addition. -+ // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. 
-+ // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. -+ f2elm_t t0, t1, t2; -+ -+ fp2add(P->X, P->Z, t0); // t0 = XP+ZP -+ fp2sub(P->X, P->Z, t1); // t1 = XP-ZP -+ fp2sqr_mont(t0, P->X); // XP = (XP+ZP)^2 -+ fp2sub(Q->X, Q->Z, t2); // t2 = XQ-ZQ -+ fp2correction(t2); -+ fp2add(Q->X, Q->Z, Q->X); // XQ = XQ+ZQ -+ fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ) -+ fp2sqr_mont(t1, P->Z); // ZP = (XP-ZP)^2 -+ fp2mul_mont(t1, Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ) -+ fp2sub(P->X, P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 -+ fp2mul_mont(P->X, P->Z, P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 -+ fp2mul_mont(t2, A24, Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] -+ fp2sub(t0, t1, Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) -+ fp2add(Q->X, P->Z, P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 -+ fp2add(t0, t1, Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) -+ fp2mul_mont(P->Z, t2, P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] -+ fp2sqr_mont(Q->Z, Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 -+ fp2sqr_mont(Q->X, Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 -+ fp2mul_mont(Q->Z, xPQ, Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 -+} -+ -+ -+static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) -+{ // Swap points. 
-+ // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P -+ digit_t temp; -+ unsigned int i; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ temp = option & (P->X[0][i] ^ Q->X[0][i]); -+ P->X[0][i] = temp ^ P->X[0][i]; -+ Q->X[0][i] = temp ^ Q->X[0][i]; -+ temp = option & (P->Z[0][i] ^ Q->Z[0][i]); -+ P->Z[0][i] = temp ^ P->Z[0][i]; -+ Q->Z[0][i] = temp ^ Q->Z[0][i]; -+ temp = option & (P->X[1][i] ^ Q->X[1][i]); -+ P->X[1][i] = temp ^ P->X[1][i]; -+ Q->X[1][i] = temp ^ Q->X[1][i]; -+ temp = option & (P->Z[1][i] ^ Q->Z[1][i]); -+ P->Z[1][i] = temp ^ P->Z[1][i]; -+ Q->Z[1][i] = temp ^ Q->Z[1][i]; -+ } -+} -+ -+ -+static void LADDER3PT(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const digit_t* m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t A) -+{ -+ point_proj_t R0 = {0}, R2 = {0}; -+ f2elm_t A24 = {0}; -+ digit_t mask; -+ int i, nbits, bit, swap, prevbit = 0; -+ -+ if (AliceOrBob == ALICE) { -+ nbits = OALICE_BITS; -+ } else { -+ nbits = OBOB_BITS; -+ } -+ -+ // Initializing constant -+ fpcopy((digit_t*)&Montgomery_one, A24[0]); -+ fp2add(A24, A24, A24); -+ fp2add(A, A24, A24); -+ fp2div2(A24, A24); -+ fp2div2(A24, A24); // A24 = (A+2)/4 -+ -+ // Initializing points -+ fp2copy(xQ, R0->X); -+ fpcopy((digit_t*)&Montgomery_one, (digit_t*)R0->Z); -+ fp2copy(xPQ, R2->X); -+ fpcopy((digit_t*)&Montgomery_one, (digit_t*)R2->Z); -+ fp2copy(xP, R->X); -+ fpcopy((digit_t*)&Montgomery_one, (digit_t*)R->Z); -+ fpzero((digit_t*)(R->Z)[1]); -+ -+ // Main loop -+ for (i = 0; i < nbits; i++) { -+ bit = (m[i >> LOG2RADIX] >> (i & (RADIX-1))) & 1; -+ swap = bit ^ prevbit; -+ prevbit = bit; -+ mask = 0 - (digit_t)swap; -+ -+ swap_points(R, R2, mask); -+ xDBLADD(R0, R2, R->X, A24); -+ fp2mul_mont(R2->X, R->Z, R2->X); -+ } -+} -diff --git a/third_party/sidh/src/fpx.c b/third_party/sidh/src/fpx.c -new file mode 100644 -index 00000000..6e5e33a5 ---- /dev/null -+++ b/third_party/sidh/src/fpx.c -@@ -0,0 +1,558 @@ 
-+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: core functions over GF(p) and GF(p^2) -+*********************************************************************************************/ -+ -+ -+__inline void fpcopy(const felm_t a, felm_t c) -+{ // Copy a field element, c = a. -+ unsigned int i; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) -+ c[i] = a[i]; -+} -+ -+ -+__inline void fpzero(felm_t a) -+{ // Zero a field element, a = 0. -+ unsigned int i; -+ -+ for (i = 0; i < NWORDS_FIELD; i++) -+ a[i] = 0; -+} -+ -+ -+void to_mont(const felm_t a, felm_t mc) -+{ // Conversion to Montgomery representation, -+ // mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. -+ // The Montgomery constant R^2 mod p is the global value "Montgomery_R2". -+ -+ fpmul_mont(a, (digit_t*)&Montgomery_R2, mc); -+} -+ -+ -+void from_mont(const felm_t ma, felm_t c) -+{ // Conversion from Montgomery representation to standard representation, -+ // c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. -+ digit_t one[NWORDS_FIELD] = {0}; -+ -+ one[0] = 1; -+ fpmul_mont(ma, one, c); -+ fpcorrection(c); -+} -+ -+ -+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords) -+{ // Copy wordsize digits, c = a, where lng(a) = nwords. -+ unsigned int i; -+ -+ for (i = 0; i < nwords; i++) { -+ c[i] = a[i]; -+ } -+} -+ -+ -+void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) -+{ // Multiprecision multiplication, c = a*b mod p. -+ dfelm_t temp = {0}; -+ -+ mp_mul(ma, mb, temp, NWORDS_FIELD); -+ rdc_mont(temp, mc); -+} -+ -+ -+void fpsqr_mont(const felm_t ma, felm_t mc) -+{ // Multiprecision squaring, c = a^2 mod p. -+ dfelm_t temp = {0}; -+ -+ mp_mul(ma, ma, temp, NWORDS_FIELD); -+ rdc_mont(temp, mc); -+} -+ -+ -+void fpinv_mont(felm_t a) -+{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. 
-+ felm_t tt; -+ -+ fpcopy(a, tt); -+ fpinv_chain_mont(tt); -+ fpsqr_mont(tt, tt); -+ fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, a); -+} -+ -+ -+void fp2copy(const f2elm_t a, f2elm_t c) -+{ // Copy a GF(p^2) element, c = a. -+ fpcopy(a[0], c[0]); -+ fpcopy(a[1], c[1]); -+} -+ -+ -+void fp2zero(f2elm_t a) -+{ // Zero a GF(p^2) element, a = 0. -+ fpzero(a[0]); -+ fpzero(a[1]); -+} -+ -+ -+void fp2neg(f2elm_t a) -+{ // GF(p^2) negation, a = -a in GF(p^2). -+ fpneg(a[0]); -+ fpneg(a[1]); -+} -+ -+ -+__inline void fp2add(const f2elm_t a, const f2elm_t b, f2elm_t c) -+{ // GF(p^2) addition, c = a+b in GF(p^2). -+ fpadd(a[0], b[0], c[0]); -+ fpadd(a[1], b[1], c[1]); -+} -+ -+ -+__inline void fp2sub(const f2elm_t a, const f2elm_t b, f2elm_t c) -+{ // GF(p^2) subtraction, c = a-b in GF(p^2). -+ fpsub(a[0], b[0], c[0]); -+ fpsub(a[1], b[1], c[1]); -+} -+ -+ -+void fp2div2(const f2elm_t a, f2elm_t c) -+{ // GF(p^2) division by two, c = a/2 in GF(p^2). -+ fpdiv2(a[0], c[0]); -+ fpdiv2(a[1], c[1]); -+} -+ -+ -+void fp2correction(f2elm_t a) -+{ // Modular correction, a = a in GF(p^2). -+ fpcorrection(a[0]); -+ fpcorrection(a[1]); -+} -+ -+ -+__inline static void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Multiprecision addition, c = a+b. -+#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) -+ -+ mp_add(a, b, c, NWORDS_FIELD); -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ mp_add_asm(a, b, c); -+ -+#endif -+} -+ -+ -+void fp2sqr_mont(const f2elm_t a, f2elm_t c) -+{ // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
-+ // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] -+ // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -+ felm_t t1, t2, t3; -+ -+ mp_addfast(a[0], a[1], t1); // t1 = a0+a1 -+ fpsub(a[0], a[1], t2); // t2 = a0-a1 -+ mp_addfast(a[0], a[0], t3); // t3 = 2a0 -+ fpmul_mont(t1, t2, c[0]); // c0 = (a0+a1)(a0-a1) -+ fpmul_mont(t3, a[1], c[1]); // c1 = 2a0*a1 -+} -+ -+ -+__inline unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. -+ unsigned int i, borrow = 0; -+ -+ for (i = 0; i < nwords; i++) { -+ SUBC(borrow, a[i], b[i], borrow, c[i]); -+ } -+ -+ return borrow; -+} -+ -+ -+__inline static digit_t mp_subfast(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -+ // If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 -+#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) -+ -+ return (0 - (digit_t)mp_sub(a, b, c, 2*NWORDS_FIELD)); -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ return mp_subx2_asm(a, b, c); -+ -+#endif -+} -+ -+ -+__inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c) -+{ // Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -+ // Inputs should be s.t. c > a and c > b -+#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) -+ -+ mp_sub(c, a, c, 2*NWORDS_FIELD); -+ mp_sub(c, b, c, 2*NWORDS_FIELD); -+ -+#elif (OS_TARGET == OS_LINUX) -+ -+ mp_dblsubx2_asm(a, b, c); -+ -+#endif -+} -+ -+ -+void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) -+{ // GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). 
-+ // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] -+ // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -+ felm_t t1, t2; -+ dfelm_t tt1, tt2, tt3; -+ digit_t mask; -+ unsigned int i; -+ -+ mp_addfast(a[0], a[1], t1); // t1 = a0+a1 -+ mp_addfast(b[0], b[1], t2); // t2 = b0+b1 -+ mp_mul(a[0], b[0], tt1, NWORDS_FIELD); // tt1 = a0*b0 -+ mp_mul(a[1], b[1], tt2, NWORDS_FIELD); // tt2 = a1*b1 -+ mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1) -+ mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 -+ mask = mp_subfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0 -+ -+ for (i = 0; i < NWORDS_FIELD; i++) { -+ t1[i] = ((digit_t*)PRIME)[i] & mask; -+ } -+ -+ rdc_mont(tt3, c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 -+ mp_addfast((digit_t*)&tt1[NWORDS_FIELD], t1, (digit_t*)&tt1[NWORDS_FIELD]); -+ rdc_mont(tt1, c[0]); // c[0] = a0*b0 - a1*b1 -+} -+ -+ -+void fpinv_chain_mont(felm_t a) -+{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. 
-+ unsigned int i, j; -+ -+#if (NBITS_FIELD == 503) -+ felm_t t[15], tt; -+ -+ // Precomputed table -+ fpsqr_mont(a, tt); -+ fpmul_mont(a, tt, t[0]); -+ for (i = 0; i <= 13; i++) fpmul_mont(t[i], tt, t[i+1]); -+ -+ fpcopy(a, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[8], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[9], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[0], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[8], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[10], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[0], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[10], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[10], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[3], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 12; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[12], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[8], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[12], tt, tt); -+ for (i = 0; i < 6; 
i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[11], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[14], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[14], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[8], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[4], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[7], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[0], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[11], tt, tt); -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[13], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[1], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[10], tt, tt); -+ for (j = 0; j < 49; j++) { -+ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[14], tt, tt); -+ } -+ fpcopy(tt, a); -+ -+#elif (NBITS_FIELD == 751) -+ felm_t t[27], tt; -+ -+ // Precomputed table -+ fpsqr_mont(a, tt); -+ fpmul_mont(a, tt, t[0]); -+ fpmul_mont(t[0], tt, t[1]); -+ fpmul_mont(t[1], tt, t[2]); -+ fpmul_mont(t[2], tt, t[3]); -+ fpmul_mont(t[3], tt, t[3]); -+ for (i = 3; i <= 8; i++) fpmul_mont(t[i], tt, t[i+1]); -+ fpmul_mont(t[9], tt, t[9]); -+ for (i = 9; i <= 20; i++) fpmul_mont(t[i], tt, t[i+1]); -+ fpmul_mont(t[21], tt, t[21]); -+ for (i = 
21; i <= 24; i++) fpmul_mont(t[i], tt, t[i+1]); -+ fpmul_mont(t[25], tt, t[25]); -+ fpmul_mont(t[25], tt, t[26]); -+ -+ fpcopy(a, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[20], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[24], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[11], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[8], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[23], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[15], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[13], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[26], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[20], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[11], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[10], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[14], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[4], tt, tt); -+ for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[18], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[1], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[22], tt, tt); -+ for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[6], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[24], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[9], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[18], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[17], tt, tt); -+ for (i = 0; i < 8; 
i++) fpsqr_mont(tt, tt); -+ fpmul_mont(a, tt, tt); -+ for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[16], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[7], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[0], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[12], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[19], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[22], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[25], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[10], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[22], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[18], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[4], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[14], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[13], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[23], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[21], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[23], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[12], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[9], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[3], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[13], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[17], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[26], tt, tt); -+ for (i = 0; i < 8; i++) 
fpsqr_mont(tt, tt); -+ fpmul_mont(t[5], tt, tt); -+ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[8], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[2], tt, tt); -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[11], tt, tt); -+ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[20], tt, tt); -+ for (j = 0; j < 61; j++) { -+ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); -+ fpmul_mont(t[26], tt, tt); -+ } -+ fpcopy(tt, a); -+#endif -+} -+ -+ -+void fp2inv_mont(f2elm_t a) -+{// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). -+ f2elm_t t1; -+ -+ fpsqr_mont(a[0], t1[0]); // t10 = a0^2 -+ fpsqr_mont(a[1], t1[1]); // t11 = a1^2 -+ fpadd(t1[0], t1[1], t1[0]); // t10 = a0^2+a1^2 -+ fpinv_mont(t1[0]); // t10 = (a0^2+a1^2)^-1 -+ fpneg(a[1]); // a = a0-i*a1 -+ fpmul_mont(a[0], t1[0], a[0]); -+ fpmul_mont(a[1], t1[0], a[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 -+} -+ -+ -+void to_fp2mont(const f2elm_t a, f2elm_t mc) -+{ // Conversion of a GF(p^2) element to Montgomery representation, -+ // mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). -+ -+ to_mont(a[0], mc[0]); -+ to_mont(a[1], mc[1]); -+} -+ -+ -+void from_fp2mont(const f2elm_t ma, f2elm_t c) -+{ // Conversion of a GF(p^2) element from Montgomery representation to standard representation, -+ // c_i = ma_i*R^(-1) = a_i in GF(p^2). -+ -+ from_mont(ma[0], c[0]); -+ from_mont(ma[1], c[1]); -+} -+ -+ -+__inline unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -+{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. 
-+ unsigned int i, carry = 0; -+ -+ for (i = 0; i < nwords; i++) { -+ ADDC(carry, a[i], b[i], carry, c[i]); -+ } -+ -+ return carry; -+} -+ -+ -+void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords) -+{ -+ unsigned int i, j = 0; -+ -+ while (shift > RADIX) { -+ j += 1; -+ shift -= RADIX; -+ } -+ -+ for (i = 0; i < nwords-j; i++) -+ x[nwords-1-i] = x[nwords-1-i-j]; -+ for (i = nwords-j; i < nwords; i++) -+ x[nwords-1-i] = 0; -+ if (shift != 0) { -+ for (j = nwords-1; j > 0; j--) -+ SHIFTL(x[j], x[j-1], shift, x[j], RADIX); -+ x[0] <<= shift; -+ } -+} -+ -+ -+void mp_shiftr1(digit_t* x, const unsigned int nwords) -+{ // Multiprecision right shift by one. -+ unsigned int i; -+ -+ for (i = 0; i < nwords-1; i++) { -+ SHIFTR(x[i+1], x[i], 1, x[i], RADIX); -+ } -+ x[nwords-1] >>= 1; -+} -+ -+ -+void mp_shiftl1(digit_t* x, const unsigned int nwords) -+{ // Multiprecision left shift by one. -+ int i; -+ -+ for (i = nwords-1; i > 0; i--) { -+ SHIFTL(x[i], x[i-1], 1, x[i], RADIX); -+ } -+ x[0] <<= 1; -+} -diff --git a/third_party/sidh/src/random/random.c b/third_party/sidh/src/random/random.c -new file mode 100644 -index 00000000..7f445b81 ---- /dev/null -+++ b/third_party/sidh/src/random/random.c -@@ -0,0 +1,61 @@ -+/******************************************************************************************** -+* Hardware-based random number generation function -+* -+* It uses /dev/urandom in Linux and CNG's BCryptGenRandom function in Windows -+*********************************************************************************************/ -+ -+#include "random.h" -+#include -+#if defined(__WINDOWS__) -+ #include -+ #include -+#elif defined(__LINUX__) -+ #include -+ #include -+ static int lock = -1; -+#endif -+ -+#define passed 0 -+#define failed 1 -+ -+ -+static __inline void delay(unsigned int count) -+{ -+ while (count--) {} -+} -+ -+ -+int randombytes(unsigned char* random_array, unsigned long long nbytes) -+{ // Generation of "nbytes" of random 
values -+ -+#if defined(__WINDOWS__) -+ if (!BCRYPT_SUCCESS(BCryptGenRandom(NULL, random_array, (unsigned long)nbytes, BCRYPT_USE_SYSTEM_PREFERRED_RNG))) { -+ return failed; -+ } -+ -+#elif defined(__LINUX__) -+ int r, n = (int)nbytes, count = 0; -+ -+ if (lock == -1) { -+ do { -+ lock = open("/dev/urandom", O_RDONLY); -+ if (lock == -1) { -+ delay(0xFFFFF); -+ } -+ } while (lock == -1); -+ } -+ -+ while (n > 0) { -+ do { -+ r = read(lock, random_array+count, n); -+ if (r == -1) { -+ delay(0xFFFF); -+ } -+ } while (r == -1); -+ count += r; -+ n -= r; -+ } -+#endif -+ -+ return passed; -+} -\ No newline at end of file -diff --git a/third_party/sidh/src/random/random.h b/third_party/sidh/src/random/random.h -new file mode 100644 -index 00000000..fbed5f82 ---- /dev/null -+++ b/third_party/sidh/src/random/random.h -@@ -0,0 +1,9 @@ -+#ifndef __RANDOM_H__ -+#define __RANDOM_H__ -+ -+ -+// Generate random bytes and output the result to random_array -+int randombytes(unsigned char* random_array, unsigned long long nbytes); -+ -+ -+#endif -\ No newline at end of file -diff --git a/third_party/sidh/src/sha3/fips202.c b/third_party/sidh/src/sha3/fips202.c -new file mode 100644 -index 00000000..f21926d0 ---- /dev/null -+++ b/third_party/sidh/src/sha3/fips202.c -@@ -0,0 +1,572 @@ -+/******************************************************************************************** -+* SHA3-derived functions: SHAKE and cSHAKE -+* -+* Based on the public domain implementation in crypto_hash/keccakc512/simple/ -+* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer -+* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202 -+* by Gilles Van Assche, Daniel J. 
Bernstein, and Peter Schwabe -+* -+* See NIST Special Publication 800-185 for more information: -+* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf -+* -+*********************************************************************************************/ -+ -+#include -+#include -+#include "fips202.h" -+ -+#define NROUNDS 24 -+#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) -+ -+ -+static uint64_t load64(const unsigned char *x) -+{ -+ unsigned long long r = 0, i; -+ -+ for (i = 0; i < 8; ++i) { -+ r |= (unsigned long long)x[i] << 8 * i; -+ } -+ return r; -+} -+ -+ -+static void store64(uint8_t *x, uint64_t u) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < 8; ++i) { -+ x[i] = (uint8_t)u; -+ u >>= 8; -+ } -+} -+ -+ -+static const uint64_t KeccakF_RoundConstants[NROUNDS] = -+{ -+ (uint64_t)0x0000000000000001ULL, -+ (uint64_t)0x0000000000008082ULL, -+ (uint64_t)0x800000000000808aULL, -+ (uint64_t)0x8000000080008000ULL, -+ (uint64_t)0x000000000000808bULL, -+ (uint64_t)0x0000000080000001ULL, -+ (uint64_t)0x8000000080008081ULL, -+ (uint64_t)0x8000000000008009ULL, -+ (uint64_t)0x000000000000008aULL, -+ (uint64_t)0x0000000000000088ULL, -+ (uint64_t)0x0000000080008009ULL, -+ (uint64_t)0x000000008000000aULL, -+ (uint64_t)0x000000008000808bULL, -+ (uint64_t)0x800000000000008bULL, -+ (uint64_t)0x8000000000008089ULL, -+ (uint64_t)0x8000000000008003ULL, -+ (uint64_t)0x8000000000008002ULL, -+ (uint64_t)0x8000000000000080ULL, -+ (uint64_t)0x000000000000800aULL, -+ (uint64_t)0x800000008000000aULL, -+ (uint64_t)0x8000000080008081ULL, -+ (uint64_t)0x8000000000008080ULL, -+ (uint64_t)0x0000000080000001ULL, -+ (uint64_t)0x8000000080008008ULL -+}; -+ -+ -+static void KeccakF1600_StatePermute(uint64_t * state) { -+ int round; -+ -+ uint64_t Aba, Abe, Abi, Abo, Abu; -+ uint64_t Aga, Age, Agi, Ago, Agu; -+ uint64_t Aka, Ake, Aki, Ako, Aku; -+ uint64_t Ama, Ame, Ami, Amo, Amu; -+ uint64_t Asa, Ase, Asi, Aso, Asu; -+ uint64_t BCa, BCe, BCi, BCo, BCu; -+ uint64_t 
Da, De, Di, Do, Du; -+ uint64_t Eba, Ebe, Ebi, Ebo, Ebu; -+ uint64_t Ega, Ege, Egi, Ego, Egu; -+ uint64_t Eka, Eke, Eki, Eko, Eku; -+ uint64_t Ema, Eme, Emi, Emo, Emu; -+ uint64_t Esa, Ese, Esi, Eso, Esu; -+ -+ //copyFromState(A, state) -+ Aba = state[ 0]; -+ Abe = state[ 1]; -+ Abi = state[ 2]; -+ Abo = state[ 3]; -+ Abu = state[ 4]; -+ Aga = state[ 5]; -+ Age = state[ 6]; -+ Agi = state[ 7]; -+ Ago = state[ 8]; -+ Agu = state[ 9]; -+ Aka = state[10]; -+ Ake = state[11]; -+ Aki = state[12]; -+ Ako = state[13]; -+ Aku = state[14]; -+ Ama = state[15]; -+ Ame = state[16]; -+ Ami = state[17]; -+ Amo = state[18]; -+ Amu = state[19]; -+ Asa = state[20]; -+ Ase = state[21]; -+ Asi = state[22]; -+ Aso = state[23]; -+ Asu = state[24]; -+ -+ for( round = 0; round < NROUNDS; round += 2 ) -+ { -+ // prepareTheta -+ BCa = Aba^Aga^Aka^Ama^Asa; -+ BCe = Abe^Age^Ake^Ame^Ase; -+ BCi = Abi^Agi^Aki^Ami^Asi; -+ BCo = Abo^Ago^Ako^Amo^Aso; -+ BCu = Abu^Agu^Aku^Amu^Asu; -+ -+ //thetaRhoPiChiIotaPrepareTheta(round , A, E) -+ Da = BCu^ROL(BCe, 1); -+ De = BCa^ROL(BCi, 1); -+ Di = BCe^ROL(BCo, 1); -+ Do = BCi^ROL(BCu, 1); -+ Du = BCo^ROL(BCa, 1); -+ -+ Aba ^= Da; -+ BCa = Aba; -+ Age ^= De; -+ BCe = ROL(Age, 44); -+ Aki ^= Di; -+ BCi = ROL(Aki, 43); -+ Amo ^= Do; -+ BCo = ROL(Amo, 21); -+ Asu ^= Du; -+ BCu = ROL(Asu, 14); -+ Eba = BCa ^((~BCe)& BCi ); -+ Eba ^= (uint64_t)KeccakF_RoundConstants[round]; -+ Ebe = BCe ^((~BCi)& BCo ); -+ Ebi = BCi ^((~BCo)& BCu ); -+ Ebo = BCo ^((~BCu)& BCa ); -+ Ebu = BCu ^((~BCa)& BCe ); -+ -+ Abo ^= Do; -+ BCa = ROL(Abo, 28); -+ Agu ^= Du; -+ BCe = ROL(Agu, 20); -+ Aka ^= Da; -+ BCi = ROL(Aka, 3); -+ Ame ^= De; -+ BCo = ROL(Ame, 45); -+ Asi ^= Di; -+ BCu = ROL(Asi, 61); -+ Ega = BCa ^((~BCe)& BCi ); -+ Ege = BCe ^((~BCi)& BCo ); -+ Egi = BCi ^((~BCo)& BCu ); -+ Ego = BCo ^((~BCu)& BCa ); -+ Egu = BCu ^((~BCa)& BCe ); -+ -+ Abe ^= De; -+ BCa = ROL(Abe, 1); -+ Agi ^= Di; -+ BCe = ROL(Agi, 6); -+ Ako ^= Do; -+ BCi = ROL(Ako, 25); -+ Amu ^= Du; -+ BCo = 
ROL(Amu, 8); -+ Asa ^= Da; -+ BCu = ROL(Asa, 18); -+ Eka = BCa ^((~BCe)& BCi ); -+ Eke = BCe ^((~BCi)& BCo ); -+ Eki = BCi ^((~BCo)& BCu ); -+ Eko = BCo ^((~BCu)& BCa ); -+ Eku = BCu ^((~BCa)& BCe ); -+ -+ Abu ^= Du; -+ BCa = ROL(Abu, 27); -+ Aga ^= Da; -+ BCe = ROL(Aga, 36); -+ Ake ^= De; -+ BCi = ROL(Ake, 10); -+ Ami ^= Di; -+ BCo = ROL(Ami, 15); -+ Aso ^= Do; -+ BCu = ROL(Aso, 56); -+ Ema = BCa ^((~BCe)& BCi ); -+ Eme = BCe ^((~BCi)& BCo ); -+ Emi = BCi ^((~BCo)& BCu ); -+ Emo = BCo ^((~BCu)& BCa ); -+ Emu = BCu ^((~BCa)& BCe ); -+ -+ Abi ^= Di; -+ BCa = ROL(Abi, 62); -+ Ago ^= Do; -+ BCe = ROL(Ago, 55); -+ Aku ^= Du; -+ BCi = ROL(Aku, 39); -+ Ama ^= Da; -+ BCo = ROL(Ama, 41); -+ Ase ^= De; -+ BCu = ROL(Ase, 2); -+ Esa = BCa ^((~BCe)& BCi ); -+ Ese = BCe ^((~BCi)& BCo ); -+ Esi = BCi ^((~BCo)& BCu ); -+ Eso = BCo ^((~BCu)& BCa ); -+ Esu = BCu ^((~BCa)& BCe ); -+ -+ // prepareTheta -+ BCa = Eba^Ega^Eka^Ema^Esa; -+ BCe = Ebe^Ege^Eke^Eme^Ese; -+ BCi = Ebi^Egi^Eki^Emi^Esi; -+ BCo = Ebo^Ego^Eko^Emo^Eso; -+ BCu = Ebu^Egu^Eku^Emu^Esu; -+ -+ //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) -+ Da = BCu^ROL(BCe, 1); -+ De = BCa^ROL(BCi, 1); -+ Di = BCe^ROL(BCo, 1); -+ Do = BCi^ROL(BCu, 1); -+ Du = BCo^ROL(BCa, 1); -+ -+ Eba ^= Da; -+ BCa = Eba; -+ Ege ^= De; -+ BCe = ROL(Ege, 44); -+ Eki ^= Di; -+ BCi = ROL(Eki, 43); -+ Emo ^= Do; -+ BCo = ROL(Emo, 21); -+ Esu ^= Du; -+ BCu = ROL(Esu, 14); -+ Aba = BCa ^((~BCe)& BCi ); -+ Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; -+ Abe = BCe ^((~BCi)& BCo ); -+ Abi = BCi ^((~BCo)& BCu ); -+ Abo = BCo ^((~BCu)& BCa ); -+ Abu = BCu ^((~BCa)& BCe ); -+ -+ Ebo ^= Do; -+ BCa = ROL(Ebo, 28); -+ Egu ^= Du; -+ BCe = ROL(Egu, 20); -+ Eka ^= Da; -+ BCi = ROL(Eka, 3); -+ Eme ^= De; -+ BCo = ROL(Eme, 45); -+ Esi ^= Di; -+ BCu = ROL(Esi, 61); -+ Aga = BCa ^((~BCe)& BCi ); -+ Age = BCe ^((~BCi)& BCo ); -+ Agi = BCi ^((~BCo)& BCu ); -+ Ago = BCo ^((~BCu)& BCa ); -+ Agu = BCu ^((~BCa)& BCe ); -+ -+ Ebe ^= De; -+ BCa = ROL(Ebe, 1); -+ Egi ^= Di; -+ 
BCe = ROL(Egi, 6); -+ Eko ^= Do; -+ BCi = ROL(Eko, 25); -+ Emu ^= Du; -+ BCo = ROL(Emu, 8); -+ Esa ^= Da; -+ BCu = ROL(Esa, 18); -+ Aka = BCa ^((~BCe)& BCi ); -+ Ake = BCe ^((~BCi)& BCo ); -+ Aki = BCi ^((~BCo)& BCu ); -+ Ako = BCo ^((~BCu)& BCa ); -+ Aku = BCu ^((~BCa)& BCe ); -+ -+ Ebu ^= Du; -+ BCa = ROL(Ebu, 27); -+ Ega ^= Da; -+ BCe = ROL(Ega, 36); -+ Eke ^= De; -+ BCi = ROL(Eke, 10); -+ Emi ^= Di; -+ BCo = ROL(Emi, 15); -+ Eso ^= Do; -+ BCu = ROL(Eso, 56); -+ Ama = BCa ^((~BCe)& BCi ); -+ Ame = BCe ^((~BCi)& BCo ); -+ Ami = BCi ^((~BCo)& BCu ); -+ Amo = BCo ^((~BCu)& BCa ); -+ Amu = BCu ^((~BCa)& BCe ); -+ -+ Ebi ^= Di; -+ BCa = ROL(Ebi, 62); -+ Ego ^= Do; -+ BCe = ROL(Ego, 55); -+ Eku ^= Du; -+ BCi = ROL(Eku, 39); -+ Ema ^= Da; -+ BCo = ROL(Ema, 41); -+ Ese ^= De; -+ BCu = ROL(Ese, 2); -+ Asa = BCa ^((~BCe)& BCi ); -+ Ase = BCe ^((~BCi)& BCo ); -+ Asi = BCi ^((~BCo)& BCu ); -+ Aso = BCo ^((~BCu)& BCa ); -+ Asu = BCu ^((~BCa)& BCe ); -+ } -+ -+ //copyToState(state, A) -+ state[ 0] = Aba; -+ state[ 1] = Abe; -+ state[ 2] = Abi; -+ state[ 3] = Abo; -+ state[ 4] = Abu; -+ state[ 5] = Aga; -+ state[ 6] = Age; -+ state[ 7] = Agi; -+ state[ 8] = Ago; -+ state[ 9] = Agu; -+ state[10] = Aka; -+ state[11] = Ake; -+ state[12] = Aki; -+ state[13] = Ako; -+ state[14] = Aku; -+ state[15] = Ama; -+ state[16] = Ame; -+ state[17] = Ami; -+ state[18] = Amo; -+ state[19] = Amu; -+ state[20] = Asa; -+ state[21] = Ase; -+ state[22] = Asi; -+ state[23] = Aso; -+ state[24] = Asu; -+ -+ #undef round -+} -+ -+#include -+#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) -+ -+ -+static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, unsigned char p) -+{ -+ unsigned long long i; -+ unsigned char t[200]; -+ -+ while (mlen >= r) -+ { -+ for (i = 0; i < r / 8; ++i) -+ s[i] ^= load64(m + 8 * i); -+ -+ KeccakF1600_StatePermute(s); -+ mlen -= r; -+ m += r; -+ } -+ -+ for (i = 0; i < r; ++i) -+ t[i] = 0; -+ for (i = 0; i < mlen; ++i) -+ t[i] = m[i]; -+ t[i] = p; -+ t[r - 1] |= 128; -+ for (i = 0; i < r / 8; ++i) -+ s[i] ^= load64(t + 8 * i); -+} -+ -+ -+static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r) -+{ -+ unsigned int i; -+ -+ while(nblocks > 0) -+ { -+ KeccakF1600_StatePermute(s); -+ for (i = 0; i < (r>>3); i++) -+ { -+ store64(h+8*i, s[i]); -+ } -+ h += r; -+ nblocks--; -+ } -+} -+ -+ -+/********** SHAKE128 ***********/ -+ -+void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) -+{ -+ keccak_absorb(s, SHAKE128_RATE, input, inputByteLen, 0x1F); -+} -+ -+ -+void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) -+{ -+ keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); -+} -+ -+ -+void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) -+{ -+ uint64_t s[25] = {0}; -+ unsigned char t[SHAKE128_RATE]; -+ unsigned long long nblocks = outlen/SHAKE128_RATE; -+ size_t i; -+ -+ /* Absorb input */ -+ keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F); -+ -+ /* Squeeze output */ -+ keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); -+ -+ output += nblocks*SHAKE128_RATE; -+ outlen -= nblocks*SHAKE128_RATE; -+ -+ if (outlen) -+ { -+ keccak_squeezeblocks(t, 1, s, SHAKE128_RATE); -+ for (i = 0; i < outlen; i++) -+ output[i] = t[i]; -+ } -+} -+ -+ -+/********** cSHAKE128 ***********/ -+ -+void cshake128_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned 
long long inlen) -+{ -+ unsigned char *sep = (unsigned char*)s; -+ unsigned int i; -+ -+ for (i = 0; i < 25; i++) -+ s[i] = 0; -+ -+ /* Absorb customization (domain-separation) string */ -+ sep[0] = 0x01; -+ sep[1] = 0xa8; -+ sep[2] = 0x01; -+ sep[3] = 0x00; -+ sep[4] = 0x01; -+ sep[5] = 16; // fixed bitlen of cstm -+ sep[6] = cstm & 0xff; -+ sep[7] = cstm >> 8; -+ -+ KeccakF1600_StatePermute(s); -+ -+ /* Absorb input */ -+ keccak_absorb(s, SHAKE128_RATE, in, inlen, 0x04); -+} -+ -+ -+void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) -+{ -+ keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); -+} -+ -+ -+void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) -+{ -+ uint64_t s[25]; -+ unsigned char t[SHAKE128_RATE]; -+ unsigned int i; -+ -+ cshake128_simple_absorb(s, cstm, in, inlen); -+ -+ /* Squeeze output */ -+ keccak_squeezeblocks(output, outlen/SHAKE128_RATE, s, SHAKE128_RATE); -+ output += (outlen/SHAKE128_RATE)*SHAKE128_RATE; -+ -+ if (outlen%SHAKE128_RATE) -+ { -+ keccak_squeezeblocks(t, 1, s, SHAKE128_RATE); -+ for (i = 0; i < outlen%SHAKE128_RATE; i++) -+ output[i] = t[i]; -+ } -+} -+ -+ -+/********** SHAKE256 ***********/ -+ -+void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) -+{ -+ keccak_absorb(s, SHAKE256_RATE, input, inputByteLen, 0x1F); -+} -+ -+ -+void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) -+{ -+ keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); -+} -+ -+ -+void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) -+{ -+ uint64_t s[25]; -+ unsigned char t[SHAKE256_RATE]; -+ unsigned long long nblocks = outlen/SHAKE256_RATE; -+ size_t i; -+ -+ for (i = 0; i < 25; ++i) -+ s[i] = 0; -+ -+ /* Absorb input */ -+ keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); -+ 
-+ /* Squeeze output */ -+ keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); -+ -+ output += nblocks*SHAKE256_RATE; -+ outlen -= nblocks*SHAKE256_RATE; -+ -+ if (outlen) -+ { -+ keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); -+ for (i = 0; i < outlen; i++) -+ output[i] = t[i]; -+ } -+} -+ -+ -+/********** cSHAKE256 ***********/ -+ -+void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen) -+{ -+ unsigned char *sep = (unsigned char*)s; -+ unsigned int i; -+ -+ for (i = 0; i < 25; i++) -+ s[i] = 0; -+ -+ /* Absorb customization (domain-separation) string */ -+ sep[0] = 0x01; -+ sep[1] = 0x88; -+ sep[2] = 0x01; -+ sep[3] = 0x00; -+ sep[4] = 0x01; -+ sep[5] = 16; // fixed bitlen of cstm -+ sep[6] = cstm & 0xff; -+ sep[7] = cstm >> 8; -+ -+ KeccakF1600_StatePermute(s); -+ -+ /* Absorb input */ -+ keccak_absorb(s, SHAKE256_RATE, in, inlen, 0x04); -+} -+ -+ -+void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) -+{ -+ keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); -+} -+ -+ -+void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) -+{ -+ uint64_t s[25]; -+ unsigned char t[SHAKE256_RATE]; -+ unsigned int i; -+ -+ cshake256_simple_absorb(s, cstm, in, inlen); -+ -+ /* Squeeze output */ -+ keccak_squeezeblocks(output, outlen/SHAKE256_RATE, s, SHAKE256_RATE); -+ output += (outlen/SHAKE256_RATE)*SHAKE256_RATE; -+ -+ if(outlen%SHAKE256_RATE) -+ { -+ keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); -+ for (i = 0; i < outlen%SHAKE256_RATE; i++) -+ output[i] = t[i]; -+ } -+} -\ No newline at end of file -diff --git a/third_party/sidh/src/sha3/fips202.h b/third_party/sidh/src/sha3/fips202.h -new file mode 100644 -index 00000000..55b400ae ---- /dev/null -+++ b/third_party/sidh/src/sha3/fips202.h -@@ -0,0 +1,27 @@ -+#ifndef FIPS202_H -+#define FIPS202_H -+ -+#include -+ -+ -+#define 
SHAKE128_RATE 168 -+#define SHAKE256_RATE 136 -+ -+void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); -+void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); -+void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); -+ -+void cshake128_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); -+void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); -+void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); -+ -+void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); -+void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); -+void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); -+ -+void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); -+void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); -+void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); -+ -+ -+#endif -diff --git a/third_party/sidh/src/sidh.c b/third_party/sidh/src/sidh.c -new file mode 100644 -index 00000000..d8aff37d ---- /dev/null -+++ b/third_party/sidh/src/sidh.c -@@ -0,0 +1,333 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) -+*********************************************************************************************/ -+ -+#include "random/random.h" -+ -+ -+static void 
clear_words(void* mem, digit_t nwords) -+{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. -+ // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. -+ unsigned int i; -+ volatile digit_t *v = mem; -+ -+ for (i = 0; i < nwords; i++) { -+ v[i] = 0; -+ } -+} -+ -+ -+static void init_basis(digit_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) -+{ // Initialization of basis points -+ -+ fpcopy(gen, XP[0]); -+ fpcopy(gen + NWORDS_FIELD, XP[1]); -+ fpcopy(gen + 2*NWORDS_FIELD, XQ[0]); -+ fpzero(XQ[1]); -+ fpcopy(gen + 3*NWORDS_FIELD, XR[0]); -+ fpcopy(gen + 4*NWORDS_FIELD, XR[1]); -+} -+ -+ -+static void fp2_encode(const f2elm_t x, unsigned char *enc) -+{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes -+ unsigned int i; -+ f2elm_t t; -+ -+ from_fp2mont(x, t); -+ for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { -+ enc[i] = ((unsigned char*)t)[i]; -+ enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char*)t)[i + MAXBITS_FIELD / 8]; -+ } -+} -+ -+ -+static void fp2_decode(const unsigned char *enc, f2elm_t x) -+{ // Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation -+ unsigned int i; -+ -+ for (i = 0; i < 2*(MAXBITS_FIELD / 8); i++) ((unsigned char *)x)[i] = 0; -+ for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { -+ ((unsigned char*)x)[i] = enc[i]; -+ ((unsigned char*)x)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; -+ } -+ to_fp2mont(x, x); -+} -+ -+ -+void random_mod_order_A(unsigned char* random_digits) -+{ // Generation of Alice's secret key -+ // Outputs random value in [0, 2^eA - 1] -+ unsigned long long nbytes = NBITS_TO_NBYTES(OALICE_BITS); -+ -+ clear_words((void*)random_digits, MAXWORDS_ORDER); -+ randombytes(random_digits, nbytes); -+ random_digits[nbytes-1] &= MASK_ALICE; // Masking last byte -+} -+ -+ -+void random_mod_order_B(unsigned char* random_digits) -+{ // 
Generation of Bob's secret key -+ // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] -+ unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS-1); -+ -+ clear_words((void*)random_digits, MAXWORDS_ORDER); -+ randombytes(random_digits, nbytes); -+ random_digits[nbytes-1] &= MASK_BOB; // Masking last byte -+} -+ -+ -+int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA) -+{ // Alice's ephemeral public key generation -+ // Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. -+ // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. -+ point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; -+ f2elm_t XPA, XQA, XRA, coeff[3], A24plus = {0}, C24 = {0}, A = {0}; -+ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; -+ -+ // Initialize basis points -+ init_basis((digit_t*)A_gen, XPA, XQA, XRA); -+ init_basis((digit_t*)B_gen, phiP->X, phiQ->X, phiR->X); -+ fpcopy((digit_t*)&Montgomery_one, (phiP->Z)[0]); -+ fpcopy((digit_t*)&Montgomery_one, (phiQ->Z)[0]); -+ fpcopy((digit_t*)&Montgomery_one, (phiR->Z)[0]); -+ -+ // Initialize constants -+ fpcopy((digit_t*)&Montgomery_one, A24plus[0]); -+ fp2add(A24plus, A24plus, C24); -+ -+ // Retrieve kernel point -+ LADDER3PT(XPA, XQA, XRA, (digit_t*)PrivateKeyA, ALICE, R, A); -+ -+ // Traverse tree -+ index = 0; -+ for (row = 1; row < MAX_Alice; row++) { -+ while (index < MAX_Alice-row) { -+ fp2copy(R->X, pts[npts]->X); -+ fp2copy(R->Z, pts[npts]->Z); -+ pts_index[npts++] = index; -+ m = strat_Alice[ii++]; -+ xDBLe(R, R, A24plus, C24, (int)(2*m)); -+ index += m; -+ } -+ get_4_isog(R, A24plus, C24, coeff); -+ -+ for (i = 0; i < npts; i++) { -+ eval_4_isog(pts[i], coeff); -+ } -+ eval_4_isog(phiP, coeff); -+ eval_4_isog(phiQ, coeff); -+ eval_4_isog(phiR, coeff); -+ -+ fp2copy(pts[npts-1]->X, R->X); -+ fp2copy(pts[npts-1]->Z, R->Z); -+ index = pts_index[npts-1]; -+ npts -= 
1; -+ } -+ -+ get_4_isog(R, A24plus, C24, coeff); -+ eval_4_isog(phiP, coeff); -+ eval_4_isog(phiQ, coeff); -+ eval_4_isog(phiR, coeff); -+ -+ inv_3_way(phiP->Z, phiQ->Z, phiR->Z); -+ fp2mul_mont(phiP->X, phiP->Z, phiP->X); -+ fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); -+ fp2mul_mont(phiR->X, phiR->Z, phiR->X); -+ -+ // Format public key -+ fp2_encode(phiP->X, PublicKeyA); -+ fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); -+ fp2_encode(phiR->X, PublicKeyA + 2*FP2_ENCODED_BYTES); -+ -+ return 0; -+} -+ -+ -+int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB) -+{ // Bob's ephemeral public key generation -+ // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. -+ // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. -+ point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; -+ f2elm_t XPB, XQB, XRB, coeff[3], A24plus = {0}, A24minus = {0}, A = {0}; -+ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; -+ -+ // Initialize basis points -+ init_basis((digit_t*)B_gen, XPB, XQB, XRB); -+ init_basis((digit_t*)A_gen, phiP->X, phiQ->X, phiR->X); -+ fpcopy((digit_t*)&Montgomery_one, (phiP->Z)[0]); -+ fpcopy((digit_t*)&Montgomery_one, (phiQ->Z)[0]); -+ fpcopy((digit_t*)&Montgomery_one, (phiR->Z)[0]); -+ -+ // Initialize constants -+ fpcopy((digit_t*)&Montgomery_one, A24plus[0]); -+ fp2add(A24plus, A24plus, A24plus); -+ fp2copy(A24plus, A24minus); -+ fp2neg(A24minus); -+ -+ // Retrieve kernel point -+ LADDER3PT(XPB, XQB, XRB, (digit_t*)PrivateKeyB, BOB, R, A); -+ -+ // Traverse tree -+ index = 0; -+ for (row = 1; row < MAX_Bob; row++) { -+ while (index < MAX_Bob-row) { -+ fp2copy(R->X, pts[npts]->X); -+ fp2copy(R->Z, pts[npts]->Z); -+ pts_index[npts++] = index; -+ m = strat_Bob[ii++]; -+ xTPLe(R, R, A24minus, A24plus, (int)m); -+ index += m; -+ } -+ get_3_isog(R, A24minus, A24plus, coeff); -+ -+ 
for (i = 0; i < npts; i++) { -+ eval_3_isog(pts[i], coeff); -+ } -+ eval_3_isog(phiP, coeff); -+ eval_3_isog(phiQ, coeff); -+ eval_3_isog(phiR, coeff); -+ -+ fp2copy(pts[npts-1]->X, R->X); -+ fp2copy(pts[npts-1]->Z, R->Z); -+ index = pts_index[npts-1]; -+ npts -= 1; -+ } -+ -+ get_3_isog(R, A24minus, A24plus, coeff); -+ eval_3_isog(phiP, coeff); -+ eval_3_isog(phiQ, coeff); -+ eval_3_isog(phiR, coeff); -+ -+ inv_3_way(phiP->Z, phiQ->Z, phiR->Z); -+ fp2mul_mont(phiP->X, phiP->Z, phiP->X); -+ fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); -+ fp2mul_mont(phiR->X, phiR->Z, phiR->X); -+ -+ // Format public key -+ fp2_encode(phiP->X, PublicKeyB); -+ fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); -+ fp2_encode(phiR->X, PublicKeyB + 2*FP2_ENCODED_BYTES); -+ -+ return 0; -+} -+ -+ -+int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA) -+{ // Alice's ephemeral shared secret computation -+ // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB -+ // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1]. -+ // Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. -+ // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes. -+ point_proj_t R, pts[MAX_INT_POINTS_ALICE]; -+ f2elm_t coeff[3], PKB[3], jinv; -+ f2elm_t A24plus = {0}, C24 = {0}, A = {0}; -+ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; -+ -+ // Initialize images of Bob's basis -+ fp2_decode(PublicKeyB, PKB[0]); -+ fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, PKB[1]); -+ fp2_decode(PublicKeyB + 2*FP2_ENCODED_BYTES, PKB[2]); -+ -+ // Initialize constants -+ get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? 
-+ fpadd((digit_t*)&Montgomery_one, (digit_t*)&Montgomery_one, C24[0]); -+ fp2add(A, C24, A24plus); -+ fpadd(C24[0], C24[0], C24[0]); -+ -+ // Retrieve kernel point -+ LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t*)PrivateKeyA, ALICE, R, A); -+ -+ // Traverse tree -+ index = 0; -+ for (row = 1; row < MAX_Alice; row++) { -+ while (index < MAX_Alice-row) { -+ fp2copy(R->X, pts[npts]->X); -+ fp2copy(R->Z, pts[npts]->Z); -+ pts_index[npts++] = index; -+ m = strat_Alice[ii++]; -+ xDBLe(R, R, A24plus, C24, (int)(2*m)); -+ index += m; -+ } -+ get_4_isog(R, A24plus, C24, coeff); -+ -+ for (i = 0; i < npts; i++) { -+ eval_4_isog(pts[i], coeff); -+ } -+ -+ fp2copy(pts[npts-1]->X, R->X); -+ fp2copy(pts[npts-1]->Z, R->Z); -+ index = pts_index[npts-1]; -+ npts -= 1; -+ } -+ -+ get_4_isog(R, A24plus, C24, coeff); -+ fp2div2(C24, C24); -+ fp2sub(A24plus, C24, A24plus); -+ fp2div2(C24, C24); -+ j_inv(A24plus, C24, jinv); -+ fp2_encode(jinv, SharedSecretA); // Format shared secret -+ -+ return 0; -+} -+ -+ -+int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB) -+{ // Bob's ephemeral shared secret computation -+ // It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA -+ // Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1]. -+ // Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. -+ // Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes. 
-+ point_proj_t R, pts[MAX_INT_POINTS_BOB]; -+ f2elm_t coeff[3], PKB[3], jinv; -+ f2elm_t A24plus = {0}, A24minus = {0}, A = {0}; -+ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; -+ -+ // Initialize images of Alice's basis -+ fp2_decode(PublicKeyA, PKB[0]); -+ fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, PKB[1]); -+ fp2_decode(PublicKeyA + 2*FP2_ENCODED_BYTES, PKB[2]); -+ -+ // Initialize constants -+ get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? -+ fpadd((digit_t*)&Montgomery_one, (digit_t*)&Montgomery_one, A24minus[0]); -+ fp2add(A, A24minus, A24plus); -+ fp2sub(A, A24minus, A24minus); -+ -+ // Retrieve kernel point -+ LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t*)PrivateKeyB, BOB, R, A); -+ -+ // Traverse tree -+ index = 0; -+ for (row = 1; row < MAX_Bob; row++) { -+ while (index < MAX_Bob-row) { -+ fp2copy(R->X, pts[npts]->X); -+ fp2copy(R->Z, pts[npts]->Z); -+ pts_index[npts++] = index; -+ m = strat_Bob[ii++]; -+ xTPLe(R, R, A24minus, A24plus, (int)m); -+ index += m; -+ } -+ get_3_isog(R, A24minus, A24plus, coeff); -+ -+ for (i = 0; i < npts; i++) { -+ eval_3_isog(pts[i], coeff); -+ } -+ -+ fp2copy(pts[npts-1]->X, R->X); -+ fp2copy(pts[npts-1]->Z, R->Z); -+ index = pts_index[npts-1]; -+ npts -= 1; -+ } -+ -+ get_3_isog(R, A24minus, A24plus, coeff); -+ fp2add(A24plus, A24minus, A); -+ fp2add(A, A, A); -+ fp2sub(A24plus, A24minus, A24plus); -+ j_inv(A, A24plus, jinv); -+ fp2_encode(jinv, SharedSecretB); // Format shared secret -+ -+ return 0; -+} -\ No newline at end of file -diff --git a/third_party/sidh/src/sike.c b/third_party/sidh/src/sike.c -new file mode 100644 -index 00000000..013b16c3 ---- /dev/null -+++ b/third_party/sidh/src/sike.c -@@ -0,0 +1,98 @@ -+/******************************************************************************************** -+* SIDH: an efficient supersingular isogeny cryptography library -+* -+* Abstract: supersingular isogeny key encapsulation (SIKE) protocol 
-+*********************************************************************************************/ -+ -+#include -+#include "sha3/fips202.h" -+ -+ -+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) -+{ // SIKE's key generation -+ // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) -+ // public key pk (CRYPTO_PUBLICKEYBYTES bytes) -+ -+ // Generate lower portion of secret key sk <- s||SK -+ randombytes(sk, MSG_BYTES); -+ random_mod_order_B(sk + MSG_BYTES); -+ -+ // Generate public key pk -+ EphemeralKeyGeneration_B(sk + MSG_BYTES, pk); -+ -+ // Append public key pk to secret key sk -+ memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES); -+ -+ return 0; -+} -+ -+ -+int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) -+{ // SIKE's encapsulation -+ // Input: public key pk (CRYPTO_PUBLICKEYBYTES bytes) -+ // Outputs: shared secret ss (CRYPTO_BYTES bytes) -+ // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) -+ const uint16_t G = 0; -+ const uint16_t H = 1; -+ const uint16_t P = 2; -+ unsigned char ephemeralsk[SECRETKEY_A_BYTES]; -+ unsigned char jinvariant[FP2_ENCODED_BYTES]; -+ unsigned char h[MSG_BYTES]; -+ unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; -+ unsigned int i; -+ -+ // Generate ephemeralsk <- G(m||pk) mod oA -+ randombytes(temp, MSG_BYTES); -+ memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES); -+ cshake256_simple(ephemeralsk, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); -+ ephemeralsk[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; -+ -+ // Encrypt -+ EphemeralKeyGeneration_A(ephemeralsk, ct); -+ EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant); -+ cshake256_simple(h, MSG_BYTES, P, jinvariant, FP2_ENCODED_BYTES); -+ for (i = 0; i < MSG_BYTES; i++) ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i]; -+ -+ // Generate shared secret ss <- H(m||ct) -+ memcpy(&temp[MSG_BYTES], ct, 
CRYPTO_CIPHERTEXTBYTES); -+ cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); -+ -+ return 0; -+} -+ -+ -+int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) -+{ // SIKE's decapsulation -+ // Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) -+ // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) -+ // Outputs: shared secret ss (CRYPTO_BYTES bytes) -+ const uint16_t G = 0; -+ const uint16_t H = 1; -+ const uint16_t P = 2; -+ unsigned char ephemeralsk_[SECRETKEY_A_BYTES]; -+ unsigned char jinvariant_[FP2_ENCODED_BYTES]; -+ unsigned char h_[MSG_BYTES]; -+ unsigned char c0_[CRYPTO_PUBLICKEYBYTES]; -+ unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; -+ unsigned int i; -+ -+ // Decrypt -+ EphemeralSecretAgreement_B(sk + MSG_BYTES, ct, jinvariant_); -+ cshake256_simple(h_, MSG_BYTES, P, jinvariant_, FP2_ENCODED_BYTES); -+ for (i = 0; i < MSG_BYTES; i++) temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i]; -+ -+ // Generate ephemeralsk_ <- G(m||pk) mod oA -+ memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES); -+ cshake256_simple(ephemeralsk_, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); -+ ephemeralsk_[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; -+ -+ // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) -+ EphemeralKeyGeneration_A(ephemeralsk_, c0_); -+ if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) { -+ memcpy(temp, sk, MSG_BYTES); -+ } -+ memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); -+ cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); -+ -+ return 0; -+} -\ No newline at end of file diff --git a/_dev/boring/sidh_ff433815b51c34496bb6bea13e73e29e5c278238.patch b/_dev/boring/sidh_ff433815b51c34496bb6bea13e73e29e5c278238.patch new file mode 100644 index 0000000..545e8b8 --- /dev/null +++ 
b/_dev/boring/sidh_ff433815b51c34496bb6bea13e73e29e5c278238.patch @@ -0,0 +1,5176 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index bfde5d588..a0f0da3b2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -520,6 +520,7 @@ add_subdirectory(ssl/test) + add_subdirectory(fipstools) + add_subdirectory(tool) + add_subdirectory(decrepit) ++add_subdirectory(third_party/sidh) + + if(FUZZ) + add_subdirectory(fuzz) +diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt +index b1ca70e15..7135f55fc 100644 +--- a/crypto/CMakeLists.txt ++++ b/crypto/CMakeLists.txt +@@ -396,6 +396,7 @@ add_library( + ../third_party/fiat/curve25519.c + + $ ++ $ + + ${CRYPTO_ARCH_SOURCES} + ${CRYPTO_FIPS_OBJECTS} +diff --git a/include/openssl/nid.h b/include/openssl/nid.h +index afeb2dea4..effe19205 100644 +--- a/include/openssl/nid.h ++++ b/include/openssl/nid.h +@@ -4234,6 +4234,12 @@ extern "C" { + #define LN_auth_any "auth-any" + #define NID_auth_any 958 + ++#define SN_X25519_SIDHp503 "X25519-SIDHp503" ++/* This ID is only needed by kNamedGroups (ssl_key_share.c). It isn't ++ used in kObjects array (obj_dat.h). It can't be smaller than ++ NUM_NID (obj_dat.h) ++*/ ++#define NID_X25519_SIDHp503 960 + + #if defined(__cplusplus) + } /* extern C */ +diff --git a/include/openssl/ssl.h b/include/openssl/ssl.h +index 17c559259..9cbf536e1 100644 +--- a/include/openssl/ssl.h ++++ b/include/openssl/ssl.h +@@ -2177,6 +2177,7 @@ OPENSSL_EXPORT int SSL_set1_curves_list(SSL *ssl, const char *curves); + #define SSL_CURVE_SECP384R1 24 + #define SSL_CURVE_SECP521R1 25 + #define SSL_CURVE_X25519 29 ++#define SSL_CURVE_X25519_SIDHp503 0xFE30 + + // SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently + // completed handshake or 0 if not applicable. 
+diff --git a/ssl/CMakeLists.txt b/ssl/CMakeLists.txt +index d6c1294f1..8d7cfa14f 100644 +--- a/ssl/CMakeLists.txt ++++ b/ssl/CMakeLists.txt +@@ -1,4 +1,4 @@ +-include_directories(../include) ++include_directories(../include ../third_party/sidh/include) + + add_library( + ssl +diff --git a/ssl/handshake_client.cc b/ssl/handshake_client.cc +index c1d54bd8f..d141cc399 100644 +--- a/ssl/handshake_client.cc ++++ b/ssl/handshake_client.cc +@@ -1011,6 +1011,7 @@ static enum ssl_hs_wait_t do_read_server_key_exchange(SSL_HANDSHAKE *hs) { + !hs->peer_key.CopyFrom(point)) { + return ssl_hs_error; + } ++ hs->key_share->SetInitiator(true); + } else if (!(alg_k & SSL_kPSK)) { + OPENSSL_PUT_ERROR(SSL, SSL_R_UNEXPECTED_MESSAGE); + ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_UNEXPECTED_MESSAGE); +diff --git a/ssl/handshake_server.cc b/ssl/handshake_server.cc +index c4f3b75e5..1c93fd16d 100644 +--- a/ssl/handshake_server.cc ++++ b/ssl/handshake_server.cc +@@ -932,7 +932,10 @@ static enum ssl_hs_wait_t do_send_server_certificate(SSL_HANDSHAKE *hs) { + hs->new_session->group_id = group_id; + + // Set up ECDH, generate a key, and emit the public half. +- hs->key_share = SSLKeyShare::Create(group_id); ++ if ((hs->key_share = SSLKeyShare::Create(group_id)) == nullptr) { ++ return ssl_hs_error; ++ } ++ hs->key_share->SetInitiator(false); + if (!hs->key_share || + !CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) || + !CBB_add_u16(cbb.get(), group_id) || +diff --git a/ssl/internal.h b/ssl/internal.h +index f8a2ea70a..ddd6bb397 100644 +--- a/ssl/internal.h ++++ b/ssl/internal.h +@@ -998,12 +998,20 @@ class SSLKeyShare { + // Deserialize initializes the state of the key exchange from |in|, returning + // true if successful and false otherwise. It is called by |Create|. + virtual bool Deserialize(CBS *in) { return false; } ++ ++ // Sets flag indicating role of the key share owner. True for initiator of the ++ // handshake, false for responder. 
++ void SetInitiator(bool flag) { is_initiator_ = flag; } ++ ++ protected: ++ bool is_initiator_ = false; + }; + + struct NamedGroup { + int nid; + uint16_t group_id; +- const char name[8], alias[11]; ++ const char name[16], alias[15]; ++ uint16_t min_protocol_ver; + }; + + // NamedGroups returns all supported groups. +@@ -1019,6 +1027,10 @@ bool ssl_nid_to_group_id(uint16_t *out_group_id, int nid); + // true. Otherwise, it returns false. + bool ssl_name_to_group_id(uint16_t *out_group_id, const char *name, size_t len); + ++// ssl_get_protocol_ver_for_group looks up the minimal version of a TLS that ++// supports |group_id|. In case |group_id| is not a valid ID of a group function ++// returns 0. ++uint16_t ssl_get_protocol_ver_for_group(uint16_t group_id); + + // Handshake messages. + +diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc +index 55c74633c..f36e33bb5 100644 +--- a/ssl/ssl_key_share.cc ++++ b/ssl/ssl_key_share.cc +@@ -30,6 +30,8 @@ + #include "internal.h" + #include "../crypto/internal.h" + ++#include "sidh/def_p503.h" ++#include "sidh/P503_api.h" + + BSSL_NAMESPACE_BEGIN + +@@ -207,12 +209,88 @@ class X25519KeyShare : public SSLKeyShare { + uint8_t private_key_[32]; + }; + ++class SIDHp503X25519KeyShare : public SSLKeyShare { ++public: ++ ~SIDHp503X25519KeyShare() override { ++ OPENSSL_cleanse(private_x25519_, sizeof(private_x25519_)); ++ OPENSSL_cleanse(private_SIDH_, sizeof(private_SIDH_)); ++ } ++ ++ uint16_t GroupID() const override { ++ return SSL_CURVE_X25519_SIDHp503; ++ } ++ ++ bool Offer(CBB *out) override { ++ uint8_t public_x25519[32] = {0}; ++ uint8_t public_SIDH[SIDHp503_PUB_BYTESZ] = {0}; ++ ++ X25519_keypair(public_x25519, private_x25519_); ++ if (EphemeralKeyPair_SIDHp503(private_SIDH_, public_SIDH, is_initiator_)) { ++ return false; ++ } ++ ++ return ++ CBB_add_bytes(out, public_x25519, sizeof(public_x25519)) && ++ CBB_add_bytes(out, public_SIDH, sizeof(public_SIDH)); ++ } ++ ++ bool Finish(Array *out_secret, uint8_t *out_alert, 
++ Span peer_key) override { ++ *out_alert = SSL_AD_INTERNAL_ERROR; ++ ++ Array secret; ++ if (!secret.Init(sizeof(private_x25519_) + SIDHp503_SS_BYTESZ)) { ++ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); ++ return false; ++ } ++ ++ if (peer_key.size() != (32 + SIDHp503_PUB_BYTESZ) || ++ !X25519(secret.data(), private_x25519_, peer_key.data())) { ++ *out_alert = SSL_AD_DECODE_ERROR; ++ OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT); ++ return false; ++ } ++ ++ if (is_initiator_) { ++ // Never fails ++ (void)EphemeralSecretAgreement_A_SIDHp503(private_SIDH_, peer_key.data() + 32, secret.data() + sizeof(private_x25519_)); ++ } else { ++ (void)EphemeralSecretAgreement_B_SIDHp503(private_SIDH_, peer_key.data() + 32, secret.data() + sizeof(private_x25519_)); ++ } ++ ++ *out_secret = std::move(secret); ++ return true; ++ } ++ ++ bool Serialize(CBB *out) override { ++ return (CBB_add_asn1_uint64(out, GroupID()) && ++ CBB_add_asn1_octet_string(out, private_x25519_, sizeof(private_x25519_)) && ++ CBB_add_asn1_octet_string(out, private_SIDH_, sizeof(private_SIDH_))); ++ } ++ ++ bool Deserialize(CBS *in) override { ++ // Mirrors Serialize(): two octet strings, X25519 key first, then SIDH key. ++ CBS key_x25519, key_SIDH; ++ return CBS_get_asn1(in, &key_x25519, CBS_ASN1_OCTETSTRING) && ++ CBS_get_asn1(in, &key_SIDH, CBS_ASN1_OCTETSTRING) && ++ CBS_len(&key_x25519) == sizeof(private_x25519_) && ++ CBS_len(&key_SIDH) == sizeof(private_SIDH_) && ++ CBS_copy_bytes(&key_x25519, private_x25519_, sizeof(private_x25519_)) && ++ CBS_copy_bytes(&key_SIDH, private_SIDH_, sizeof(private_SIDH_)); ++ } ++ ++private: ++ uint8_t private_x25519_[32]; ++ uint8_t private_SIDH_[SIDHp503_PRV_KEY_BYTESZ_MAX]; ++}; ++ + CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = { +- {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"}, +- {NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"}, +- {NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"}, +- {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"}, +- {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"}, ++ {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1", TLS1_VERSION}, ++
{NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1", TLS1_VERSION}, ++ {NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1", TLS1_VERSION}, ++ {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1", TLS1_VERSION}, ++ {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519", TLS1_VERSION}, ++ {NID_X25519_SIDHp503, SSL_CURVE_X25519_SIDHp503, "X25519-SIDHp503", "x25519sidhp503", TLS1_3_VERSION}, + }; + + } // namespace +@@ -237,6 +315,8 @@ UniquePtr SSLKeyShare::Create(uint16_t group_id) { + New(NID_secp521r1, SSL_CURVE_SECP521R1)); + case SSL_CURVE_X25519: + return UniquePtr(New()); ++ case SSL_CURVE_X25519_SIDHp503: ++ return UniquePtr(New()); + default: + return nullptr; + } +@@ -288,6 +368,15 @@ bool ssl_name_to_group_id(uint16_t *out_group_id, const char *name, size_t len) + return false; + } + ++uint16_t ssl_get_protocol_ver_for_group(uint16_t group_id) { ++ for (const auto &group : kNamedGroups) { ++ if (group.group_id == group_id) { ++ return group.min_protocol_ver; ++ } ++ } ++ return 0; ++} ++ + BSSL_NAMESPACE_END + + using namespace bssl; +diff --git a/ssl/t1_lib.cc b/ssl/t1_lib.cc +index 678e4a3b7..6bfd21c82 100644 +--- a/ssl/t1_lib.cc ++++ b/ssl/t1_lib.cc +@@ -324,7 +324,8 @@ bool tls1_get_shared_group(SSL_HANDSHAKE *hs, uint16_t *out_group_id) { + + for (uint16_t pref_group : pref) { + for (uint16_t supp_group : supp) { +- if (pref_group == supp_group) { ++ if ((pref_group == supp_group) && ++ (ssl_get_protocol_ver_for_group(pref_group)<=ssl_protocol_version(ssl))) { + *out_group_id = pref_group; + return true; + } +@@ -2177,7 +2178,10 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { + group_id = groups[0]; + } + +- hs->key_share = SSLKeyShare::Create(group_id); ++ if ((hs->key_share = SSLKeyShare::Create(group_id)) == nullptr) { ++ return false; ++ } ++ hs->key_share->SetInitiator(true); + CBB key_exchange; + if (!hs->key_share || + !CBB_add_u16(&kse_bytes, group_id) || +diff --git 
a/third_party/sidh/CMakeLists.txt b/third_party/sidh/CMakeLists.txt +new file mode 100644 +index 000000000..6befbfe47 +--- /dev/null ++++ b/third_party/sidh/CMakeLists.txt +@@ -0,0 +1,54 @@ ++cmake_minimum_required(VERSION 2.8.11) ++include_directories(../../include) ++ ++set(ASM_EXT S) ++enable_language(ASM) ++add_definitions(-pedantic) ++ ++# Compile to object files, we will link them with libssl ++add_library( ++ sidh ++ ++ OBJECT ++ ++ src/ec_isogeny.c ++ src/fpx.c ++ src/P503.c ++ src/sidh.c ++) ++ ++# Architecture specific settings ++if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64") ++ add_definitions(-lrt) ++endif() ++ ++# Platform specific sources ++if(OPENSSL_NO_ASM) ++ target_sources( ++ sidh ++ PRIVATE ++ src/generic/fp_generic.c ++ ) ++elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") ++ target_sources( ++ sidh ++ ++ PRIVATE ++ ++ src/AMD64/fp_x64.c ++ src/AMD64/fp_x64_asm.${ASM_EXT} ++ ) ++elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64") ++ target_sources( ++ sidh ++ ++ PRIVATE ++ ++ src/ARM64/fp_arm64.c ++ src/ARM64/fp_arm64_asm.${ASM_EXT} ++ ) ++endif() ++ ++target_include_directories(sidh PUBLIC ++ include ++) +diff --git a/third_party/sidh/LICENSE b/third_party/sidh/LICENSE +new file mode 100644 +index 000000000..d76bde897 +--- /dev/null ++++ b/third_party/sidh/LICENSE +@@ -0,0 +1,56 @@ ++MIT License ++ ++Copyright (c) Microsoft Corporation. All rights reserved. 
++ ++Permission is hereby granted, free of charge, to any person obtaining a copy ++of this software and associated documentation files (the "Software"), to deal ++in the Software without restriction, including without limitation the rights ++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++copies of the Software, and to permit persons to whom the Software is ++furnished to do so, subject to the following conditions: ++ ++The above copyright notice and this permission notice shall be included in all ++copies or substantial portions of the Software. ++ ++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++SOFTWARE ++ ++======================================================================== ++ ++Performance improvements and aligning the code to nicely fit to ++BoringSSL was done by Cloudflare and is licensed under BSD3 licence. ++ ++======================================================================== ++ ++Copyright (c) 2018 Cloudflare. All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++ ++ * Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above ++copyright notice, this list of conditions and the following disclaimer ++in the documentation and/or other materials provided with the ++distribution. 
++ * Neither the name of Cloudflare nor the names of its ++contributors may be used to endorse or promote products derived from ++this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +diff --git a/third_party/sidh/include/sidh/P503_api.h b/third_party/sidh/include/sidh/P503_api.h +new file mode 100644 +index 000000000..6b25196a5 +--- /dev/null ++++ b/third_party/sidh/include/sidh/P503_api.h +@@ -0,0 +1,65 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: API header file for P503 ++*********************************************************************************************/ ++ ++#ifndef P503_API_H__ ++#define P503_API_H__ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/*********************** Ephemeral key exchange API ***********************/ ++ ++// Encoding of keys for KEX-based isogeny system "SIDHp503" (wire format): ++// ---------------------------------------------------------------------- ++// Elements over GF(p503) are encoded in 63 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). 
++// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are encoded as {a, b}, with a in the lowest memory portion. ++// ++// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^250-1] and [0, 2^252-1], resp. In the SIDH API, private keys are encoded ++// in 32 octets in little endian format. ++// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p503^2). In the SIDH API, they are encoded in 378 octets. ++// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p503^2). In the SIDH API, they are encoded in 126 octets. ++ ++// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. ++// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. ++// Extended version available at: http://eprint.iacr.org/2016/859 ++ ++// Alice's ephemeral public key generation ++// Input: a private key PrivateKeyA in the range [0, 2^250 - 1], stored in 32 bytes. ++// Output: the public key PublicKeyA consisting of 3 GF(p503^2) elements encoded in 378 bytes. ++int EphemeralKeyGeneration_A_SIDHp503(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); ++ ++// Bob's ephemeral public key generation ++// It takes the private key PrivateKeyB as input and computes the public key PublicKeyB. ++// The private key is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. ++// The public key consists of 3 GF(p503^2) elements encoded in 378 bytes. ++int EphemeralKeyGeneration_B_SIDHp503(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); ++ ++// Alice's ephemeral shared secret computation ++// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB ++// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^250 - 1], stored in 32 bytes.
++// Bob's PublicKeyB consists of 3 GF(p503^2) elements encoded in 378 bytes. ++// Output: a shared secret SharedSecretA that consists of one element in GF(p503^2) encoded in 126 bytes. ++int EphemeralSecretAgreement_A_SIDHp503(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); ++ ++// Bob's ephemeral shared secret computation ++// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA ++// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. ++// Alice's PublicKeyA consists of 3 GF(p503^2) elements encoded in 378 bytes. ++// Output: a shared secret SharedSecretB that consists of one element in GF(p503^2) encoded in 126 bytes. ++int EphemeralSecretAgreement_B_SIDHp503(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); ++ ++// Generates SIDH/P503 key pair. Internally uses BN_rand() as an entropy source. ++// Input: IsInitiator: 1 for generating public key type A, 0 for type B.
++// Output: the private key (PrivateKey) stored in 32 bytes, public key (PublicKey) stored in 378 bytes ++// Returns: 0 on success, -1 in case of failure ++int EphemeralKeyPair_SIDHp503(unsigned char* PrivateKey, unsigned char* PublicKey, int IsInitiator); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/third_party/sidh/include/sidh/def_p503.h b/third_party/sidh/include/sidh/def_p503.h +new file mode 100644 +index 000000000..1f39f5988 +--- /dev/null ++++ b/third_party/sidh/include/sidh/def_p503.h +@@ -0,0 +1,70 @@ ++#ifndef DEF_P503_H_ ++#define DEF_P503_H_ ++ ++#include ++#include "openssl/base.h" ++#include "../crypto/internal.h" ++#include "sidh/P503_api.h" ++ ++// Basic constants ++#define SIDHp503_PRV_A_BITSZ 250 // Bit size of SIDH private key (type A) ++#define SIDHp503_PRV_B_BITSZ 252 // Bit size of SIDH private key (type B) ++#define SIDHp503_PUB_BYTESZ 378 // Byte size of SIDH public key ++#define SIDHp503_SS_BYTESZ 126 // Shared secret byte size ++#define SIDHp503_PRV_KEY_BYTESZ_MAX (((SIDHp503_PRV_A_BITSZ>SIDHp503_PRV_B_BITSZ?SIDHp503_PRV_A_BITSZ:SIDHp503_PRV_B_BITSZ)+7)/8) ++#define NBITS_FIELD 503 ++#define NBITS_ORDER 256 ++#define NWORDS_ORDER ((NBITS_ORDER+RADIX-1)/RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
++#define NWORDS64_FIELD ((NBITS_FIELD+63)/64) // Number of 64-bit words of a 503-bit field element ++#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) // Number of 64-bit words of a 256-bit element ++#define MAX_Alice 125 ++#define MAX_Bob 159 ++#define RADIX64 64 ++ ++#if defined(OPENSSL_64_BIT) ++ #define NWORDS_FIELD 8 // Number of words of a 503-bit field element ++ #define p503_ZERO_WORDS 3 // Number of "0" digits in the least significant part of p503 + 1 ++ #define RADIX 64 ++ #define LOG2RADIX 6 ++ typedef uint64_t digit_t; // Unsigned 64-bit digit ++#else ++ #define NWORDS_FIELD 16 ++ #define p503_ZERO_WORDS 7 ++ #define RADIX 32 ++ #define LOG2RADIX 5 ++ typedef uint32_t digit_t; // Unsigned 32-bit digit ++#endif ++ ++// Extended datatype support ++#if !defined(BORINGSSL_HAS_UINT128) ++ typedef uint64_t uint128_t[2]; ++#endif ++ ++struct params_t { ++ const uint64_t prime[NWORDS64_FIELD]; ++ const uint64_t primeP1[NWORDS64_FIELD]; ++ const uint64_t primeX2[NWORDS64_FIELD]; ++ // Order of Alice's subgroup ++ const uint64_t Alice_order[NWORDS64_ORDER]; ++ // Order of Bob's subgroup ++ const uint64_t Bob_order[NWORDS64_ORDER]; ++ // Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p503^2), expressed in Montgomery representation ++ const uint64_t A_gen[5*NWORDS64_FIELD]; ++ // Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p503^2), expressed in Montgomery representation ++ const uint64_t B_gen[5*NWORDS64_FIELD]; ++ // Montgomery constant Montgomery_R2 = (2^512)^2 mod p503 ++ const uint64_t Montgomery_R2[NWORDS64_FIELD]; ++ // Value one in Montgomery representation ++ const uint64_t Montgomery_one[NWORDS64_FIELD]; ++ // Value (2^256)^2 mod 3^159 ++ const uint64_t Montgomery_Rprime[NWORDS64_ORDER]; ++ // Value -(3^159)^-1 mod 2^256 ++ const uint64_t Montgomery_rprime[NWORDS64_ORDER]; ++ // Value order_Bob/3 mod p503 ++ const uint64_t Border_div3[NWORDS_ORDER]; ++ // Fixed parameters for isogeny tree computation ++ const unsigned 
int strat_Alice[MAX_Alice-1]; ++ const unsigned int strat_Bob[MAX_Bob-1]; ++}; ++ ++#endif // DEF_P503_H_ +diff --git a/third_party/sidh/src/AMD64/fp_x64.c b/third_party/sidh/src/AMD64/fp_x64.c +new file mode 100644 +index 000000000..62bb9692e +--- /dev/null ++++ b/third_party/sidh/src/AMD64/fp_x64.c +@@ -0,0 +1,88 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: modular arithmetic optimized for x64 platforms for P503 ++*********************************************************************************************/ ++ ++#include "../internal.h" ++#include "../P503_internal.h" ++ ++// Global constants ++extern const struct params_t kP503Params; ++ ++inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Modular addition, c = a+b mod p503. ++ // Inputs: a, b in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ ++ fpadd503_asm(a, b, c); ++} ++ ++ ++inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Modular subtraction, c = a-b mod p503. ++ // Inputs: a, b in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ fpsub503_asm(a, b, c); ++} ++ ++ ++inline void fpneg503(digit_t* a) ++{ // Modular negation, a = -a mod p503. ++ // Input/output: a in [0, 2*p503-1] ++ unsigned int i, borrow = 0; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, ((digit_t*)kP503Params.primeX2)[i], a[i], borrow, a[i]); ++ } ++} ++ ++ ++void fpdiv2_503(const digit_t* a, digit_t* c) ++{ // Modular division by two, c = a/2 mod p503. 
++ // Input : a in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ unsigned int i, carry = 0; ++ digit_t mask; ++ ++ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(carry, a[i], ((digit_t*)kP503Params.prime)[i] & mask, carry, c[i]); ++ } ++ ++ mp_shiftr1(c, NWORDS_FIELD); ++} ++ ++ ++void fpcorrection503(digit_t* a) ++{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. ++ unsigned int i, borrow = 0; ++ digit_t mask; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, a[i], ((digit_t*)kP503Params.prime)[i], borrow, a[i]); ++ } ++ mask = 0 - (digit_t)borrow; ++ ++ borrow = 0; ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(borrow, a[i], ((digit_t*)kP503Params.prime)[i] & mask, borrow, a[i]); ++ } ++} ++ ++ ++void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) ++{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. ++ ++ UNREFERENCED_PARAMETER(nwords); ++ mul503_asm(a, b, c); ++} ++ ++ ++void rdc_mont(const digit_t* ma, digit_t* mc) ++{ // Montgomery reduction exploiting special form of the prime. ++ // mc = ma*R^-1 mod primeX2, where R = 2^512. ++ // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. ++ // ma is assumed to be in Montgomery representation. 
++ rdc503_asm(ma, mc); ++} +diff --git a/third_party/sidh/src/AMD64/fp_x64_asm.S b/third_party/sidh/src/AMD64/fp_x64_asm.S +new file mode 100644 +index 000000000..4e8f3147a +--- /dev/null ++++ b/third_party/sidh/src/AMD64/fp_x64_asm.S +@@ -0,0 +1,1640 @@ ++//******************************************************************************************* ++// SIDH: an efficient supersingular isogeny cryptography library ++// ++// Abstract: field arithmetic in x64 assembly for P503 on Linux ++//******************************************************************************************* ++ ++.intel_syntax noprefix ++ ++// Registers that are used for parameter passing: ++#define reg_p1 rdi ++#define reg_p2 rsi ++#define reg_p3 rdx ++ ++// p503 + 1 ++#define p503p1_3 0xAC00000000000000 ++#define p503p1_4 0x13085BDA2211E7A0 ++#define p503p1_5 0x1B9BF6C87B7E7DAF ++#define p503p1_6 0x6045C6BDDA77A4D0 ++#define p503p1_7 0x004066F541811E1E ++// p503 x 2 ++#define p503x2_0 0xFFFFFFFFFFFFFFFE ++#define p503x2_1 0xFFFFFFFFFFFFFFFF ++#define p503x2_3 0x57FFFFFFFFFFFFFF ++#define p503x2_4 0x2610B7B44423CF41 ++#define p503x2_5 0x3737ED90F6FCFB5E ++#define p503x2_6 0xC08B8D7BB4EF49A0 ++#define p503x2_7 0x0080CDEA83023C3C ++ ++.text ++ ++.extern OPENSSL_ia32cap_P ++.hidden OPENSSL_ia32cap_P ++ ++p503p1_nz: ++.quad 0xAC00000000000000 ++.quad 0x13085BDA2211E7A0 ++.quad 0x1B9BF6C87B7E7DAF ++.quad 0x6045C6BDDA77A4D0 ++.quad 0x004066F541811E1E ++ ++.macro CSWAP16 IDX, M1, M2 ++ movdqu xmm0, [\M1+\IDX*16] ++ movdqu xmm1, [\M2+\IDX*16] ++ movdqa xmm2, xmm1 ++ pxor xmm2, xmm0 ++ pand xmm2, xmm15 ++ pxor xmm0, xmm2 ++ pxor xmm1, xmm2 ++ movdqu [\M1+\IDX*16], xmm0 ++ movdqu [\M2+\IDX*16], xmm1 ++.endm ++ ++.global cswap503_asm ++cswap503_asm: ++ // Fill xmm15. After this step first half of XMM15 is ++ // just zeros and second half is whatever in RDX ++ movq xmm15, rdx ++ ++ // Copy lower double word everywhere else. So that ++ // XMM15=RDX|RDX. 
As RDX has either all bits set ++ // or non result will be that XMM15 has also either ++ // all bits set or non of them. 68 = 01000100b ++ pshufd xmm15, xmm15, 68 ++ ++ // P[0].X with Q[0].X ++ CSWAP16 0, rdi, rsi ++ CSWAP16 1, rdi, rsi ++ CSWAP16 2, rdi, rsi ++ CSWAP16 3, rdi, rsi ++ ++ // P[0].Z with Q[0].Z ++ CSWAP16 4, rdi, rsi ++ CSWAP16 5, rdi, rsi ++ CSWAP16 6, rdi, rsi ++ CSWAP16 7, rdi, rsi ++ ++ // P[1].X with Q[1].X ++ CSWAP16 8, rdi, rsi ++ CSWAP16 9, rdi, rsi ++ CSWAP16 10, rdi, rsi ++ CSWAP16 11, rdi, rsi ++ ++ // P[1].Z with Q[1].Z ++ CSWAP16 12, rdi, rsi ++ CSWAP16 13, rdi, rsi ++ CSWAP16 14, rdi, rsi ++ CSWAP16 15, rdi, rsi ++ ++ ret ++ ++//*********************************************************************** ++// Field addition ++// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] ++//*********************************************************************** ++.global fpadd503_asm ++fpadd503_asm: ++ push r12 ++ push r13 ++ push r14 ++ push r15 ++ ++ xor rax, rax ++ mov r8, [reg_p1] ++ mov r9, [reg_p1+8] ++ mov r10, [reg_p1+16] ++ mov r11, [reg_p1+24] ++ mov r12, [reg_p1+32] ++ mov r13, [reg_p1+40] ++ mov r14, [reg_p1+48] ++ mov r15, [reg_p1+56] ++ add r8, [reg_p2] ++ adc r9, [reg_p2+8] ++ adc r10, [reg_p2+16] ++ adc r11, [reg_p2+24] ++ adc r12, [reg_p2+32] ++ adc r13, [reg_p2+40] ++ adc r14, [reg_p2+48] ++ adc r15, [reg_p2+56] ++ ++ mov rcx, p503x2_0 ++ sub r8, rcx ++ mov rcx, p503x2_1 ++ sbb r9, rcx ++ sbb r10, rcx ++ mov rcx, p503x2_3 ++ sbb r11, rcx ++ mov rcx, p503x2_4 ++ sbb r12, rcx ++ mov rcx, p503x2_5 ++ sbb r13, rcx ++ mov rcx, p503x2_6 ++ sbb r14, rcx ++ mov rcx, p503x2_7 ++ sbb r15, rcx ++ sbb rax, 0 ++ ++ mov rdi, p503x2_0 ++ and rdi, rax ++ mov rsi, p503x2_1 ++ and rsi, rax ++ mov rcx, p503x2_3 ++ and rcx, rax ++ ++ add r8, rdi ++ adc r9, rsi ++ adc r10, rsi ++ adc r11, rcx ++ mov [reg_p3], r8 ++ mov [reg_p3+8], r9 ++ mov [reg_p3+16], r10 ++ mov [reg_p3+24], r11 ++ setc cl ++ ++ mov r8, p503x2_4 ++ and r8, rax ++ mov r9, p503x2_5 ++ and 
r9, rax ++ mov r10, p503x2_6 ++ and r10, rax ++ mov r11, p503x2_7 ++ and r11, rax ++ ++ bt rcx, 0 ++ adc r12, r8 ++ adc r13, r9 ++ adc r14, r10 ++ adc r15, r11 ++ mov [reg_p3+32], r12 ++ mov [reg_p3+40], r13 ++ mov [reg_p3+48], r14 ++ mov [reg_p3+56], r15 ++ ++ pop r15 ++ pop r14 ++ pop r13 ++ pop r12 ++ ret ++ ++ ++//*********************************************************************** ++// Field subtraction ++// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] ++//*********************************************************************** ++.global fpsub503_asm ++fpsub503_asm: ++ push r12 ++ push r13 ++ push r14 ++ push r15 ++ ++ xor rax, rax ++ mov r8, [reg_p1] ++ mov r9, [reg_p1+8] ++ mov r10, [reg_p1+16] ++ mov r11, [reg_p1+24] ++ mov r12, [reg_p1+32] ++ mov r13, [reg_p1+40] ++ mov r14, [reg_p1+48] ++ mov r15, [reg_p1+56] ++ sub r8, [reg_p2] ++ sbb r9, [reg_p2+8] ++ sbb r10, [reg_p2+16] ++ sbb r11, [reg_p2+24] ++ sbb r12, [reg_p2+32] ++ sbb r13, [reg_p2+40] ++ sbb r14, [reg_p2+48] ++ sbb r15, [reg_p2+56] ++ sbb rax, 0 ++ ++ mov rdi, p503x2_0 ++ and rdi, rax ++ mov rsi, p503x2_1 ++ and rsi, rax ++ mov rcx, p503x2_3 ++ and rcx, rax ++ ++ add r8, rdi ++ adc r9, rsi ++ adc r10, rsi ++ adc r11, rcx ++ mov [reg_p3], r8 ++ mov [reg_p3+8], r9 ++ mov [reg_p3+16], r10 ++ mov [reg_p3+24], r11 ++ setc cl ++ ++ mov r8, p503x2_4 ++ and r8, rax ++ mov r9, p503x2_5 ++ and r9, rax ++ mov r10, p503x2_6 ++ and r10, rax ++ mov r11, p503x2_7 ++ and r11, rax ++ ++ bt rcx, 0 ++ adc r12, r8 ++ adc r13, r9 ++ adc r14, r10 ++ adc r15, r11 ++ mov [reg_p3+32], r12 ++ mov [reg_p3+40], r13 ++ mov [reg_p3+48], r14 ++ mov [reg_p3+56], r15 ++ ++ pop r15 ++ pop r14 ++ pop r13 ++ pop r12 ++ ret ++ ++ ++///////////////////////////////////////////////////////////////// MACRO ++// Schoolbook integer multiplication, a full row at a time ++// Inputs: memory pointers M0 and M1 ++// Outputs: memory pointer C ++// Temps: regs T0:T9 ++///////////////////////////////////////////////////////////////// ++ 
++.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 ++ mov rdx, \M0 ++ mulx \T0, \T1, \M1 // T0:T1 = A0*B0 ++ mov \C, \T1 // C0_final ++ mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 ++ xor rax, rax ++ adox \T0, \T2 ++ mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 ++ adox \T1, \T3 ++ mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 ++ adox \T2, \T4 ++ ++ mov rdx, 8\M0 ++ mulx \T5, \T4, \M1 // T5:T4 = A1*B0 ++ adox \T3, rax ++ xor rax, rax ++ mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 ++ adox \T4, \T0 ++ mov 8\C, \T4 // C1_final ++ adcx \T5, \T7 ++ mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 ++ adcx \T6, \T8 ++ adox \T5, \T1 ++ mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 ++ adcx \T7, \T9 ++ adcx \T8, rax ++ adox \T6, \T2 ++ ++ mov rdx, 16\M0 ++ mulx \T1, \T0, \M1 // T1:T0 = A2*B0 ++ adox \T7, \T3 ++ adox \T8, rax ++ xor rax, rax ++ mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 ++ adox \T0, \T5 ++ mov 16\C, \T0 // C2_final ++ adcx \T1, \T3 ++ mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 ++ adcx \T2, \T4 ++ adox \T1, \T6 ++ mulx \T4,\T9, 24\M1 // T4:T9 = A2*B3 ++ adcx \T3, \T9 ++ mov rdx, 24\M0 ++ adcx \T4, rax ++ ++ adox \T2, \T7 ++ adox \T3, \T8 ++ adox \T4, rax ++ ++ mulx \T5, \T0, \M1 // T5:T0 = A3*B0 ++ xor rax, rax ++ mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 ++ adcx \T5, \T7 ++ adox \T1, \T0 ++ mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 ++ adcx \T6, \T8 ++ adox \T2, \T5 ++ mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 ++ adcx \T7, \T9 ++ adcx \T8, rax ++ ++ adox \T3, \T6 ++ adox \T4, \T7 ++ adox \T8, rax ++ mov 24\C, \T1 // C3_final ++ mov 32\C, \T2 // C4_final ++ mov 40\C, \T3 // C5_final ++ mov 48\C, \T4 // C6_final ++ mov 56\C, \T8 // C7_final ++.endm ++ ++//***************************************************************************** ++// 503-bit multiplication using Karatsuba (one level), schoolbook (one level) ++//***************************************************************************** ++mul503_mulx_asm: ++ push r12 ++ push r13 ++ push r14 ++ push r15 ++ mov rcx, reg_p3 ++ ++ // r8-r11 <- AH + AL, rax <-
mask ++ xor rax, rax ++ mov r8, [reg_p1] ++ mov r9, [reg_p1+8] ++ mov r10, [reg_p1+16] ++ mov r11, [reg_p1+24] ++ push rbx ++ push rbp ++ sub rsp, 96 ++ add r8, [reg_p1+32] ++ adc r9, [reg_p1+40] ++ adc r10, [reg_p1+48] ++ adc r11, [reg_p1+56] ++ sbb rax, 0 ++ mov [rsp], r8 ++ mov [rsp+8], r9 ++ mov [rsp+16], r10 ++ mov [rsp+24], r11 ++ ++ // r12-r15 <- BH + BL, rbx <- mask ++ xor rbx, rbx ++ mov r12, [reg_p2] ++ mov r13, [reg_p2+8] ++ mov r14, [reg_p2+16] ++ mov r15, [reg_p2+24] ++ add r12, [reg_p2+32] ++ adc r13, [reg_p2+40] ++ adc r14, [reg_p2+48] ++ adc r15, [reg_p2+56] ++ sbb rbx, 0 ++ mov [rsp+32], r12 ++ mov [rsp+40], r13 ++ mov [rsp+48], r14 ++ mov [rsp+56], r15 ++ ++ // r12-r15 <- masked (BH + BL) ++ and r12, rax ++ and r13, rax ++ and r14, rax ++ and r15, rax ++ ++ // r8-r11 <- masked (AH + AL) ++ and r8, rbx ++ and r9, rbx ++ and r10, rbx ++ and r11, rbx ++ ++ // r8-r11 <- masked (AH + AL) + masked (BH + BL) ++ add r8, r12 ++ adc r9, r13 ++ adc r10, r14 ++ adc r11, r15 ++ mov [rsp+64], r8 ++ mov [rsp+72], r9 ++ mov [rsp+80], r10 ++ mov [rsp+88], r11 ++ ++ // [rcx+64] <- (AH+AL) x (BH+BL), low part ++ MUL256_SCHOOL [rsp], [rsp+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp ++ ++ // [rcx] <- AL x BL ++ MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 ++ ++ // [rsp] <- AH x BH ++ MUL256_SCHOOL [reg_p1+32], [reg_p2+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp ++ ++ // r8-r11 <- (AH+AL) x (BH+BL), final step ++ mov r8, [rsp+64] ++ mov r9, [rsp+72] ++ mov r10, [rsp+80] ++ mov r11, [rsp+88] ++ mov rax, [rcx+96] ++ add r8, rax ++ mov rax, [rcx+104] ++ adc r9, rax ++ mov rax, [rcx+112] ++ adc r10, rax ++ mov rax, [rcx+120] ++ adc r11, rax ++ ++ // r12-r15, r8-r11 <- (AH+AL) x (BH+BL) - ALxBL ++ mov r12, [rcx+64] ++ mov r13, [rcx+72] ++ mov r14, [rcx+80] ++ mov r15, [rcx+88] ++ sub r12, [rcx] ++ sbb r13, [rcx+8] ++ sbb r14, [rcx+16] ++ sbb r15, [rcx+24] ++ sbb r8, [rcx+32] ++ sbb
r9, [rcx+40] ++ sbb r10, [rcx+48] ++ sbb r11, [rcx+56] ++ ++ // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH ++ sub r12, [rsp] ++ sbb r13, [rsp+8] ++ sbb r14, [rsp+16] ++ sbb r15, [rsp+24] ++ sbb r8, [rsp+32] ++ sbb r9, [rsp+40] ++ sbb r10, [rsp+48] ++ sbb r11, [rsp+56] ++ ++ add r12, [rcx+32] ++ mov [rcx+32], r12 // Result C4-C7 ++ adc r13, [rcx+40] ++ mov [rcx+40], r13 ++ adc r14, [rcx+48] ++ mov [rcx+48], r14 ++ adc r15, [rcx+56] ++ mov [rcx+56], r15 ++ mov rax, [rsp] ++ adc r8, rax ++ mov [rcx+64], r8 // Result C8-C15 ++ mov rax, [rsp+8] ++ adc r9, rax ++ mov [rcx+72], r9 ++ mov rax, [rsp+16] ++ adc r10, rax ++ mov [rcx+80], r10 ++ mov rax, [rsp+24] ++ adc r11, rax ++ mov [rcx+88], r11 ++ mov r12, [rsp+32] ++ adc r12, 0 ++ mov [rcx+96], r12 ++ mov r13, [rsp+40] ++ adc r13, 0 ++ mov [rcx+104], r13 ++ mov r14, [rsp+48] ++ adc r14, 0 ++ mov [rcx+112], r14 ++ mov r15, [rsp+56] ++ adc r15, 0 ++ mov [rcx+120], r15 ++ ++ add rsp, 96 ++ pop rbp ++ pop rbx ++ pop r15 ++ pop r14 ++ pop r13 ++ pop r12 ++ ret ++ ++//*********************************************************************** ++// Integer multiplication ++// Based on Karatsuba method ++// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] ++// NOTE: a=c or b=c are not allowed (c is written as scratch before a and b are fully read) ++//*********************************************************************** ++.global mul503_asm ++mul503_asm: ++ mov ecx, [rip+OPENSSL_ia32cap_P+8] // third 32-bit word of the capability vector (CPUID leaf 7 EBX in OpenSSL's documented layout) ++ and ecx, 0x80100 // keep bit 8 (BMI2: mulx) and bit 19 (ADX: adcx/adox) ++ cmp ecx, 0x80100 ++ je mul503_mulx_asm // both features present: use the mulx/adcx/adox implementation ++ ++ push r12 ++ push r13 ++ push r14 ++ mov rcx, reg_p3 ++ ++ // rcx[0-3] <- AH+AL ++ xor rax, rax ++ mov r8, [reg_p1+32] ++ mov r9, [reg_p1+40] ++ mov r10, [reg_p1+48] ++ mov r11, [reg_p1+56] ++ add r8, [reg_p1] ++ adc r9, [reg_p1+8] ++ adc r10, [reg_p1+16] ++ adc r11, [reg_p1+24] ++ push r15 ++ mov [rcx], r8 ++ mov [rcx+8], r9 ++ mov [rcx+16], r10 ++ mov [rcx+24], r11 ++ sbb rax, 0 // rax <- 0 - carry of AH+AL (the push/mov in between leave the flags untouched) ++ sub rsp, 80 // Allocating space in stack ++ ++ // r12-r15 <- BH+BL ++ xor rdx, rdx ++ mov r12, [reg_p2+32] ++ mov r13, [reg_p2+40] ++ mov r14, 
[reg_p2+48] ++ mov r15, [reg_p2+56] ++ add r12, [reg_p2] ++ adc r13, [reg_p2+8] ++ adc r14, [reg_p2+16] ++ adc r15, [reg_p2+24] ++ sbb rdx, 0 ++ mov [rsp+64], rax ++ mov [rsp+72], rdx ++ ++ // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL) ++ mov rax, [rcx] ++ mul r12 ++ mov [rsp], rax // c0 ++ mov r8, rdx ++ ++ xor r9, r9 ++ mov rax, [rcx] ++ mul r13 ++ add r8, rax ++ adc r9, rdx ++ ++ xor r10, r10 ++ mov rax, [rcx+8] ++ mul r12 ++ add r8, rax ++ mov [rsp+8], r8 // c1 ++ adc r9, rdx ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, [rcx] ++ mul r14 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [rcx+16] ++ mul r12 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [rcx+8] ++ mul r13 ++ add r9, rax ++ mov [rsp+16], r9 // c2 ++ adc r10, rdx ++ adc r8, 0 ++ ++ xor r9, r9 ++ mov rax, [rcx] ++ mul r15 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [rcx+24] ++ mul r12 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [rcx+8] ++ mul r14 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [rcx+16] ++ mul r13 ++ add r10, rax ++ mov [rsp+24], r10 // c3 ++ adc r8, rdx ++ adc r9, 0 ++ ++ xor r10, r10 ++ mov rax, [rcx+8] ++ mul r15 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, [rcx+24] ++ mul r13 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, [rcx+16] ++ mul r14 ++ add r8, rax ++ mov [rsp+32], r8 // c4 ++ adc r9, rdx ++ adc r10, 0 ++ ++ xor r11, r11 ++ mov rax, [rcx+16] ++ mul r15 ++ add r9, rax ++ adc r10, rdx ++ adc r11, 0 ++ ++ mov rax, [rcx+24] ++ mul r14 ++ add r9, rax // c5 ++ adc r10, rdx ++ adc r11, 0 ++ ++ mov rax, [rcx+24] ++ mul r15 ++ add r10, rax // c6 ++ adc r11, rdx // c7 ++ ++ mov rax, [rsp+64] ++ and r12, rax ++ and r13, rax ++ and r14, rax ++ and r15, rax ++ add r12, r8 ++ adc r13, r9 ++ adc r14, r10 ++ adc r15, r11 ++ ++ mov rax, [rsp+72] ++ mov r8, [rcx] ++ mov r9, [rcx+8] ++ mov r10, [rcx+16] ++ mov r11, [rcx+24] ++ and r8, rax ++ and r9, rax ++ and r10, rax ++ and r11, rax ++ add r8, r12 ++ 
adc r9, r13 ++ adc r10, r14 ++ adc r11, r15 ++ mov [rsp+32], r8 ++ mov [rsp+40], r9 ++ mov [rsp+48], r10 ++ mov [rsp+56], r11 ++ ++ // rcx[0-7] <- AL*BL ++ mov r11, [reg_p1] ++ mov rax, [reg_p2] ++ mul r11 ++ xor r9, r9 ++ mov [rcx], rax // c0 ++ mov r8, rdx ++ ++ mov r14, [reg_p1+16] ++ mov rax, [reg_p2+8] ++ mul r11 ++ xor r10, r10 ++ add r8, rax ++ adc r9, rdx ++ ++ mov r12, [reg_p1+8] ++ mov rax, [reg_p2] ++ mul r12 ++ add r8, rax ++ mov [rcx+8], r8 // c1 ++ adc r9, rdx ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, [reg_p2+16] ++ mul r11 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov r13, [reg_p2] ++ mov rax, r14 ++ mul r13 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [reg_p2+8] ++ mul r12 ++ add r9, rax ++ mov [rcx+16], r9 // c2 ++ adc r10, rdx ++ adc r8, 0 ++ ++ xor r9, r9 ++ mov rax, [reg_p2+24] ++ mul r11 ++ mov r15, [reg_p1+24] ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, r15 ++ mul r13 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [reg_p2+16] ++ mul r12 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [reg_p2+8] ++ mul r14 ++ add r10, rax ++ mov [rcx+24], r10 // c3 ++ adc r8, rdx ++ adc r9, 0 ++ ++ xor r10, r10 ++ mov rax, [reg_p2+24] ++ mul r12 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, [reg_p2+8] ++ mul r15 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, [reg_p2+16] ++ mul r14 ++ add r8, rax ++ mov [rcx+32], r8 // c4 ++ adc r9, rdx ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, [reg_p2+24] ++ mul r14 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [reg_p2+16] ++ mul r15 ++ add r9, rax ++ mov [rcx+40], r9 // c5 ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [reg_p2+24] ++ mul r15 ++ add r10, rax ++ mov [rcx+48], r10 // c6 ++ adc r8, rdx ++ mov [rcx+56], r8 // c7 ++ ++ // rcx[8-15] <- AH*BH ++ mov r11, [reg_p1+32] ++ mov rax, [reg_p2+32] ++ mul r11 ++ xor r9, r9 ++ mov [rcx+64], rax // c0 ++ mov r8, rdx ++ ++ mov r14, [reg_p1+48] ++ mov rax, [reg_p2+40] ++ mul r11 ++ xor 
r10, r10 ++ add r8, rax ++ adc r9, rdx ++ ++ mov r12, [reg_p1+40] ++ mov rax, [reg_p2+32] ++ mul r12 ++ add r8, rax ++ mov [rcx+72], r8 // c1 ++ adc r9, rdx ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, [reg_p2+48] ++ mul r11 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov r13, [reg_p2+32] ++ mov rax, r14 ++ mul r13 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [reg_p2+40] ++ mul r12 ++ add r9, rax ++ mov [rcx+80], r9 // c2 ++ adc r10, rdx ++ adc r8, 0 ++ ++ xor r9, r9 ++ mov rax, [reg_p2+56] ++ mul r11 ++ mov r15, [reg_p1+56] ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, r15 ++ mul r13 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [reg_p2+48] ++ mul r12 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, [reg_p2+40] ++ mul r14 ++ add r10, rax ++ mov [rcx+88], r10 // c3 ++ adc r8, rdx ++ adc r9, 0 ++ ++ xor r10, r10 ++ mov rax, [reg_p2+56] ++ mul r12 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, [reg_p2+40] ++ mul r15 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, [reg_p2+48] ++ mul r14 ++ add r8, rax ++ mov [rcx+96], r8 // c4 ++ adc r9, rdx ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, [reg_p2+56] ++ mul r14 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [reg_p2+48] ++ mul r15 ++ add r9, rax ++ mov [rcx+104], r9 // c5 ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, [reg_p2+56] ++ mul r15 ++ add r10, rax ++ mov [rcx+112], r10 // c6 ++ adc r8, rdx ++ mov [rcx+120], r8 // c7 ++ ++ // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL ++ mov r8, [rsp] ++ sub r8, [rcx] ++ mov r9, [rsp+8] ++ sbb r9, [rcx+8] ++ mov r10, [rsp+16] ++ sbb r10, [rcx+16] ++ mov r11, [rsp+24] ++ sbb r11, [rcx+24] ++ mov r12, [rsp+32] ++ sbb r12, [rcx+32] ++ mov r13, [rsp+40] ++ sbb r13, [rcx+40] ++ mov r14, [rsp+48] ++ sbb r14, [rcx+48] ++ mov r15, [rsp+56] ++ sbb r15, [rcx+56] ++ ++ // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH ++ mov rax, [rcx+64] ++ sub r8, rax ++ mov rax, [rcx+72] ++ sbb r9, rax ++ mov rax, [rcx+80] ++ sbb 
r10, rax ++ mov rax, [rcx+88] ++ sbb r11, rax ++ mov rax, [rcx+96] ++ sbb r12, rax ++ mov rdx, [rcx+104] ++ sbb r13, rdx ++ mov rdi, [rcx+112] ++ sbb r14, rdi ++ mov rsi, [rcx+120] ++ sbb r15, rsi ++ ++ // Final result ++ add r8, [rcx+32] ++ mov [rcx+32], r8 ++ adc r9, [rcx+40] ++ mov [rcx+40], r9 ++ adc r10, [rcx+48] ++ mov [rcx+48], r10 ++ adc r11, [rcx+56] ++ mov [rcx+56], r11 ++ adc r12, [rcx+64] ++ mov [rcx+64], r12 ++ adc r13, [rcx+72] ++ mov [rcx+72], r13 ++ adc r14, [rcx+80] ++ mov [rcx+80], r14 ++ adc r15, [rcx+88] ++ mov [rcx+88], r15 ++ adc rax, 0 ++ mov [rcx+96], rax ++ adc rdx, 0 ++ mov [rcx+104], rdx ++ adc rdi, 0 ++ mov [rcx+112], rdi ++ adc rsi, 0 ++ mov [rcx+120], rsi ++ ++ add rsp, 80 // Restoring space in stack ++ pop r15 ++ pop r14 ++ pop r13 ++ pop r12 ++ ret ++ ++///////////////////////////////////////////////////////////////// MACRO ++// Schoolbook integer multiplication ++// Inputs: memory pointers M0 and M1 ++// Outputs: regs T0:T6 ++// Temps: regs T7:T9 ++///////////////////////////////////////////////////////////////// ++.macro MUL128x320_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 ++ mov rdx, \M0 ++ mulx \T1, \T0, \M1 // T0 <- C0_final ++ mulx \T2, \T4, 8\M1 ++ xor rax, rax ++ ++ mulx \T3, \T5, 16\M1 ++ adox \T1, \T4 ++ adox \T2, \T5 ++ mulx \T4, \T7, 24\M1 ++ adox \T3, \T7 ++ mulx \T5, \T6, 32\M1 ++ adox \T4, \T6 ++ adox \T5, rax ++ ++ mov rdx, 8\M0 ++ mulx \T7, \T6, \M1 ++ adcx \T1, \T6 // T1 <- C1_final ++ adcx \T2, \T7 ++ mulx \T6, \T8, 8\M1 ++ adcx \T3, \T6 ++ mulx \T9, \T7, 16\M1 ++ adcx \T4, \T9 ++ mulx \T6, \T9, 24\M1 ++ adcx \T5, \T6 ++ mulx \T6, rdx, 32\M1 ++ adcx \T6, rax ++ ++ xor rax, rax ++ adox \T2, \T8 ++ adox \T3, \T7 ++ adox \T4, \T9 ++ adox \T5, rdx ++ adox \T6, rax ++.endm ++ ++ ++//************************************************************************************** ++// Montgomery reduction ++// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 ++// Operation: c [reg_p2] = a [reg_p1] ++// NOTE: a=c is not allowed ++//************************************************************************************** ++rdc503_mulx_asm: ++ push rbx ++ push r12 ++ push r13 ++ push r14 ++ push r15 ++ ++ // a[0-1] x p503p1_nz --> result: r8:r14 ++ MUL128x320_SCHOOL [reg_p1], [rip+p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 ++ ++ xor r15, r15 ++ add r8, [reg_p1+24] ++ adc r9, [reg_p1+32] ++ adc r10, [reg_p1+40] ++ adc r11, [reg_p1+48] ++ adc r12, [reg_p1+56] ++ adc r13, [reg_p1+64] ++ adc r14, [reg_p1+72] ++ adc r15, [reg_p1+80] ++ mov [reg_p1+24], r8 ++ mov [reg_p1+32], r9 ++ mov [reg_p1+40], r10 ++ mov [reg_p1+48], r11 ++ mov [reg_p1+56], r12 ++ mov [reg_p1+64], r13 ++ mov [reg_p1+72], r14 ++ mov [reg_p1+80], r15 ++ mov r8, [reg_p1+88] ++ mov r9, [reg_p1+96] ++ mov r10, [reg_p1+104] ++ mov r11, [reg_p1+112] ++ mov r12, [reg_p1+120] ++ adc r8, 0 ++ adc r9, 0 ++ adc r10, 0 ++ adc r11, 0 ++ adc r12, 0 ++ mov [reg_p1+88], r8 ++ mov [reg_p1+96], r9 ++ mov [reg_p1+104], r10 ++ mov [reg_p1+112], r11 ++ mov [reg_p1+120], r12 ++ ++ // a[2-3] x p503p1_nz --> result: r8:r14 ++ MUL128x320_SCHOOL [reg_p1+16], [rip+p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 ++ ++ xor r15, r15 ++ add r8, [reg_p1+40] ++ adc r9, [reg_p1+48] ++ adc r10, [reg_p1+56] ++ adc r11, [reg_p1+64] ++ adc r12, [reg_p1+72] ++ adc r13, [reg_p1+80] ++ adc r14, [reg_p1+88] ++ adc r15, [reg_p1+96] ++ mov [reg_p1+40], r8 ++ mov [reg_p1+48], r9 ++ mov [reg_p1+56], r10 ++ mov [reg_p1+64], r11 ++ mov [reg_p1+72], r12 ++ mov [reg_p1+80], r13 ++ mov [reg_p1+88], r14 ++ mov [reg_p1+96], r15 ++ mov r8, [reg_p1+104] ++ mov r9, [reg_p1+112] ++ mov r10, [reg_p1+120] ++ adc r8, 0 ++ adc r9, 0 ++ adc r10, 0 ++ mov [reg_p1+104], r8 ++ mov [reg_p1+112], r9 ++ mov [reg_p1+120], r10 ++ ++ // a[4-5] x p503p1_nz --> result: r8:r14 ++ MUL128x320_SCHOOL [reg_p1+32], [rip+p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 
++ ++ xor r15, r15 ++ xor rbx, rbx ++ add r8, [reg_p1+56] ++ adc r9, [reg_p1+64] ++ adc r10, [reg_p1+72] ++ adc r11, [reg_p1+80] ++ adc r12, [reg_p1+88] ++ adc r13, [reg_p1+96] ++ adc r14, [reg_p1+104] ++ adc r15, [reg_p1+112] ++ adc rbx, [reg_p1+120] ++ mov [reg_p1+56], r8 ++ mov [reg_p2], r9 // Final result c0; the surrounding stores write intermediate sums back into the input buffer a ++ mov [reg_p1+72], r10 ++ mov [reg_p1+80], r11 ++ mov [reg_p1+88], r12 ++ mov [reg_p1+96], r13 ++ mov [reg_p1+104], r14 ++ mov [reg_p1+112], r15 ++ mov [reg_p1+120], rbx ++ ++ // a[6-7] x p503p1_nz --> result: r8:r14 ++ MUL128x320_SCHOOL [reg_p1+48], [rip+p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 ++ ++ // Final result c1:c7 ++ add r8, [reg_p1+72] ++ adc r9, [reg_p1+80] ++ adc r10, [reg_p1+88] ++ adc r11, [reg_p1+96] ++ adc r12, [reg_p1+104] ++ adc r13, [reg_p1+112] ++ adc r14, [reg_p1+120] ++ mov [reg_p2+8], r8 ++ mov [reg_p2+16], r9 ++ mov [reg_p2+24], r10 ++ mov [reg_p2+32], r11 ++ mov [reg_p2+40], r12 ++ mov [reg_p2+48], r13 ++ mov [reg_p2+56], r14 ++ ++ pop r15 ++ pop r14 ++ pop r13 ++ pop r12 ++ pop rbx ++ ret ++ ++//*********************************************************************** ++// Montgomery reduction ++// Based on comba method ++// Operation: c [reg_p2] = a [reg_p1] ++// NOTE: a=c is not allowed (c receives z3..z7, which are read back while a is still being read) ++//*********************************************************************** ++.global rdc503_asm ++rdc503_asm: ++ mov ecx, [rip+OPENSSL_ia32cap_P+8] // third 32-bit word of the capability vector (CPUID leaf 7 EBX in OpenSSL's documented layout) ++ and ecx, 0x80100 // keep bit 8 (BMI2: mulx) and bit 19 (ADX: adcx/adox) ++ cmp ecx, 0x80100 ++ je rdc503_mulx_asm // both features present: use the mulx/adcx/adox implementation ++ ++ push r12 ++ push r13 ++ push r14 ++ push r15 ++ ++ mov r11, [reg_p1] ++ mov rax, p503p1_3 ++ mul r11 ++ xor r8, r8 ++ add rax, [reg_p1+24] ++ mov [reg_p2+24], rax // z3 ++ adc r8, rdx ++ ++ xor r9, r9 ++ mov rax, p503p1_4 ++ mul r11 ++ xor r10, r10 ++ add r8, rax ++ adc r9, rdx ++ ++ mov r12, [reg_p1+8] ++ mov rax, p503p1_3 ++ mul r12 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ add r8, [reg_p1+32] ++ mov [reg_p2+32], r8 // z4 ++ adc r9, 0 ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, p503p1_5 ++ mul r11 ++ add r9, rax 
++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_4 ++ mul r12 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov r13, [reg_p1+16] ++ mov rax, p503p1_3 ++ mul r13 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ add r9, [reg_p1+40] ++ mov [reg_p2+40], r9 // z5 ++ adc r10, 0 ++ adc r8, 0 ++ ++ xor r9, r9 ++ mov rax, p503p1_6 ++ mul r11 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, p503p1_5 ++ mul r12 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, p503p1_4 ++ mul r13 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov r14, [reg_p2+24] ++ mov rax, p503p1_3 ++ mul r14 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ add r10, [reg_p1+48] ++ mov [reg_p2+48], r10 // z6 ++ adc r8, 0 ++ adc r9, 0 ++ ++ xor r10, r10 ++ mov rax, p503p1_7 ++ mul r11 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_6 ++ mul r12 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_5 ++ mul r13 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_4 ++ mul r14 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov r15, [reg_p2+32] ++ mov rax, p503p1_3 ++ mul r15 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ add r8, [reg_p1+56] ++ mov [reg_p2+56], r8 // z7 ++ adc r9, 0 ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, p503p1_7 ++ mul r12 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_6 ++ mul r13 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_5 ++ mul r14 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_4 ++ mul r15 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rcx, [reg_p2+40] ++ mov rax, p503p1_3 ++ mul rcx ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ add r9, [reg_p1+64] ++ mov [reg_p2], r9 // z0 ++ adc r10, 0 ++ adc r8, 0 ++ ++ xor r9, r9 ++ mov rax, p503p1_7 ++ mul r13 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, p503p1_6 ++ mul r14 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, p503p1_5 ++ mul r15 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 
++ ++ mov rax, p503p1_4 ++ mul rcx ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov r13, [reg_p2+48] ++ mov rax, p503p1_3 ++ mul r13 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ add r10, [reg_p1+72] ++ mov [reg_p2+8], r10 // z1 ++ adc r8, 0 ++ adc r9, 0 ++ ++ xor r10, r10 ++ mov rax, p503p1_7 ++ mul r14 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_6 ++ mul r15 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_5 ++ mul rcx ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_4 ++ mul r13 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov r14, [reg_p2+56] ++ mov rax, p503p1_3 ++ mul r14 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ add r8, [reg_p1+80] ++ mov [reg_p2+16], r8 // z2 ++ adc r9, 0 ++ adc r10, 0 ++ ++ xor r8, r8 ++ mov rax, p503p1_7 ++ mul r15 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_6 ++ mul rcx ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_5 ++ mul r13 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ ++ mov rax, p503p1_4 ++ mul r14 ++ add r9, rax ++ adc r10, rdx ++ adc r8, 0 ++ add r9, [reg_p1+88] ++ mov [reg_p2+24], r9 // z3 ++ adc r10, 0 ++ adc r8, 0 ++ ++ xor r9, r9 ++ mov rax, p503p1_7 ++ mul rcx ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, p503p1_6 ++ mul r13 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ ++ mov rax, p503p1_5 ++ mul r14 ++ add r10, rax ++ adc r8, rdx ++ adc r9, 0 ++ add r10, [reg_p1+96] ++ mov [reg_p2+32], r10 // z4 ++ adc r8, 0 ++ adc r9, 0 ++ ++ xor r10, r10 ++ mov rax, p503p1_7 ++ mul r13 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ ++ mov rax, p503p1_6 ++ mul r14 ++ add r8, rax ++ adc r9, rdx ++ adc r10, 0 ++ add r8, [reg_p1+104] // z5 ++ mov [reg_p2+40], r8 // z5 ++ adc r9, 0 ++ adc r10, 0 ++ ++ mov rax, p503p1_7 ++ mul r14 ++ add r9, rax ++ adc r10, rdx ++ add r9, [reg_p1+112] // z6 ++ mov [reg_p2+48], r9 // z6 ++ adc r10, 0 ++ add r10, [reg_p1+120] // z7 ++ mov [reg_p2+56], r10 // z7 ++ ++ pop r15 ++ pop r14 
++ pop r13 ++ pop r12 ++ ret ++ ++//*********************************************************************** ++// 503-bit multiprecision addition ++// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] ++//*********************************************************************** ++.global mp_add503_asm ++mp_add503_asm: ++ mov r8, [reg_p1] ++ mov r9, [reg_p1+8] ++ mov r10, [reg_p1+16] ++ mov r11, [reg_p1+24] ++ add r8, [reg_p2] ++ adc r9, [reg_p2+8] ++ adc r10, [reg_p2+16] ++ adc r11, [reg_p2+24] ++ mov [reg_p3], r8 ++ mov [reg_p3+8], r9 ++ mov [reg_p3+16], r10 ++ mov [reg_p3+24], r11 ++ ++ mov r8, [reg_p1+32] ++ mov r9, [reg_p1+40] ++ mov r10, [reg_p1+48] ++ mov r11, [reg_p1+56] ++ adc r8, [reg_p2+32] ++ adc r9, [reg_p2+40] ++ adc r10, [reg_p2+48] ++ adc r11, [reg_p2+56] ++ mov [reg_p3+32], r8 ++ mov [reg_p3+40], r9 ++ mov [reg_p3+48], r10 ++ mov [reg_p3+56], r11 ++ ret ++ ++ ++//*********************************************************************** ++// 2x503-bit multiprecision subtraction ++// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
Returns borrow mask ++//*********************************************************************** ++.global mp_sub503x2_asm ++mp_sub503x2_asm: ++ xor rax, rax ++ mov r8, [reg_p1] ++ mov r9, [reg_p1+8] ++ mov r10, [reg_p1+16] ++ mov r11, [reg_p1+24] ++ mov rcx, [reg_p1+32] ++ sub r8, [reg_p2] ++ sbb r9, [reg_p2+8] ++ sbb r10, [reg_p2+16] ++ sbb r11, [reg_p2+24] ++ sbb rcx, [reg_p2+32] ++ mov [reg_p3], r8 ++ mov [reg_p3+8], r9 ++ mov [reg_p3+16], r10 ++ mov [reg_p3+24], r11 ++ mov [reg_p3+32], rcx ++ ++ mov r8, [reg_p1+40] ++ mov r9, [reg_p1+48] ++ mov r10, [reg_p1+56] ++ mov r11, [reg_p1+64] ++ mov rcx, [reg_p1+72] ++ sbb r8, [reg_p2+40] ++ sbb r9, [reg_p2+48] ++ sbb r10, [reg_p2+56] ++ sbb r11, [reg_p2+64] ++ sbb rcx, [reg_p2+72] ++ mov [reg_p3+40], r8 ++ mov [reg_p3+48], r9 ++ mov [reg_p3+56], r10 ++ mov [reg_p3+64], r11 ++ mov [reg_p3+72], rcx ++ ++ mov r8, [reg_p1+80] ++ mov r9, [reg_p1+88] ++ mov r10, [reg_p1+96] ++ mov r11, [reg_p1+104] ++ mov rcx, [reg_p1+112] ++ sbb r8, [reg_p2+80] ++ sbb r9, [reg_p2+88] ++ sbb r10, [reg_p2+96] ++ sbb r11, [reg_p2+104] ++ sbb rcx, [reg_p2+112] ++ mov [reg_p3+80], r8 ++ mov [reg_p3+88], r9 ++ mov [reg_p3+96], r10 ++ mov [reg_p3+104], r11 ++ mov [reg_p3+112], rcx ++ ++ mov r8, [reg_p1+120] ++ sbb r8, [reg_p2+120] ++ sbb rax, 0 // rax <- 0 - final borrow: all-ones iff a < b, else 0 (the mov stores above do not touch flags) ++ mov [reg_p3+120], r8 ++ ret ++ ++ ++//*********************************************************************** ++// Double 2x503-bit multiprecision subtraction ++// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]. NOTE(review): the upper half does `sub r8, rax` then `sbb r8, [reg_p1+64]`, so a borrow out of the first is applied to r8 again instead of reaching r9 - confirm this is intended ++//*********************************************************************** ++.global mp_dblsub503x2_asm ++mp_dblsub503x2_asm: ++ push r12 ++ push r13 ++ push r14 ++ ++ xor rax, rax // rax counts the borrows out of the two low-half subtraction passes ++ mov r8, [reg_p3] ++ mov r9, [reg_p3+8] ++ mov r10, [reg_p3+16] ++ mov r11, [reg_p3+24] ++ mov r12, [reg_p3+32] ++ mov r13, [reg_p3+40] ++ mov r14, [reg_p3+48] ++ mov rcx, [reg_p3+56] ++ sub r8, [reg_p1] ++ sbb r9, [reg_p1+8] ++ sbb r10, [reg_p1+16] ++ sbb r11, [reg_p1+24] ++ sbb r12, [reg_p1+32] ++ 
sbb r13, [reg_p1+40] ++ sbb r14, [reg_p1+48] ++ sbb rcx, [reg_p1+56] ++ adc rax, 0 ++ sub r8, [reg_p2] ++ sbb r9, [reg_p2+8] ++ sbb r10, [reg_p2+16] ++ sbb r11, [reg_p2+24] ++ sbb r12, [reg_p2+32] ++ sbb r13, [reg_p2+40] ++ sbb r14, [reg_p2+48] ++ sbb rcx, [reg_p2+56] ++ adc rax, 0 ++ mov [reg_p3], r8 ++ mov [reg_p3+8], r9 ++ mov [reg_p3+16], r10 ++ mov [reg_p3+24], r11 ++ mov [reg_p3+32], r12 ++ mov [reg_p3+40], r13 ++ mov [reg_p3+48], r14 ++ mov [reg_p3+56], rcx ++ ++ mov r8, [reg_p3+64] ++ mov r9, [reg_p3+72] ++ mov r10, [reg_p3+80] ++ mov r11, [reg_p3+88] ++ mov r12, [reg_p3+96] ++ mov r13, [reg_p3+104] ++ mov r14, [reg_p3+112] ++ mov rcx, [reg_p3+120] ++ sub r8, rax ++ sbb r8, [reg_p1+64] ++ sbb r9, [reg_p1+72] ++ sbb r10, [reg_p1+80] ++ sbb r11, [reg_p1+88] ++ sbb r12, [reg_p1+96] ++ sbb r13, [reg_p1+104] ++ sbb r14, [reg_p1+112] ++ sbb rcx, [reg_p1+120] ++ sub r8, [reg_p2+64] ++ sbb r9, [reg_p2+72] ++ sbb r10, [reg_p2+80] ++ sbb r11, [reg_p2+88] ++ sbb r12, [reg_p2+96] ++ sbb r13, [reg_p2+104] ++ sbb r14, [reg_p2+112] ++ sbb rcx, [reg_p2+120] ++ mov [reg_p3+64], r8 ++ mov [reg_p3+72], r9 ++ mov [reg_p3+80], r10 ++ mov [reg_p3+88], r11 ++ mov [reg_p3+96], r12 ++ mov [reg_p3+104], r13 ++ mov [reg_p3+112], r14 ++ mov [reg_p3+120], rcx ++ ++ pop r14 ++ pop r13 ++ pop r12 ++ ret +diff --git a/third_party/sidh/src/ARM64/fp_arm64.c b/third_party/sidh/src/ARM64/fp_arm64.c +new file mode 100644 +index 000000000..3a99cfc19 +--- /dev/null ++++ b/third_party/sidh/src/ARM64/fp_arm64.c +@@ -0,0 +1,92 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P503 ++*********************************************************************************************/ ++ ++#include "../internal.h" ++#include "../P503_internal.h" ++ ++// Global constants ++extern const struct params_t kP503Params; ++ 
++inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Modular addition, c = a+b mod p503. ++ // Inputs: a, b in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ ++ fpadd503_asm(a, b, c); ++} ++ ++ ++inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Modular subtraction, c = a-b mod p503. ++ // Inputs: a, b in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ ++ fpsub503_asm(a, b, c); ++} ++ ++ ++inline void fpneg503(digit_t* a) ++{ // Modular negation, a = -a mod p503. ++ // Input/output: a in [0, 2*p503-1] ++ unsigned int i, borrow = 0; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, ((digit_t*)kP503Params.primeX2)[i], a[i], borrow, a[i]); ++ } ++} ++ ++ ++void fpdiv2_503(const digit_t* a, digit_t* c) ++{ // Modular division by two, c = a/2 mod p503. ++ // Input : a in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ unsigned int i, carry = 0; ++ digit_t mask; ++ ++ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 (mask is all-ones for odd a, zero otherwise) ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(carry, a[i], ((digit_t*)kP503Params.prime)[i] & mask, carry, c[i]); ++ } ++ ++ mp_shiftr1(c, NWORDS_FIELD); ++} ++ ++ ++void fpcorrection503(digit_t* a) ++{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. ++ unsigned int i, borrow = 0; ++ digit_t mask; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, a[i], ((digit_t*)kP503Params.prime)[i], borrow, a[i]); ++ } ++ mask = 0 - (digit_t)borrow; // all-ones iff the subtraction borrowed, i.e. a was already below the prime ++ ++ borrow = 0; // reused as the carry of the masked add-back below ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(borrow, a[i], ((digit_t*)kP503Params.prime)[i] & mask, borrow, a[i]); ++ } ++} ++ ++ ++void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) ++{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. nwords is ignored here: mul503_asm always processes eight 64-bit limbs per operand. ++ ++ UNREFERENCED_PARAMETER(nwords); ++ ++ mul503_asm(a, b, c); ++} ++ ++ ++ ++void rdc_mont(const digit_t* ma, digit_t* mc) ++{ // Montgomery reduction exploiting special form of the prime. 
++ // mc = ma*R^-1 mod p503x2, where R = 2^512. ++ // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. ++ // ma is assumed to be in Montgomery representation. ++ ++ rdc503_asm(ma, mc); ++} +diff --git a/third_party/sidh/src/ARM64/fp_arm64_asm.S b/third_party/sidh/src/ARM64/fp_arm64_asm.S +new file mode 100644 +index 000000000..7f96452c2 +--- /dev/null ++++ b/third_party/sidh/src/ARM64/fp_arm64_asm.S +@@ -0,0 +1,826 @@ ++//******************************************************************************************* ++// SIDH: an efficient supersingular isogeny cryptography library ++// ++// Abstract: field arithmetic in 64-bit ARMv8 assembly for P503 on Linux ++//******************************************************************************************* ++ ++// p503 + 1 ++p503p1: ++#define P503P1_0 0xAC00000000000000 ++#define P503P1_1 0x13085BDA2211E7A0 ++#define P503P1_2 0x1B9BF6C87B7E7DAF ++#define P503P1_3 0x6045C6BDDA77A4D0 ++#define P503P1_4 0x004066F541811E1E ++ ++// 2 * p503 ++p503x2: ++#define P503x2_0 0xFFFFFFFFFFFFFFFE ++#define P503x2_1 0xFFFFFFFFFFFFFFFF ++#define P503x2_2 0x57FFFFFFFFFFFFFF ++#define P503x2_3 0x2610B7B44423CF41 ++#define P503x2_4 0x3737ED90F6FCFB5E ++#define P503x2_5 0xC08B8D7BB4EF49A0 ++#define P503x2_6 0x0080CDEA83023C3C ++ ++#define P503P1_NZ_S8_0 0x85BDA2211E7A0AC ++#define P503P1_NZ_S8_1 0x9BF6C87B7E7DAF13 ++#define P503P1_NZ_S8_2 0x45C6BDDA77A4D01B ++#define P503P1_NZ_S8_3 0x4066F541811E1E60 ++ ++ ++.text ++//*********************************************************************** ++// Field addition ++// Operation: c [x2] = a [x0] + b [x1] ++//*********************************************************************** ++.global fpadd503_asm ++fpadd503_asm: ++ ldp x3, x4, [x0,#0] ++ ldp x5, x6, [x0,#16] ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ ++ // Add a + b ++ adds x3, x3, x11 ++ adcs x4, x4, x12 ++ adcs x5, x5, x13 ++ adcs x6, x6, x14 ++ ldp x7, x8, [x0,#32] ++ ldp x9, x10, [x0,#48] ++ ldp x15, x16, 
[x1,#32] ++ ldp x17, x18, [x1,#48] ++ adcs x7, x7, x15 ++ adcs x8, x8, x16 ++ adcs x9, x9, x17 ++ adc x10, x10, x18 ++ ++ // Subtract 2xp503 ++ ldr x11, =P503x2_0 ++ ldr x12, =P503x2_1 ++ ldr x13, =P503x2_2 ++ ldr x14, =P503x2_3 ++ subs x3, x3, x11 ++ sbcs x4, x4, x12 ++ sbcs x5, x5, x12 ++ sbcs x6, x6, x13 ++ sbcs x7, x7, x14 ++ ldr x15, =P503x2_4 ++ ldr x16, =P503x2_5 ++ ldr x17, =P503x2_6 ++ sbcs x8, x8, x15 ++ sbcs x9, x9, x16 ++ sbcs x10, x10, x17 ++ sbc x18, xzr, xzr ++ ++ // Add 2xp503 anded with the mask in x18 ++ and x11, x11, x18 ++ and x12, x12, x18 ++ and x13, x13, x18 ++ and x14, x14, x18 ++ and x15, x15, x18 ++ and x16, x16, x18 ++ and x17, x17, x18 ++ ++ adds x3, x3, x11 ++ adcs x4, x4, x12 ++ adcs x5, x5, x12 ++ adcs x6, x6, x13 ++ adcs x7, x7, x14 ++ adcs x8, x8, x15 ++ adcs x9, x9, x16 ++ adc x10, x10, x17 ++ ++ stp x3, x4, [x2,#0] ++ stp x5, x6, [x2,#16] ++ stp x7, x8, [x2,#32] ++ stp x9, x10, [x2,#48] ++ ret ++ ++ ++//*********************************************************************** ++// Field subtraction ++// Operation: c [x2] = a [x0] - b [x1] ++//*********************************************************************** ++.global fpsub503_asm ++fpsub503_asm: ++ ldp x3, x4, [x0,#0] ++ ldp x5, x6, [x0,#16] ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ ++ // Subtract a - b ++ subs x3, x3, x11 ++ sbcs x4, x4, x12 ++ sbcs x5, x5, x13 ++ sbcs x6, x6, x14 ++ ldp x7, x8, [x0,#32] ++ ldp x9, x10, [x0,#48] ++ ldp x15, x16, [x1,#32] ++ ldp x17, x18, [x1,#48] ++ sbcs x7, x7, x15 ++ sbcs x8, x8, x16 ++ sbcs x9, x9, x17 ++ sbcs x10, x10, x18 ++ sbc x18, xzr, xzr ++ ++ // Add 2xp503 anded with the mask in x18 ++ ldr x11, =P503x2_0 ++ ldr x12, =P503x2_1 ++ ldr x13, =P503x2_2 ++ ldr x14, =P503x2_3 ++ and x11, x11, x18 ++ and x12, x12, x18 ++ and x13, x13, x18 ++ and x14, x14, x18 ++ ldr x15, =P503x2_4 ++ ldr x16, =P503x2_5 ++ ldr x17, =P503x2_6 ++ and x15, x15, x18 ++ and x16, x16, x18 ++ and x17, x17, x18 ++ ++ adds x3, x3, x11 ++ adcs x4, x4, x12 
++ adcs x5, x5, x12 ++ adcs x6, x6, x13 ++ adcs x7, x7, x14 ++ adcs x8, x8, x15 ++ adcs x9, x9, x16 ++ adc x10, x10, x17 ++ ++ stp x3, x4, [x2,#0] ++ stp x5, x6, [x2,#16] ++ stp x7, x8, [x2,#32] ++ stp x9, x10, [x2,#48] ++ ret ++ ++ ++//////////////////////////////////////////// MACRO ++.macro MUL128_COMBA_CUT A0, A1, B0, B1, C0, C1, C2, C3, T0 ++ mul \A0, \A1, \B0 ++ umulh \B0, \A1, \B0 ++ adds \C1, \C1, \C3 ++ adc \C2, \C2, xzr ++ ++ mul \T0, \A1, \B1 ++ umulh \B1, \A1, \B1 ++ adds \C1, \C1, \A0 ++ adcs \C2, \C2, \B0 ++ adc \C3, xzr, xzr ++ ++ adds \C2, \C2, \T0 ++ adc \C3, \C3, \B1 ++.endm ++ ++ ++//////////////////////////////////////////// MACRO ++.macro MUL256_KARATSUBA_COMBA M,A0,A1,A2,A3,B0,B1,B2,B3,C0,C1,C2,C3,C4,C5,C6,C7,T0,T1 ++ ++ // A0-A1 <- AH + AL, T0 <- mask ++ adds \A0, \A0, \A2 ++ adcs \A1, \A1, \A3 ++ adc \T0, xzr, xzr ++ ++ // C6, T1 <- BH + BL, C7 <- mask ++ adds \C6, \B0, \B2 ++ adcs \T1, \B1, \B3 ++ adc \C7, xzr, xzr ++ ++ // C0-C1 <- masked (BH + BL) ++ sub \C2, xzr, \T0 ++ sub \C3, xzr, \C7 ++ and \C0, \C6, \C2 ++ and \C1, \T1, \C2 ++ ++ // C4-C5 <- masked (AH + AL), T0 <- combined carry ++ and \C4, \A0, \C3 ++ and \C5, \A1, \C3 ++ mul \C2, \A0, \C6 ++ mul \C3, \A0, \T1 ++ and \T0, \T0, \C7 ++ ++ // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 ++ adds \C0, \C4, \C0 ++ umulh \C4, \A0, \T1 ++ adcs \C1, \C5, \C1 ++ umulh \C5, \A0, \C6 ++ adc \T0, \T0, xzr ++ ++ // C2-C5 <- (AH+AL) x (BH+BL), low part ++ MUL128_COMBA_CUT \A0, \A1, \C6, \T1, \C2, \C3, \C4, \C5, \C7 ++ ldp \A0, \A1, [\M,#0] ++ ++ // C2-C5, T0 <- (AH+AL) x (BH+BL), final part ++ adds \C4, \C0, \C4 ++ umulh \C7, \A0, \B0 ++ umulh \T1, \A0, \B1 ++ adcs \C5, \C1, \C5 ++ mul \C0, \A0, \B0 ++ mul \C1, \A0, \B1 ++ adc \T0, \T0, xzr ++ ++ // C0-C1, T1, C7 <- AL x BL ++ MUL128_COMBA_CUT \A0, \A1, \B0, \B1, \C0, \C1, \T1, \C7, \C6 ++ ++ // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL ++ mul \A0, \A2, \B2 ++ umulh \B0, \A2, \B2 ++ subs \C2, \C2, \C0 ++ sbcs \C3, \C3, \C1 ++ sbcs \C4, \C4, \T1 ++ mul 
\A1, \A2, \B3 ++ umulh \C6, \A2, \B3 ++ sbcs \C5, \C5, \C7 ++ sbc \T0, \T0, xzr ++ ++ // A0, A1, C6, B0 <- AH x BH ++ MUL128_COMBA_CUT \A2, \A3, \B2, \B3, \A0, \A1, \C6, \B0, \B1 ++ ++ // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH ++ subs \C2, \C2, \A0 ++ sbcs \C3, \C3, \A1 ++ sbcs \C4, \C4, \C6 ++ sbcs \C5, \C5, \B0 ++ sbc \T0, \T0, xzr ++ ++ adds \C2, \C2, \T1 ++ adcs \C3, \C3, \C7 ++ adcs \C4, \C4, \A0 ++ adcs \C5, \C5, \A1 ++ adcs \C6, \T0, \C6 ++ adc \C7, \B0, xzr ++.endm ++ ++ ++//*********************************************************************************** ++// 512-bit integer multiplication using Karatsuba (two levels), Comba (lower level) ++// Operation: c [x2] = a [x0] * b [x1] ++//*********************************************************************************** ++.global mul503_asm ++mul503_asm: ++ sub sp, sp, #96 ++ stp x19, x20, [sp,#0] ++ stp x21, x22, [sp,#16] ++ stp x23, x24, [sp,#32] ++ stp x25, x26, [sp,#48] ++ stp x27, x28, [sp,#64] ++ str x29, [sp, #80] ++ ++ ldp x3, x4, [x0] ++ ldp x5, x6, [x0,#16] ++ ldp x7, x8, [x0,#32] ++ ldp x9, x10, [x0,#48] ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ ldp x15, x16, [x1,#32] ++ ldp x17, x18, [x1,#48] ++ ++ // x26-x29 <- AH + AL, x7 <- mask ++ adds x26, x3, x7 ++ adcs x27, x4, x8 ++ adcs x28, x5, x9 ++ adcs x29, x6, x10 ++ adc x7, xzr, xzr ++ ++ // x11-x14 <- BH + BL, x8 <- mask ++ adds x11, x11, x15 ++ adcs x12, x12, x16 ++ adcs x13, x13, x17 ++ adcs x14, x14, x18 ++ adc x8, xzr, xzr ++ ++ // x15-x18 <- masked (BH + BL) ++ sub x9, xzr, x7 ++ sub x10, xzr, x8 ++ and x15, x11, x9 ++ and x16, x12, x9 ++ and x17, x13, x9 ++ and x18, x14, x9 ++ ++ // x19-x22 <- masked (AH + AL), x7 <- combined carry ++ and x19, x26, x10 ++ and x20, x27, x10 ++ and x21, x28, x10 ++ and x22, x29, x10 ++ and x7, x7, x8 ++ ++ // x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1 ++ adds x15, x15, x19 ++ adcs x16, x16, x20 ++ adcs x17, x17, x21 ++ adcs x18, x18, x22 ++ adc x7, x7, xzr ++ ++ // x8-x10,x19-x23 
<- (AH+AL) x (BH+BL), low part ++ stp x26, x27, [x2,#0] ++ MUL256_KARATSUBA_COMBA x2, x26, x27, x28, x29, x11, x12, x13, x14, x8, x9, x10, x19, x20, x21, x22, x23, x24, x25 ++ ++ // x15-x18, x7 <- (AH+AL) x (BH+BL), final step ++ adds x15, x15, x20 ++ adcs x16, x16, x21 ++ adcs x17, x17, x22 ++ adcs x18, x18, x23 ++ adc x7, x7, xzr ++ ++ // x20-x27 <- AL x BL ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ MUL256_KARATSUBA_COMBA x0, x3, x4, x5, x6, x11, x12, x13, x14, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29 ++ ++ // x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL ++ subs x8, x8, x20 ++ sbcs x9, x9, x21 ++ sbcs x10, x10, x22 ++ sbcs x19, x19, x23 ++ sbcs x15, x15, x24 ++ sbcs x16, x16, x25 ++ sbcs x17, x17, x26 ++ sbcs x18, x18, x27 ++ sbc x7, x7, xzr ++ ++ stp x20, x21, [x2] ++ stp x22, x23, [x2,#16] ++ ++ ldp x3, x4, [x0,#32] ++ ldp x5, x6, [x0,#48] ++ ldp x11, x12, [x1,#32] ++ ldp x13, x14, [x1,#48] ++ ++ adds x8, x8, x24 ++ adcs x9, x9, x25 ++ adcs x10, x10, x26 ++ adcs x19, x19, x27 ++ adc x1, xzr, xzr ++ ++ // x20-x27 <- AH x BH ++ add x0, x0, #32 ++ MUL256_KARATSUBA_COMBA x0, x3, x4, x5, x6, x11, x12, x13, x14, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29 ++ neg x1, x1 ++ ++ // x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH ++ subs x8, x8, x20 ++ sbcs x9, x9, x21 ++ sbcs x10, x10, x22 ++ sbcs x19, x19, x23 ++ sbcs x15, x15, x24 ++ sbcs x16, x16, x25 ++ sbcs x17, x17, x26 ++ sbcs x18, x18, x27 ++ sbc x7, x7, xzr ++ ++ stp x8, x9, [x2,#32] ++ stp x10, x19, [x2,#48] ++ ++ adds x1, x1, #1 ++ adcs x15, x15, x20 ++ adcs x16, x16, x21 ++ adcs x17, x17, x22 ++ adcs x18, x18, x23 ++ adcs x24, x7, x24 ++ adcs x25, x25, xzr ++ adcs x26, x26, xzr ++ adc x27, x27, xzr ++ ++ stp x15, x16, [x2,#64] ++ stp x17, x18, [x2,#80] ++ stp x24, x25, [x2,#96] ++ stp x26, x27, [x2,#112] ++ ++ ldp x19, x20, [sp,#0] ++ ldp x21, x22, [sp,#16] ++ ldp x23, x24, [sp,#32] ++ ldp x25, x26, [sp,#48] ++ ldp x27, x28, [sp,#64] ++ ldr x29, [sp,#80] ++ add sp, sp, #96 ++ ret ++ 
++ ++//////////////////////////////////////////// MACRO ++.macro MUL128x256_COMBA_CUT A0, A1, B0, B1, B2, B3, C0, C1, C2, C3, C4, C5, T0, T1, T2, T3 ++ mul \T0, \A1, \B0 ++ umulh \T1, \A1, \B0 ++ adds \C1, \C1, \C3 ++ adc \C2, \C2, xzr ++ ++ mul \T2, \A0, \B2 ++ umulh \T3, \A0, \B2 ++ adds \C1, \C1, \T0 ++ adcs \C2, \C2, \T1 ++ adc \C3, xzr, xzr ++ ++ mul \T0, \A1, \B1 ++ umulh \T1, \A1, \B1 ++ adds \C2, \C2, \T2 ++ adcs \C3, \C3, \T3 ++ adc \C4, xzr, xzr ++ ++ mul \T2, \A0, \B3 ++ umulh \T3, \A0, \B3 ++ adds \C2, \C2, \T0 ++ adcs \C3, \C3, \T1 ++ adc \C4, \C4, xzr ++ ++ mul \T0, \A1, \B2 ++ umulh \T1, \A1, \B2 ++ adds \C3, \C3, \T2 ++ adcs \C4, \C4, \T3 ++ adc \C5, xzr, xzr ++ ++ mul \T2, \A1, \B3 ++ umulh \T3, \A1, \B3 ++ adds \C3, \C3, \T0 ++ adcs \C4, \C4, \T1 ++ adc \C5, \C5, xzr ++ adds \C4, \C4, \T2 ++ adc \C5, \C5, \T3 ++.endm ++ ++ ++//************************************************************************************** ++// Montgomery reduction ++// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 ++// Operation: mc [x1] = ma [x0] ++// NOTE: ma=mc is not allowed ++//************************************************************************************** ++.global rdc503_asm ++rdc503_asm: ++ sub sp, sp, #96 ++ stp x19, x20, [sp] ++ stp x21, x22, [sp, #16] ++ stp x23, x24, [sp, #32] ++ stp x25, x26, [sp, #48] ++ stp x27, x28, [sp, #64] ++ stp x29, x30, [sp, #80] ++ ++ ldp x2, x3, [x0,#0] // a[0-1] ++ ++ // Load the prime constant ++ ldr x24, =P503P1_NZ_S8_0 ++ ldr x25, =P503P1_NZ_S8_1 ++ ldr x26, =P503P1_NZ_S8_2 ++ ldr x27, =P503P1_NZ_S8_3 ++ ++ // a[0-1] x p503p1_nz_s8 --> result: x4:x9 ++ mul x4, x2, x24 // a[0] x p503p1_nz_s8[0] ++ umulh x7, x2, x24 ++ mul x5, x2, x25 // a[0] x p503p1_nz_s8[1] ++ umulh x6, x2, x25 ++ MUL128x256_COMBA_CUT x2, x3, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 ++ ++ ldp x3, x11, [x0,#16] // a[2] ++ ldp x12, x13, [x0,#32] ++ ldp x14, x15, [x0,#48] ++ ++ orr x10, xzr, x9, lsr #8 ++ lsl x9, x9, #56 ++ orr x9, x9, x8, lsr #8 ++ lsl x8, x8, #56 ++ orr x8, x8, x7, lsr #8 ++ lsl x7, x7, #56 ++ orr x7, x7, x6, lsr #8 ++ lsl x6, x6, #56 ++ orr x6, x6, x5, lsr #8 ++ lsl x5, x5, #56 ++ orr x5, x5, x4, lsr #8 ++ lsl x4, x4, #56 ++ ++ adds x11, x4, x11 // a[3] ++ adcs x12, x5, x12 // a[4] ++ adcs x13, x6, x13 ++ adcs x14, x7, x14 ++ adcs x15, x8, x15 ++ ldp x16, x17, [x0,#64] ++ ldp x18, x19, [x0,#80] ++ mul x4, x3, x24 // a[2] x p503p1_nz_s8[0] ++ umulh x7, x3, x24 ++ adcs x16, x9, x16 ++ adcs x17, x10, x17 ++ adcs x18, xzr, x18 ++ adcs x19, xzr, x19 ++ ldp x20, x21, [x0,#96] ++ ldp x22, x23, [x0,#112] ++ mul x5, x3, x25 // a[2] x p503p1_nz_s8[1] ++ umulh x6, x3, x25 ++ adcs x20, xzr, x20 ++ adcs x21, xzr, x21 ++ adcs x22, xzr, x22 ++ adc x23, xzr, x23 ++ ++ // a[2-3] x p503p1_nz_s8 --> result: x4:x9 ++ MUL128x256_COMBA_CUT x3, x11, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 ++ ++ orr x10, xzr, x9, lsr #8 ++ lsl x9, x9, #56 ++ orr x9, x9, x8, lsr #8 ++ lsl x8, x8, #56 ++ 
orr x8, x8, x7, lsr #8 ++ lsl x7, x7, #56 ++ orr x7, x7, x6, lsr #8 ++ lsl x6, x6, #56 ++ orr x6, x6, x5, lsr #8 ++ lsl x5, x5, #56 ++ orr x5, x5, x4, lsr #8 ++ lsl x4, x4, #56 ++ ++ adds x13, x4, x13 // a[5] ++ adcs x14, x5, x14 // a[6] ++ adcs x15, x6, x15 ++ adcs x16, x7, x16 ++ mul x4, x12, x24 // a[4] x p503p1_nz_s8[0] ++ umulh x7, x12, x24 ++ adcs x17, x8, x17 ++ adcs x18, x9, x18 ++ adcs x19, x10, x19 ++ adcs x20, xzr, x20 ++ mul x5, x12, x25 // a[4] x p503p1_nz_s8[1] ++ umulh x6, x12, x25 ++ adcs x21, xzr, x21 ++ adcs x22, xzr, x22 ++ adc x23, xzr, x23 ++ ++ // a[4-5] x p503p1_nz_s8 --> result: x4:x9 ++ MUL128x256_COMBA_CUT x12, x13, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 ++ ++ orr x10, xzr, x9, lsr #8 ++ lsl x9, x9, #56 ++ orr x9, x9, x8, lsr #8 ++ lsl x8, x8, #56 ++ orr x8, x8, x7, lsr #8 ++ lsl x7, x7, #56 ++ orr x7, x7, x6, lsr #8 ++ lsl x6, x6, #56 ++ orr x6, x6, x5, lsr #8 ++ lsl x5, x5, #56 ++ orr x5, x5, x4, lsr #8 ++ lsl x4, x4, #56 ++ ++ adds x15, x4, x15 // a[7] ++ adcs x16, x5, x16 // a[8] ++ adcs x17, x6, x17 ++ adcs x18, x7, x18 ++ mul x4, x14, x24 // a[6] x p503p1_nz_s8[0] ++ umulh x7, x14, x24 ++ adcs x19, x8, x19 ++ adcs x20, x9, x20 ++ adcs x21, x10, x21 ++ mul x5, x14, x25 // a[6] x p503p1_nz_s8[1] ++ umulh x6, x14, x25 ++ adcs x22, xzr, x22 ++ adc x23, xzr, x23 ++ ++ // a[6-7] x p503p1_nz_s8 --> result: x4:x9 ++ MUL128x256_COMBA_CUT x14, x15, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 ++ ++ orr x10, xzr, x9, lsr #8 ++ lsl x9, x9, #56 ++ orr x9, x9, x8, lsr #8 ++ lsl x8, x8, #56 ++ orr x8, x8, x7, lsr #8 ++ lsl x7, x7, #56 ++ orr x7, x7, x6, lsr #8 ++ lsl x6, x6, #56 ++ orr x6, x6, x5, lsr #8 ++ lsl x5, x5, #56 ++ orr x5, x5, x4, lsr #8 ++ lsl x4, x4, #56 ++ ++ adds x17, x4, x17 ++ adcs x18, x5, x18 ++ adcs x19, x6, x19 ++ adcs x20, x7, x20 ++ stp x16, x17, [x1,#0] // Final result ++ stp x18, x19, [x1,#16] ++ adcs x21, x8, x21 ++ adcs x22, x9, x22 ++ adc x23, x10, x23 ++ stp x20, x21, [x1,#32] 
++ stp x22, x23, [x1,#48] ++ ++ ldp x19, x20, [sp] ++ ldp x21, x22, [sp, #16] ++ ldp x23, x24, [sp, #32] ++ ldp x25, x26, [sp, #48] ++ ldp x27, x28, [sp, #64] ++ ldp x29, x30, [sp, #80] ++ add sp, sp, #96 ++ ret ++ ++ ++//*********************************************************************** ++// 503-bit multiprecision addition ++// Operation: c [x2] = a [x0] + b [x1] ++//*********************************************************************** ++.global mp_add503_asm ++mp_add503_asm: ++ ldp x3, x4, [x0,#0] ++ ldp x5, x6, [x0,#16] ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ ++ adds x3, x3, x11 ++ adcs x4, x4, x12 ++ adcs x5, x5, x13 ++ adcs x6, x6, x14 ++ ldp x7, x8, [x0,#32] ++ ldp x9, x10, [x0,#48] ++ ldp x15, x16, [x1,#32] ++ ldp x17, x18, [x1,#48] ++ adcs x7, x7, x15 ++ adcs x8, x8, x16 ++ adcs x9, x9, x17 ++ adc x10, x10, x18 ++ ++ stp x3, x4, [x2,#0] ++ stp x5, x6, [x2,#16] ++ stp x7, x8, [x2,#32] ++ stp x9, x10, [x2,#48] ++ ret ++ ++ ++//*********************************************************************** ++// 2x503-bit multiprecision addition ++// Operation: c [x2] = a [x0] + b [x1] ++//*********************************************************************** ++.global mp_add503x2_asm ++mp_add503x2_asm: ++ ldp x3, x4, [x0,#0] ++ ldp x5, x6, [x0,#16] ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ adds x3, x3, x11 ++ adcs x4, x4, x12 ++ adcs x5, x5, x13 ++ adcs x6, x6, x14 ++ ldp x7, x8, [x0,#32] ++ ldp x9, x10, [x0,#48] ++ ldp x15, x16, [x1,#32] ++ ldp x17, x18, [x1,#48] ++ adcs x7, x7, x15 ++ adcs x8, x8, x16 ++ adcs x9, x9, x17 ++ adcs x10, x10, x18 ++ ++ stp x3, x4, [x2,#0] ++ stp x5, x6, [x2,#16] ++ stp x7, x8, [x2,#32] ++ stp x9, x10, [x2,#48] ++ ++ ldp x3, x4, [x0,#64] ++ ldp x5, x6, [x0,#80] ++ ldp x11, x12, [x1,#64] ++ ldp x13, x14, [x1,#80] ++ adcs x3, x3, x11 ++ adcs x4, x4, x12 ++ adcs x5, x5, x13 ++ adcs x6, x6, x14 ++ ldp x7, x8, [x0,#96] ++ ldp x9, x10, [x0,#112] ++ ldp x15, x16, [x1,#96] ++ ldp x17, x18, [x1,#112] ++ adcs x7, 
x7, x15 ++ adcs x8, x8, x16 ++ adcs x9, x9, x17 ++ adc x10, x10, x18 ++ ++ stp x3, x4, [x2,#64] ++ stp x5, x6, [x2,#80] ++ stp x7, x8, [x2,#96] ++ stp x9, x10, [x2,#112] ++ ret ++ ++ ++//*********************************************************************** ++// 2x503-bit multiprecision subtraction ++// Operation: c [x2] = a [x0] - b [x1]. Returns borrow mask ++//*********************************************************************** ++.global mp_sub503x2_asm ++mp_sub503x2_asm: ++ ldp x3, x4, [x0,#0] ++ ldp x5, x6, [x0,#16] ++ ldp x11, x12, [x1,#0] ++ ldp x13, x14, [x1,#16] ++ subs x3, x3, x11 ++ sbcs x4, x4, x12 ++ sbcs x5, x5, x13 ++ sbcs x6, x6, x14 ++ ldp x7, x8, [x0,#32] ++ ldp x9, x10, [x0,#48] ++ ldp x15, x16, [x1,#32] ++ ldp x17, x18, [x1,#48] ++ sbcs x7, x7, x15 ++ sbcs x8, x8, x16 ++ sbcs x9, x9, x17 ++ sbcs x10, x10, x18 ++ ++ stp x3, x4, [x2,#0] ++ stp x5, x6, [x2,#16] ++ stp x7, x8, [x2,#32] ++ stp x9, x10, [x2,#48] ++ ++ ldp x3, x4, [x0,#64] ++ ldp x5, x6, [x0,#80] ++ ldp x11, x12, [x1,#64] ++ ldp x13, x14, [x1,#80] ++ sbcs x3, x3, x11 ++ sbcs x4, x4, x12 ++ sbcs x5, x5, x13 ++ sbcs x6, x6, x14 ++ ldp x7, x8, [x0,#96] ++ ldp x9, x10, [x0,#112] ++ ldp x15, x16, [x1,#96] ++ ldp x17, x18, [x1,#112] ++ sbcs x7, x7, x15 ++ sbcs x8, x8, x16 ++ sbcs x9, x9, x17 ++ sbcs x10, x10, x18 ++ sbc x0, xzr, xzr ++ ++ stp x3, x4, [x2,#64] ++ stp x5, x6, [x2,#80] ++ stp x7, x8, [x2,#96] ++ stp x9, x10, [x2,#112] ++ ret ++ ++ ++//*********************************************************************** ++// Double 2x503-bit multiprecision subtraction ++// Operation: c [x2] = c [x2] - a [x0] - b [x1] ++//*********************************************************************** ++.global mp_dblsub503x2_asm ++mp_dblsub503x2_asm: ++ sub sp, sp, #32 ++ stp x27, x28, [sp, #0] ++ stp x29, x30, [sp, #16] ++ ldp x3, x4, [x2,#0] ++ ldp x5, x6, [x2,#16] ++ ldp x7, x8, [x2,#32] ++ ldp x9, x10, [x2,#48] ++ ldp x11, x12, [x2,#64] ++ ldp x13, x14, [x2,#80] ++ ldp x15, x16, [x2,#96] ++ 
ldp x17, x18, [x2,#112] ++ ++ ldp x27, x28, [x0,#0] ++ ldp x29, x30, [x0,#16] ++ subs x3, x3, x27 ++ sbcs x4, x4, x28 ++ sbcs x5, x5, x29 ++ sbcs x6, x6, x30 ++ ldp x27, x28, [x0,#32] ++ ldp x29, x30, [x0,#48] ++ sbcs x7, x7, x27 ++ sbcs x8, x8, x28 ++ sbcs x9, x9, x29 ++ sbcs x10, x10, x30 ++ ldp x27, x28, [x0,#64] ++ ldp x29, x30, [x0,#80] ++ sbcs x11, x11, x27 ++ sbcs x12, x12, x28 ++ sbcs x13, x13, x29 ++ sbcs x14, x14, x30 ++ ldp x27, x28, [x0,#96] ++ ldp x29, x30, [x0,#112] ++ sbcs x15, x15, x27 ++ sbcs x16, x16, x28 ++ sbcs x17, x17, x29 ++ sbc x18, x18, x30 ++ ++ ldp x27, x28, [x1,#0] ++ ldp x29, x30, [x1,#16] ++ subs x3, x3, x27 ++ sbcs x4, x4, x28 ++ sbcs x5, x5, x29 ++ sbcs x6, x6, x30 ++ ldp x27, x28, [x1,#32] ++ ldp x29, x30, [x1,#48] ++ sbcs x7, x7, x27 ++ sbcs x8, x8, x28 ++ sbcs x9, x9, x29 ++ sbcs x10, x10, x30 ++ ldp x27, x28, [x1,#64] ++ ldp x29, x30, [x1,#80] ++ sbcs x11, x11, x27 ++ sbcs x12, x12, x28 ++ sbcs x13, x13, x29 ++ sbcs x14, x14, x30 ++ ldp x27, x28, [x1,#96] ++ ldp x29, x30, [x1,#112] ++ sbcs x15, x15, x27 ++ sbcs x16, x16, x28 ++ sbcs x17, x17, x29 ++ sbc x18, x18, x30 ++ ++ stp x3, x4, [x2,#0] ++ stp x5, x6, [x2,#16] ++ stp x7, x8, [x2,#32] ++ stp x9, x10, [x2,#48] ++ stp x11, x12, [x2,#64] ++ stp x13, x14, [x2,#80] ++ stp x15, x16, [x2,#96] ++ stp x17, x18, [x2,#112] ++ ++ ldp x27, x28, [sp, #0] ++ ldp x29, x30, [sp, #16] ++ add sp, sp, #32 ++ ret +diff --git a/third_party/sidh/src/P503.c b/third_party/sidh/src/P503.c +new file mode 100644 +index 000000000..fab51a38b +--- /dev/null ++++ b/third_party/sidh/src/P503.c +@@ -0,0 +1,99 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: supersingular isogeny parameters and generation of functions for P503 ++*********************************************************************************************/ ++ ++#include "sidh/def_p503.h" ++#include 
"sidh/P503_api.h" ++#include "P503_internal.h" ++ ++// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: ++// -------------------------------------------------------------------------------------------------- ++// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). ++// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. ++// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. ++// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. ++// For example, a 503-bit field element is represented with Ceil(503 / 64) = 8 64-bit digits or Ceil(503 / 32) = 16 32-bit digits. ++ ++// ++// Curve isogeny system "SIDHp503". 
Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p503^2), where A=0, B=1, C=1 and p503 = 2^250*3^159-1 ++// ++ ++const struct params_t kP503Params = { ++ .prime = { ++ 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xABFFFFFFFFFFFFFF, ++ 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E ++ }, ++ ++ .primeP1 = { ++ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000, ++ 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E ++ }, ++ .primeX2 = { ++ 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x57FFFFFFFFFFFFFF, ++ 0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C ++ }, ++ .Alice_order = { ++ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 ++ }, ++ .Bob_order = { ++ 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 ++ }, ++ .A_gen = { ++ 0xE7EF4AA786D855AF, 0xED5758F03EB34D3B, 0x09AE172535A86AA9, 0x237B9CC07D622723, ++ 0xE3A284CBA4E7932D, 0x27481D9176C5E63F, 0x6A323FF55C6E71BF, 0x002ECC31A6FB8773, // XPA0 ++ 0x64D02E4E90A620B8, 0xDAB8128537D4B9F1, 0x4BADF77B8A228F98, 0x0F5DBDF9D1FB7D1B, ++ 0xBEC4DB288E1A0DCC, 0xE76A8665E80675DB, 0x6D6F252E12929463, 0x003188BD1463FACC, // XPA1 ++ 0xB79D41025DE85D56, 0x0B867DA9DF169686, 0x740E5368021C827D, 0x20615D72157BF25C, ++ 0xFF1590013C9B9F5B, 0xC884DCADE8C16CEA, 0xEBD05E53BF724E01, 0x0032FEF8FDA5748C, // XQA0 ++ 0x12E2E849AA0A8006, 0x41CF47008635A1E8, 0x9CD720A70798AED7, 0x42A820B42FCF04CF, ++ 0x7BF9BAD32AAE88B1, 0xF619127A54090BBE, 0x1CB10D8F56408EAA, 0x001D6B54C3C0EDEB, // XRA0 ++ 0x34DB54931CBAAC36, 0x420A18CB8DD5F0C4, 0x32008C1A48C0F44D, 0x3B3BA772B1CFD44D, ++ 0xA74B058FDAF13515, 0x095FC9CA7EEC17B4, 0x448E829D28F120F8, 0x00261EC3ED16A489 // XRA1 ++ }, ++ .B_gen = { ++ 0x7EDE37F4FA0BC727, 0xF7F8EC5C8598941C, 0xD15519B516B5F5C8, 0xF6D5AC9B87A36282, ++ 0x7B19F105B30E952E, 0x13BD8B2025B4EBEE, 0x7B96D27F4EC579A2, 
0x00140850CAB7E5DE, // XPB0 ++ 0x7764909DAE7B7B2D, 0x578ABB16284911AB, 0x76E2BFD146A6BF4D, 0x4824044B23AA02F0, ++ 0x1105048912A321F3, 0xB8A2E482CF0F10C1, 0x42FF7D0BE2152085, 0x0018E599C5223352, // XPB1 ++ 0x4256C520FB388820, 0x744FD7C3BAAF0A13, 0x4B6A2DDDB12CBCB8, 0xE46826E27F427DF8, ++ 0xFE4A663CD505A61B, 0xD6B3A1BAF025C695, 0x7C3BB62B8FCC00BD, 0x003AFDDE4A35746C, // XQB0 ++ 0x75601CD1E6C0DFCB, 0x1A9007239B58F93E, 0xC1F1BE80C62107AC, 0x7F513B898F29FF08, ++ 0xEA0BEDFF43E1F7B2, 0x2C6D94018CBAE6D0, 0x3A430D31BCD84672, 0x000D26892ECCFE83, // XRB0 ++ 0x1119D62AEA3007A1, 0xE3702AA4E04BAE1B, 0x9AB96F7D59F990E7, 0xF58440E8B43319C0, ++ 0xAF8134BEE1489775, 0xE7F7774E905192AA, 0xF54AE09308E98039, 0x001EF7A041A86112 // XRB1 ++ }, ++ .Montgomery_R2 = { ++ 0x5289A0CF641D011F, 0x9B88257189FED2B9, 0xA3B365D58DC8F17A, 0x5BC57AB6EFF168EC, ++ 0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771 ++ }, ++ .Montgomery_one = { ++ 0x00000000000003F9, 0x0000000000000000, 0x0000000000000000, 0xB400000000000000, ++ 0x63CB1A6EA6DED2B4, 0x51689D8D667EB37D, 0x8ACD77C71AB24142, 0x0026FBAEC60F5953 ++ }, ++ .Montgomery_Rprime = { ++ 0x0C2615CA3C5BAA99, 0x5A4FF3072AB6AA6A, 0xA6AFD4B039AD6AA2, 0x010DA06A26DD05CB ++ }, ++ .Montgomery_rprime = { ++ 0x49C8A87190C0697D, 0x2EB7968EA0F0A558, 0x944257B696777FA2, 0xBAA4DDCD6139D2B3 ++ }, ++ .Border_div3 = { ++ 0xEB5CFCD82C28A2B9, 0x4CFF3B5F9FDFCE96, 0xB07B3A7CDF4DBC02, 0x055DE9C5756D2D32 ++ }, ++ .strat_Alice = { ++ 61, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, ++ 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, ++ 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 29, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, ++ 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, ++ 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1 ++ }, ++ .strat_Bob = { ++ 71, 38, 21, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, ++ 1, 2, 1, 1, 1, 9, 5, 
3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 17, 9, ++ 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, ++ 1, 4, 2, 1, 1, 2, 1, 1, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, ++ 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2, ++ 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 ++ } ++}; +diff --git a/third_party/sidh/src/P503_internal.h b/third_party/sidh/src/P503_internal.h +new file mode 100644 +index 000000000..67aed146c +--- /dev/null ++++ b/third_party/sidh/src/P503_internal.h +@@ -0,0 +1,279 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: internal header file for P503 ++*********************************************************************************************/ ++ ++#ifndef P503_INTERNAL_H__ ++#define P503_INTERNAL_H__ ++ ++#include ++#include "sidh/def_p503.h" ++ ++// Macro definitions ++#define NBITS_TO_NBYTES(nbits) (((nbits)+7)/8) // Conversion macro from number of bits to number of bytes ++#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words ++#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words ++ ++// Macro to avoid compiler warnings when detecting unreferenced parameters ++#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR)) ++ ++// SIDH Basic, internally used, constants ++#define MAXBITS_FIELD 512 ++#define MAXWORDS_FIELD ((MAXBITS_FIELD+RADIX-1)/RADIX) // Max. number of words to represent field elements ++#define MAXBITS_ORDER NBITS_ORDER ++#define MAXWORDS_ORDER ((MAXBITS_ORDER+RADIX-1)/RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB]. 
++#define ALICE 0 ++#define BOB 1 ++#define OALICE_BITS 250 ++#define OBOB_BITS 253 ++#define OBOB_EXPON 159 ++// Fixed parameters for isogeny tree computation ++#define MAX_INT_POINTS_ALICE 7 ++#define MAX_INT_POINTS_BOB 8 ++#define FP2_ENCODED_BYTES 2*((NBITS_FIELD + 7) / 8) ++ ++// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions ++#define fpcopy fpcopy503 ++#define fpzero fpzero503 ++#define fpadd fpadd503 ++#define fpsub fpsub503 ++#define fpneg fpneg503 ++#define fpdiv2 fpdiv2_503 ++#define fpcorrection fpcorrection503 ++#define fpmul_mont fpmul503_mont ++#define fpsqr_mont fpsqr503_mont ++#define fpinv_mont fpinv503_mont ++#define fpinv_chain_mont fpinv503_chain_mont ++#define fpinv_mont_bingcd fpinv503_mont_bingcd ++#define fp2copy fp2copy503 ++#define fp2zero fp2zero503 ++#define fp2add fp2add503 ++#define fp2sub fp2sub503 ++#define fp2neg fp2neg503 ++#define fp2div2 fp2div2_503 ++#define fp2correction fp2correction503 ++#define fp2mul_mont fp2mul503_mont ++#define fp2sqr_mont fp2sqr503_mont ++#define fp2inv_mont fp2inv503_mont ++#define fp2inv_mont_bingcd fp2inv503_mont_bingcd ++#define fpequal_non_constant_time fpequal503_non_constant_time ++#define cswap_asm cswap503_asm ++#define mp_add_asm mp_add503_asm ++#define mp_subx2_asm mp_sub503x2_asm ++#define mp_dblsubx2_asm mp_dblsub503x2_asm ++#define crypto_kem_keypair crypto_kem_keypair_SIKEp503 ++#define crypto_kem_enc crypto_kem_enc_SIKEp503 ++#define crypto_kem_dec crypto_kem_dec_SIKEp503 ++#define random_mod_order_A random_mod_order_A_SIDHp503 ++#define random_mod_order_B random_mod_order_B_SIDHp503 ++#define EphemeralKeyGeneration_A EphemeralKeyGeneration_A_SIDHp503 ++#define EphemeralKeyGeneration_B EphemeralKeyGeneration_B_SIDHp503 ++#define EphemeralSecretAgreement_A EphemeralSecretAgreement_A_SIDHp503 ++#define EphemeralSecretAgreement_B EphemeralSecretAgreement_B_SIDHp503 ++ ++// SIDH's basic element definitions and point representations ++ ++typedef 
digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 503-bit field elements (512-bit max.) ++typedef digit_t dfelm_t[2*NWORDS_FIELD]; // Datatype for representing double-precision 2x503-bit field elements (1024-bit max.) ++ ++/* An element in F_{p^2} is composed of two coefficients ++ from F_p, i.e. Fp2 element = c0 + c1*i in F_{p^2} ++ */ ++typedef struct { ++ felm_t c0; ++ felm_t c1; ++} fp2; ++ ++// Our F_{p^2} element type is a one-element array of the struct, so it is passed to functions as a pointer. ++typedef fp2 f2elm_t[1]; ++ ++typedef struct { f2elm_t X; f2elm_t Z; } point_proj; // Point representation in projective XZ Montgomery coordinates. ++typedef point_proj point_proj_t[1]; ++ ++/* Old GCC 4.9 (jessie) doesn't implement {0} initialization properly, ++ which violates C11 as described in 6.7.9, 21 (similarly C99, 6.7.8). ++ Defines below are used to work around the bug, and provide a way ++ to initialize f2elm_t and point_proj_t structs. ++ Bug has been fixed in GCC6 (debian stretch). ++*/ ++#define F2ELM_INIT {{ {0}, {0} }} ++#define POINT_PROJ_INIT {{ F2ELM_INIT, F2ELM_INIT }} ++ ++/**************** Function prototypes ****************/ ++/************* Multiprecision functions **************/ ++ ++// Copy wordsize digits, c = a, where lng(a) = nwords ++void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords); ++ ++// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit ++unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); ++ ++// 503-bit multiprecision addition, c = a+b ++void mp_add503(const digit_t* a, const digit_t* b, digit_t* c); ++void mp_add503_asm(const digit_t* a, const digit_t* b, digit_t* c); ++void cswap503_asm(point_proj_t x, point_proj_t y, const digit_t option); ++ ++// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. 
Returns the borrow bit ++unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); ++digit_t mp_sub503x2_asm(const digit_t* a, const digit_t* b, digit_t* c); ++ ++// Double 2x503-bit multiprecision subtraction, c = c-a-b, where c > a and c > b ++void mp_dblsub503x2_asm(const digit_t* a, const digit_t* b, digit_t* c); ++ ++// Multiprecision left shift ++void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords); ++ ++// Multiprecision right shift by one ++void mp_shiftr1(digit_t* x, const unsigned int nwords); ++ ++// Multiprecision left shift by one ++void mp_shiftl1(digit_t* x, const unsigned int nwords); ++ ++// Digit multiplication, digit * digit -> 2-digit result ++void digit_x_digit(const digit_t a, const digit_t b, digit_t* c); ++ ++// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. ++void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); ++ ++/************ Field arithmetic functions *************/ ++ ++// Copy of a field element, c = a ++void fpcopy503(const digit_t* a, digit_t* c); ++ ++// Zeroing a field element, a = 0 ++void fpzero503(digit_t* a); ++ ++// Non constant-time comparison of two field elements. If a = b return TRUE, otherwise, return FALSE ++bool fpequal503_non_constant_time(const digit_t* a, const digit_t* b); ++ ++// Modular addition, c = a+b mod p503 ++extern void fpadd503(const digit_t* a, const digit_t* b, digit_t* c); ++extern void fpadd503_asm(const digit_t* a, const digit_t* b, digit_t* c); ++ ++// Modular subtraction, c = a-b mod p503 ++extern void fpsub503(const digit_t* a, const digit_t* b, digit_t* c); ++extern void fpsub503_asm(const digit_t* a, const digit_t* b, digit_t* c); ++ ++// Modular negation, a = -a mod p503 ++extern void fpneg503(digit_t* a); ++ ++// Modular division by two, c = a/2 mod p503. 
++void fpdiv2_503(const digit_t* a, digit_t* c); ++ ++// Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. ++void fpcorrection503(digit_t* a); ++ ++// 503-bit Montgomery reduction, c = a mod p ++void rdc_mont(const digit_t* a, digit_t* c); ++ ++// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^512 ++void fpmul503_mont(const digit_t* a, const digit_t* b, digit_t* c); ++void mul503_asm(const digit_t* a, const digit_t* b, digit_t* c); ++void rdc503_asm(const digit_t* ma, digit_t* mc); ++ ++// Field squaring using Montgomery arithmetic, mc = ma^2*R^-1 mod p503, where R=2^512 ++void fpsqr503_mont(const digit_t* ma, digit_t* mc); ++ ++// Conversion to Montgomery representation ++void to_mont(const digit_t* a, digit_t* mc); ++ ++// Conversion from Montgomery representation to standard representation ++void from_mont(const digit_t* ma, digit_t* c); ++ ++// Field inversion, a = a^-1 in GF(p503) ++void fpinv503_mont(digit_t* a); ++ ++// Field inversion, a = a^-1 in GF(p503) using the binary GCD ++void fpinv503_mont_bingcd(digit_t* a); ++ ++// Chain to compute (p503-3)/4 using Montgomery arithmetic ++void fpinv503_chain_mont(digit_t* a); ++ ++/************ GF(p^2) arithmetic functions *************/ ++ ++// Copy of a GF(p503^2) element, c = a ++void fp2copy503(const f2elm_t a, f2elm_t c); ++ ++// Zeroing a GF(p503^2) element, a = 0 ++void fp2zero503(f2elm_t a); ++ ++// GF(p503^2) negation, a = -a in GF(p503^2) ++void fp2neg503(f2elm_t a); ++ ++// GF(p503^2) addition, c = a+b in GF(p503^2) ++extern void fp2add503(const f2elm_t a, const f2elm_t b, f2elm_t c); ++ ++// GF(p503^2) subtraction, c = a-b in GF(p503^2) ++extern void fp2sub503(const f2elm_t a, const f2elm_t b, f2elm_t c); ++ ++// GF(p503^2) division by two, c = a/2 in GF(p503^2) ++void fp2div2_503(const f2elm_t a, f2elm_t c); ++ ++// Modular correction, a = a in GF(p503^2) ++void fp2correction503(f2elm_t a); ++ ++// GF(p503^2) squaring using Montgomery 
arithmetic, c = a^2 in GF(p503^2) ++void fp2sqr503_mont(const f2elm_t a, f2elm_t c); ++ ++// GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2) ++void fp2mul503_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); ++ ++// Conversion of a GF(p503^2) element to Montgomery representation ++void to_fp2mont(const f2elm_t a, f2elm_t mc); ++ ++// Conversion of a GF(p503^2) element from Montgomery representation to standard representation ++void from_fp2mont(const f2elm_t ma, f2elm_t c); ++ ++// GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) ++void fp2inv503_mont(f2elm_t a); ++ ++// GF(p503^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p503) inversion done using the binary GCD ++void fp2inv503_mont_bingcd(f2elm_t a); ++ ++// n-way Montgomery inversion ++void mont_n_way_inv(const f2elm_t* vec, const int n, f2elm_t* out); ++ ++/************ Elliptic curve and isogeny functions *************/ ++ ++// Computes the j-invariant of a Montgomery curve with projective constant. ++void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv); ++ ++// Simultaneous doubling and differential addition. ++void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24); ++ ++// Doubling of a Montgomery point in projective coordinates (X:Z). ++void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24); ++ ++// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. ++void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e); ++ ++// Differential addition. ++void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); ++ ++// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. ++void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff); ++ ++// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. 
++void eval_4_isog(point_proj_t P, f2elm_t* coeff); ++ ++// Tripling of a Montgomery point in projective coordinates (X:Z). ++void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); ++ ++// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. ++void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); ++ ++// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. ++void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff); ++ ++// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. ++void eval_3_isog(point_proj_t Q, f2elm_t* coeff); ++ ++// 3-way simultaneous inversion ++void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); ++ ++// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. ++void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); ++ ++ ++#endif +diff --git a/third_party/sidh/src/ec_isogeny.c b/third_party/sidh/src/ec_isogeny.c +new file mode 100644 +index 000000000..c512c5831 +--- /dev/null ++++ b/third_party/sidh/src/ec_isogeny.c +@@ -0,0 +1,270 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: elliptic curve and isogeny functions ++*********************************************************************************************/ ++#include "sidh/def_p503.h" ++#include "P503_internal.h" ++ ++extern const struct params_t kP503Params; ++ ++void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24) ++{ // Doubling of a Montgomery point in projective coordinates (X:Z). 
++ // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. ++ // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). ++ f2elm_t t0, t1; ++ ++ fp2sub(P->X, P->Z, t0); // t0 = X1-Z1 ++ fp2add(P->X, P->Z, t1); // t1 = X1+Z1 ++ fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 ++ fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 ++ fp2mul_mont(C24, t0, Q->Z); // Z2 = C24*(X1-Z1)^2 ++ fp2mul_mont(t1, Q->Z, Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 ++ fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2 ++ fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] ++ fp2add(Q->Z, t0, Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 ++ fp2mul_mont(Q->Z, t1, Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] ++} ++ ++ ++void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e) ++{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. ++ // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. ++ // Output: projective Montgomery x-coordinates Q <- (2^e)*P. ++ int i; ++ ++ copy_words((digit_t*)P, (digit_t*)Q, 2*2*NWORDS_FIELD); ++ ++ for (i = 0; i < e; i++) { ++ xDBL(Q, Q, A24plus, C24); ++ } ++} ++ ++ ++void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff) ++{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. ++ // Input: projective point of order four P = (X4:Z4). ++ // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients ++ // that are used to evaluate the isogeny at a point in eval_4_isog(). 
++ ++ fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4 ++ fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4 ++ fp2sqr_mont(P->Z, coeff[0]); // coeff[0] = Z4^2 ++ fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2 ++ fp2sqr_mont(coeff[0], C24); // C24 = 4*Z4^4 ++ fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2 ++ fp2sqr_mont(P->X, A24plus); // A24plus = X4^2 ++ fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 ++ fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 ++} ++ ++ ++void eval_4_isog(point_proj_t P, f2elm_t* coeff) ++{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined ++ // by the 3 coefficients in coeff (computed in the function get_4_isog()). ++ // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). ++ // Output: the projective point P = phi(P) = (X:Z) in the codomain. ++ f2elm_t t0, t1; ++ ++ fp2add(P->X, P->Z, t0); // t0 = X+Z ++ fp2sub(P->X, P->Z, t1); // t1 = X-Z ++ fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1] ++ fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2] ++ fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) ++ fp2mul_mont(t0, coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) ++ fp2add(P->X, P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] ++ fp2sub(P->X, P->Z, P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] ++ fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 ++ fp2sqr_mont(P->Z, P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 ++ fp2add(t1, t0, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 ++ fp2sub(P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) ++ fp2mul_mont(P->X, t1, P->X); // Xfinal ++ fp2mul_mont(P->Z, t0, P->Z); // Zfinal ++} ++ ++ ++void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus) ++{ // Tripling of a Montgomery point in projective coordinates (X:Z). 
++ // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. ++ // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). ++ f2elm_t t0, t1, t2, t3, t4, t5, t6; ++ ++ fp2sub(P->X, P->Z, t0); // t0 = X-Z ++ fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 ++ fp2add(P->X, P->Z, t1); // t1 = X+Z ++ fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 ++ fp2add(t0, t1, t4); // t4 = 2*X ++ fp2sub(t1, t0, t0); // t0 = 2*Z ++ fp2sqr_mont(t4, t1); // t1 = 4*X^2 ++ fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 ++ fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 ++ fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 ++ fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3 ++ fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 ++ fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3 ++ fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 ++ fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 ++ fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] ++ fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 ++ fp2sqr_mont(t2, t2); // t2 = t2^2 ++ fp2mul_mont(t4, t2, Q->X); // X3 = 2*X*t2 ++ fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] ++ fp2sqr_mont(t1, t1); // t1 = t1^2 ++ fp2mul_mont(t0, t1, Q->Z); // Z3 = 2*Z*t1 ++} ++ ++ ++void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e) ++{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. ++ // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. ++ // Output: projective Montgomery x-coordinates Q <- (3^e)*P. 
++ int i; ++ ++ copy_words((digit_t*)P, (digit_t*)Q, 2*2*NWORDS_FIELD); ++ ++ for (i = 0; i < e; i++) { ++ xTPL(Q, Q, A24minus, A24plus); ++ } ++} ++ ++ ++void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff) ++{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. ++ // Input: projective point of order three P = (X3:Z3). ++ // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. ++ f2elm_t t0, t1, t2, t3, t4; ++ ++ fp2sub(P->X, P->Z, coeff[0]); // coeff0 = X-Z ++ fp2sqr_mont(coeff[0], t0); // t0 = (X-Z)^2 ++ fp2add(P->X, P->Z, coeff[1]); // coeff1 = X+Z ++ fp2sqr_mont(coeff[1], t1); // t1 = (X+Z)^2 ++ fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 ++ fp2add(coeff[0], coeff[1], t3); // t3 = 2*X ++ fp2sqr_mont(t3, t3); // t3 = 4*X^2 ++ fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 ++ fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 ++ fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 ++ fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 ++ fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) ++ fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 ++ fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] ++ fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 ++ fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) ++ fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 ++ fp2mul_mont(t3, t4, t4); // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] ++ fp2sub(t4, A24minus, t0); // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] ++ fp2add(A24minus, t0, A24plus); // A24plus = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 ++} ++ ++ ++void eval_3_isog(point_proj_t Q, f2elm_t* coeff) ++{ // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and ++ // a point P with 2 coefficients in coeff (computed in the function get_3_isog()). 
++ // Inputs: projective points P = (X3:Z3) and Q = (X:Z). ++ // Output: the projective point Q <- phi(Q) = (X3:Z3). ++ f2elm_t t0, t1, t2; ++ ++ fp2add(Q->X, Q->Z, t0); // t0 = X+Z ++ fp2sub(Q->X, Q->Z, t1); // t1 = X-Z ++ fp2mul_mont(t0, coeff[0], t0); // t0 = coeff0*(X+Z) ++ fp2mul_mont(t1, coeff[1], t1); // t1 = coeff1*(X-Z) ++ fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z) ++ fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z) ++ fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 ++ fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 ++ fp2mul_mont(Q->X, t2, Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 ++ fp2mul_mont(Q->Z, t0, Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 ++} ++ ++ ++void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3) ++{ // 3-way simultaneous inversion ++ // Input: z1,z2,z3 ++ // Output: 1/z1,1/z2,1/z3 (override inputs). ++ f2elm_t t0, t1, t2, t3; ++ ++ fp2mul_mont(z1, z2, t0); // t0 = z1*z2 ++ fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 ++ fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) ++ fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) ++ fp2mul_mont(t2, z2, t3); // t3 = 1/z1 ++ fp2mul_mont(t2, z1, z2); // z2 = 1/z2 ++ fp2mul_mont(t0, t1, z3); // z3 = 1/z3 ++ fp2copy(t3, z1); // z1 = 1/z1 ++} ++ ++ ++void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A) ++{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. ++ // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. ++ // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
++ f2elm_t t0, t1, one = F2ELM_INIT; ++ ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, one->c0); ++ fp2add(xP, xQ, t1); // t1 = xP+xQ ++ fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ ++ fp2mul_mont(xR, t1, A); // A = xR*t1 ++ fp2add(t0, A, A); // A = A+t0 ++ fp2mul_mont(t0, xR, t0); // t0 = t0*xR ++ fp2sub(A, one, A); // A = A-1 ++ fp2add(t0, t0, t0); // t0 = t0+t0 ++ fp2add(t1, xR, t1); // t1 = t1+xR ++ fp2add(t0, t0, t0); // t0 = t0+t0 ++ fp2sqr_mont(A, A); // A = A^2 ++ fp2inv_mont(t0); // t0 = 1/t0 ++ fp2mul_mont(A, t0, A); // A = A*t0 ++ fp2sub(A, t1, A); // Afinal = A-t1 ++} ++ ++ ++void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv) ++{ // Computes the j-invariant of a Montgomery curve with projective constant. ++ // Input: A,C in GF(p^2). ++ // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. ++ f2elm_t t0, t1; ++ ++ fp2sqr_mont(A, jinv); // jinv = A^2 ++ fp2sqr_mont(C, t1); // t1 = C^2 ++ fp2add(t1, t1, t0); // t0 = t1+t1 ++ fp2sub(jinv, t0, t0); // t0 = jinv-t0 ++ fp2sub(t0, t1, t0); // t0 = t0-t1 ++ fp2sub(t0, t1, jinv); // jinv = t0-t1 ++ fp2sqr_mont(t1, t1); // t1 = t1^2 ++ fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 ++ fp2add(t0, t0, t0); // t0 = t0+t0 ++ fp2add(t0, t0, t0); // t0 = t0+t0 ++ fp2sqr_mont(t0, t1); // t1 = t0^2 ++ fp2mul_mont(t0, t1, t0); // t0 = t0*t1 ++ fp2add(t0, t0, t0); // t0 = t0+t0 ++ fp2add(t0, t0, t0); // t0 = t0+t0 ++ fp2inv_mont(jinv); // jinv = 1/jinv ++ fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv ++} ++ ++ ++void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24) ++{ // Simultaneous doubling and differential addition. ++ // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. 
++ // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. ++ f2elm_t t0, t1, t2; ++ ++ fp2add(P->X, P->Z, t0); // t0 = XP+ZP ++ fp2sub(P->X, P->Z, t1); // t1 = XP-ZP ++ fp2sqr_mont(t0, P->X); // XP = (XP+ZP)^2 ++ fp2sub(Q->X, Q->Z, t2); // t2 = XQ-ZQ ++ fp2correction(t2); ++ fp2add(Q->X, Q->Z, Q->X); // XQ = XQ+ZQ ++ fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ) ++ fp2sqr_mont(t1, P->Z); // ZP = (XP-ZP)^2 ++ fp2mul_mont(t1, Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ) ++ fp2sub(P->X, P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 ++ fp2mul_mont(P->X, P->Z, P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 ++ fp2mul_mont(t2, A24, Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] ++ fp2sub(t0, t1, Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) ++ fp2add(Q->X, P->Z, P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 ++ fp2add(t0, t1, Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) ++ fp2mul_mont(P->Z, t2, P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] ++ fp2sqr_mont(Q->Z, Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 ++ fp2sqr_mont(Q->X, Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 ++ fp2mul_mont(Q->Z, xPQ, Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 ++} +diff --git a/third_party/sidh/src/fpx.c b/third_party/sidh/src/fpx.c +new file mode 100644 +index 000000000..4f933cbb5 +--- /dev/null ++++ b/third_party/sidh/src/fpx.c +@@ -0,0 +1,420 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: core functions over GF(p) and GF(p^2) ++*********************************************************************************************/ ++ ++#include "sidh/def_p503.h" ++#include "P503_internal.h" ++#include "internal.h" ++ ++extern const struct params_t kP503Params; ++ ++inline void fpcopy(const felm_t a, felm_t c) ++{ // Copy a field element, c = a. 
++ unsigned int i; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) ++ c[i] = a[i]; ++} ++ ++ ++inline void fpzero(felm_t a) ++{ // Zero a field element, a = 0. ++ unsigned int i; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) ++ a[i] = 0; ++} ++ ++ ++void to_mont(const felm_t a, felm_t mc) ++{ // Conversion to Montgomery representation, ++ // mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. ++ // The Montgomery constant R^2 mod p is the global value "Montgomery_R2". ++ ++ fpmul_mont(a, (digit_t*)&kP503Params.Montgomery_R2, mc); ++} ++ ++ ++void from_mont(const felm_t ma, felm_t c) ++{ // Conversion from Montgomery representation to standard representation, ++ // c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. ++ digit_t one[NWORDS_FIELD] = {0}; ++ ++ one[0] = 1; ++ fpmul_mont(ma, one, c); ++ fpcorrection(c); ++} ++ ++ ++void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords) ++{ // Copy wordsize digits, c = a, where lng(a) = nwords. ++ unsigned int i; ++ ++ for (i = 0; i < nwords; i++) { ++ c[i] = a[i]; ++ } ++} ++ ++ ++void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) ++{ // Multiprecision multiplication, c = a*b mod p. ++ dfelm_t temp = {0}; ++ ++ mp_mul(ma, mb, temp, NWORDS_FIELD); ++ rdc_mont(temp, mc); ++} ++ ++ ++void fpsqr_mont(const felm_t ma, felm_t mc) ++{ // Multiprecision squaring, c = a^2 mod p. ++ dfelm_t temp = {0}; ++ ++ mp_mul(ma, ma, temp, NWORDS_FIELD); ++ rdc_mont(temp, mc); ++} ++ ++ ++void fpinv_mont(felm_t a) ++{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. ++ felm_t tt; ++ ++ fpcopy(a, tt); ++ fpinv_chain_mont(tt); ++ fpsqr_mont(tt, tt); ++ fpsqr_mont(tt, tt); ++ fpmul_mont(a, tt, a); ++} ++ ++ ++void fp2copy(const f2elm_t a, f2elm_t c) ++{ // Copy a GF(p^2) element, c = a. ++ fpcopy(a->c0, c->c0); ++ fpcopy(a->c1, c->c1); ++} ++ ++ ++void fp2zero(f2elm_t a) ++{ // Zero a GF(p^2) element, a = 0. 
++ fpzero(a->c0); ++ fpzero(a->c1); ++} ++ ++ ++void fp2neg(f2elm_t a) ++{ // GF(p^2) negation, a = -a in GF(p^2). ++ fpneg(a->c0); ++ fpneg(a->c1); ++} ++ ++inline void fp2add(const f2elm_t a, const f2elm_t b, f2elm_t c) ++{ // GF(p^2) addition, c = a+b in GF(p^2). ++ fpadd(a->c0, b->c0, c->c0); ++ fpadd(a->c1, b->c1, c->c1); ++} ++ ++inline void fp2sub(const f2elm_t a, const f2elm_t b, f2elm_t c) ++{ // GF(p^2) subtraction, c = a-b in GF(p^2). ++ fpsub(a->c0, b->c0, c->c0); ++ fpsub(a->c1, b->c1, c->c1); ++} ++ ++ ++void fp2div2(const f2elm_t a, f2elm_t c) ++{ // GF(p^2) division by two, c = a/2 in GF(p^2). ++ fpdiv2(a->c0, c->c0); ++ fpdiv2(a->c1, c->c1); ++} ++ ++ ++void fp2correction(f2elm_t a) ++{ // Modular correction, a = a in GF(p^2). ++ fpcorrection(a->c0); ++ fpcorrection(a->c1); ++} ++ ++ ++inline static void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Multiprecision addition, c = a+b. ++#if defined(OPENSSL_NO_ASM) ++ mp_add(a, b, c, NWORDS_FIELD); ++#else ++ mp_add_asm(a, b, c); ++ ++#endif ++} ++ ++ ++void fp2sqr_mont(const f2elm_t a, f2elm_t c) ++{ // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). ++ // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] ++ // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] ++ felm_t t1, t2, t3; ++ ++ mp_addfast(a->c0, a->c1, t1); // t1 = a0+a1 ++ fpsub(a->c0, a->c1, t2); // t2 = a0-a1 ++ mp_addfast(a->c0, a->c0, t3); // t3 = 2a0 ++ fpmul_mont(t1, t2, c->c0); // c0 = (a0+a1)(a0-a1) ++ fpmul_mont(t3, a->c1, c->c1); // c1 = 2a0*a1 ++} ++ ++ ++inline unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) ++{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. 
++ unsigned int i, borrow = 0; ++ ++ for (i = 0; i < nwords; i++) { ++ SUBC(borrow, a[i], b[i], borrow, c[i]); ++ } ++ ++ return borrow; ++} ++ ++ ++inline static digit_t mp_subfast(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. ++ // If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 ++#if defined(OPENSSL_NO_ASM) ++ ++ return (0 - (digit_t)mp_sub(a, b, c, 2*NWORDS_FIELD)); ++ ++#else ++ ++ return mp_subx2_asm(a, b, c); ++ ++#endif ++} ++ ++ ++inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. ++ // Inputs should be s.t. c > a and c > b ++#if defined(OPENSSL_NO_ASM) ++ ++ mp_sub(c, a, c, 2*NWORDS_FIELD); ++ mp_sub(c, b, c, 2*NWORDS_FIELD); ++ ++#else ++ ++ mp_dblsubx2_asm(a, b, c); ++ ++#endif ++} ++ ++ ++void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) ++{ // GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). ++ // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] ++ // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] ++ felm_t t1, t2; ++ dfelm_t tt1, tt2, tt3; ++ digit_t mask; ++ unsigned int i; ++ ++ mp_addfast(a->c0, a->c1, t1); // t1 = a0+a1 ++ mp_addfast(b->c0, b->c1, t2); // t2 = b0+b1 ++ mp_mul(a->c0, b->c0, tt1, NWORDS_FIELD); // tt1 = a0*b0 ++ mp_mul(a->c1, b->c1, tt2, NWORDS_FIELD); // tt2 = a1*b1 ++ mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1) ++ mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 ++ mask = mp_subfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1. 
If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0 ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ t1[i] = ((digit_t*)kP503Params.prime)[i] & mask; ++ } ++ ++ rdc_mont(tt3, c->c1); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 ++ mp_addfast((digit_t*)&tt1[NWORDS_FIELD], t1, (digit_t*)&tt1[NWORDS_FIELD]); ++ rdc_mont(tt1, c->c0); // c[0] = a0*b0 - a1*b1 ++} ++ ++ ++void fpinv_chain_mont(felm_t a) ++{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. ++ unsigned int i, j; ++ felm_t t[15], tt; ++ ++ // Precomputed table ++ fpsqr_mont(a, tt); ++ fpmul_mont(a, tt, t[0]); ++ for (i = 0; i <= 13; i++) fpmul_mont(t[i], tt, t[i+1]); ++ ++ fpcopy(a, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(a, tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[8], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[9], tt, tt); ++ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[0], tt, tt); ++ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(a, tt, tt); ++ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[2], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[8], tt, tt); ++ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(a, tt, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[10], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[0], tt, tt); ++ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[10], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[10], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[5], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[2], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 5; i++) 
fpsqr_mont(tt, tt); ++ fpmul_mont(t[3], tt, tt); ++ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[5], tt, tt); ++ for (i = 0; i < 12; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[12], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[8], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[12], tt, tt); ++ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[11], tt, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[5], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[14], tt, tt); ++ for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[14], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[5], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[8], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(a, tt, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[4], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[6], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[5], tt, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[7], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(a, tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[0], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[11], tt, tt); ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[13], tt, tt); ++ for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[1], tt, tt); ++ for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[10], tt, tt); ++ for (j = 0; j < 49; j++) { ++ for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); ++ fpmul_mont(t[14], tt, tt); ++ } ++ 
fpcopy(tt, a); ++} ++ ++ ++void fp2inv_mont(f2elm_t a) ++{// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). ++ f2elm_t t1; ++ ++ fpsqr_mont(a->c0, t1->c0); // t10 = a0^2 ++ fpsqr_mont(a->c1, t1->c1); // t11 = a1^2 ++ fpadd(t1->c0, t1->c1, t1->c0); // t10 = a0^2+a1^2 ++ fpinv_mont(t1->c0); // t10 = (a0^2+a1^2)^-1 ++ fpneg(a->c1); // a = a0-i*a1 ++ fpmul_mont(a->c0, t1->c0, a->c0); ++ fpmul_mont(a->c1, t1->c0, a->c1); // a = (a0-i*a1)*(a0^2+a1^2)^-1 ++} ++ ++ ++void to_fp2mont(const f2elm_t a, f2elm_t mc) ++{ // Conversion of a GF(p^2) element to Montgomery representation, ++ // mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). ++ ++ to_mont(a->c0, mc->c0); ++ to_mont(a->c1, mc->c1); ++} ++ ++ ++void from_fp2mont(const f2elm_t ma, f2elm_t c) ++{ // Conversion of a GF(p^2) element from Montgomery representation to standard representation, ++ // c_i = ma_i*R^(-1) = a_i in GF(p^2). ++ ++ from_mont(ma->c0, c->c0); ++ from_mont(ma->c1, c->c1); ++} ++ ++ ++inline unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) ++{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. ++ unsigned int i, carry = 0; ++ ++ for (i = 0; i < nwords; i++) { ++ ADDC(carry, a[i], b[i], carry, c[i]); ++ } ++ ++ return carry; ++} ++ ++ ++void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords) ++{ ++ unsigned int i, j = 0; ++ ++ while (shift > RADIX) { ++ j += 1; ++ shift -= RADIX; ++ } ++ ++ for (i = 0; i < nwords-j; i++) ++ x[nwords-1-i] = x[nwords-1-i-j]; ++ for (i = nwords-j; i < nwords; i++) ++ x[nwords-1-i] = 0; ++ if (shift != 0) { ++ for (j = nwords-1; j > 0; j--) ++ SHIFTL(x[j], x[j-1], shift, x[j], RADIX); ++ x[0] <<= shift; ++ } ++} ++ ++ ++void mp_shiftr1(digit_t* x, const unsigned int nwords) ++{ // Multiprecision right shift by one. 
++ unsigned int i; ++ ++ for (i = 0; i < nwords-1; i++) { ++ SHIFTR(x[i+1], x[i], 1, x[i], RADIX); ++ } ++ x[nwords-1] >>= 1; ++} ++ ++ ++void mp_shiftl1(digit_t* x, const unsigned int nwords) ++{ // Multiprecision left shift by one. ++ int i; ++ ++ for (i = nwords-1; i > 0; i--) { ++ SHIFTL(x[i], x[i-1], 1, x[i], RADIX); ++ } ++ x[0] <<= 1; ++} +diff --git a/third_party/sidh/src/generic/fp_generic.c b/third_party/sidh/src/generic/fp_generic.c +new file mode 100644 +index 000000000..325053f1c +--- /dev/null ++++ b/third_party/sidh/src/generic/fp_generic.c +@@ -0,0 +1,222 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: portable modular arithmetic for P503 ++*********************************************************************************************/ ++ ++#include "../internal.h" ++#include "../P503_internal.h" ++ ++ ++// Global constants ++extern const struct params_t kP503Params; ++ ++inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Modular addition, c = a+b mod p503. ++ // Inputs: a, b in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ unsigned int i, carry = 0; ++ digit_t mask; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(carry, a[i], b[i], carry, c[i]); ++ } ++ ++ carry = 0; ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(carry, c[i], ((digit_t*)kP503Params.primeX2)[i], carry, c[i]); ++ } ++ mask = 0 - (digit_t)carry; ++ ++ carry = 0; ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(carry, c[i], ((digit_t*)kP503Params.primeX2)[i] & mask, carry, c[i]); ++ } ++} ++ ++ ++inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) ++{ // Modular subtraction, c = a-b mod p503. 
++ // Inputs: a, b in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ unsigned int i, borrow = 0; ++ digit_t mask; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, a[i], b[i], borrow, c[i]); ++ } ++ mask = 0 - (digit_t)borrow; ++ ++ borrow = 0; ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(borrow, c[i], ((digit_t*)kP503Params.primeX2)[i] & mask, borrow, c[i]); ++ } ++} ++ ++ ++inline void fpneg503(digit_t* a) ++{ // Modular negation, a = -a mod p503. ++ // Input/output: a in [0, 2*p503-1] ++ unsigned int i, borrow = 0; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, ((digit_t*)kP503Params.primeX2)[i], a[i], borrow, a[i]); ++ } ++} ++ ++ ++void fpdiv2_503(const digit_t* a, digit_t* c) ++{ // Modular division by two, c = a/2 mod p503. ++ // Input : a in [0, 2*p503-1] ++ // Output: c in [0, 2*p503-1] ++ unsigned int i, carry = 0; ++ digit_t mask; ++ ++ mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(carry, a[i], ((digit_t*)kP503Params.prime)[i] & mask, carry, c[i]); ++ } ++ ++ mp_shiftr1(c, NWORDS_FIELD); ++} ++ ++ ++void fpcorrection503(digit_t* a) ++{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. 
++ unsigned int i, borrow = 0; ++ digit_t mask; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ SUBC(borrow, a[i], ((digit_t*)kP503Params.prime)[i], borrow, a[i]); ++ } ++ mask = 0 - (digit_t)borrow; ++ ++ borrow = 0; ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ ADDC(borrow, a[i], ((digit_t*)kP503Params.prime)[i] & mask, borrow, a[i]); ++ } ++} ++ ++ ++void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) ++{ // Digit multiplication, digit * digit -> 2-digit result ++ register digit_t al, ah, bl, bh, temp; ++ digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; ++ digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); ++ ++ al = a & mask_low; // Low part ++ ah = a >> (sizeof(digit_t) * 4); // High part ++ bl = b & mask_low; ++ bh = b >> (sizeof(digit_t) * 4); ++ ++ albl = al*bl; ++ albh = al*bh; ++ ahbl = ah*bl; ++ ahbh = ah*bh; ++ c[0] = albl & mask_low; // C00 ++ ++ res1 = albl >> (sizeof(digit_t) * 4); ++ res2 = ahbl & mask_low; ++ res3 = albh & mask_low; ++ temp = res1 + res2 + res3; ++ carry = temp >> (sizeof(digit_t) * 4); ++ c[0] ^= temp << (sizeof(digit_t) * 4); // C01 ++ ++ res1 = ahbl >> (sizeof(digit_t) * 4); ++ res2 = albh >> (sizeof(digit_t) * 4); ++ res3 = ahbh & mask_low; ++ temp = res1 + res2 + res3 + carry; ++ c[1] = temp & mask_low; // C10 ++ carry = temp & mask_high; ++ c[1] ^= (ahbh & mask_high) + carry; // C11 ++} ++ ++ ++void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) ++{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
++ unsigned int i, j; ++ digit_t t = 0, u = 0, v = 0, UV[2]; ++ unsigned int carry = 0; ++ ++ for (i = 0; i < nwords; i++) { ++ for (j = 0; j <= i; j++) { ++ MUL(a[j], b[i-j], UV+1, UV[0]); ++ ADDC(0, UV[0], v, carry, v); ++ ADDC(carry, UV[1], u, carry, u); ++ t += carry; ++ } ++ c[i] = v; ++ v = u; ++ u = t; ++ t = 0; ++ } ++ ++ for (i = nwords; i < 2*nwords-1; i++) { ++ for (j = i-nwords+1; j < nwords; j++) { ++ MUL(a[j], b[i-j], UV+1, UV[0]); ++ ADDC(0, UV[0], v, carry, v); ++ ADDC(carry, UV[1], u, carry, u); ++ t += carry; ++ } ++ c[i] = v; ++ v = u; ++ u = t; ++ t = 0; ++ } ++ c[2*nwords-1] = v; ++} ++ ++ ++void rdc_mont(const digit_t* ma, digit_t* mc) ++{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p503. ++ // mc = ma*R^-1 mod p503x2, where R = 2^512. ++ // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. ++ // ma is assumed to be in Montgomery representation. ++ unsigned int i, j, carry, count = p503_ZERO_WORDS; ++ digit_t UV[2], t = 0, u = 0, v = 0; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ mc[i] = 0; ++ } ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ for (j = 0; j < i; j++) { ++ if (j < (i-p503_ZERO_WORDS+1)) { ++ MUL(mc[j], ((digit_t*)kP503Params.primeP1)[i-j], UV+1, UV[0]); ++ ADDC(0, UV[0], v, carry, v); ++ ADDC(carry, UV[1], u, carry, u); ++ t += carry; ++ } ++ } ++ ADDC(0, v, ma[i], carry, v); ++ ADDC(carry, u, 0, carry, u); ++ t += carry; ++ mc[i] = v; ++ v = u; ++ u = t; ++ t = 0; ++ } ++ ++ for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { ++ if (count > 0) { ++ count -= 1; ++ } ++ for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { ++ if (j < (NWORDS_FIELD-count)) { ++ MUL(mc[j], ((digit_t*)kP503Params.primeP1)[i-j], UV+1, UV[0]); ++ ADDC(0, UV[0], v, carry, v); ++ ADDC(carry, UV[1], u, carry, u); ++ t += carry; ++ } ++ } ++ ADDC(0, v, ma[i], carry, v); ++ ADDC(carry, u, 0, carry, u); ++ t += carry; ++ mc[i-NWORDS_FIELD] = v; ++ v = u; ++ u = t; ++ t = 0; ++ } ++ ADDC(0, v, 
ma[2*NWORDS_FIELD-1], carry, v); ++ mc[NWORDS_FIELD-1] = v; ++} +\ No newline at end of file +diff --git a/third_party/sidh/src/internal.h b/third_party/sidh/src/internal.h +new file mode 100644 +index 000000000..8a32b465c +--- /dev/null ++++ b/third_party/sidh/src/internal.h +@@ -0,0 +1,96 @@ ++#ifndef INTERNAL_H_ ++#define INTERNAL_H_ ++ ++#include "sidh/def_p503.h" ++ ++/********************** Macros for platform-dependent operations **********************/ ++ ++#if defined(OPENSSL_NO_ASM) ++ ++/********************** Constant-time unsigned comparisons ***********************/ ++ ++// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise ++ ++static inline unsigned int is_digit_nonzero_ct(digit_t x) ++{ // Is x != 0? ++ return (unsigned int)((x | (0-x)) >> (RADIX-1)); ++} ++ ++static inline unsigned int is_digit_zero_ct(digit_t x) ++{ // Is x = 0? ++ return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); ++} ++ ++static inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) ++{ // Is x < y? 
++ return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX-1)); ++} ++ ++// Digit multiplication ++#define MUL(multiplier, multiplicand, hi, lo) \ ++ digit_x_digit((multiplier), (multiplicand), &(lo)); ++ ++// Digit addition with carry ++#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ ++ { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ ++ (sumOut) = (addend2) + tempReg; \ ++ (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } ++ ++// Digit subtraction with borrow ++#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ ++ { digit_t tempReg = (minuend) - (subtrahend); \ ++ unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ ++ (differenceOut) = tempReg - (digit_t)(borrowIn); \ ++ (borrowOut) = borrowReg; } ++ ++// Shift right with flexible datatype ++#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ ++ (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); ++ ++// Shift left with flexible datatype ++#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ ++ (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift))); ++ ++// 64x64-bit multiplication ++#define MUL128(multiplier, multiplicand, product) \ ++ mp_mul((digit_t*)&(multiplier), (digit_t*)&(multiplicand), (digit_t*)&(product), NWORDS_FIELD/2); ++ ++// 128-bit addition, inputs < 2^127 ++#define ADD128(addend1, addend2, addition) \ ++ mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); ++ ++// 128-bit addition with output carry ++#define ADC128(addend1, addend2, carry, addition) \ ++ (carry) = mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); ++ ++#else ++ ++// Digit multiplication ++#define MUL(multiplier, multiplicand, hi, lo) \ ++ { uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ ++ *(hi) = 
(digit_t)(tempReg >> RADIX); \ ++ (lo) = (digit_t)tempReg; } ++ ++// Digit addition with carry ++#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ ++ { uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ ++ (carryOut) = (digit_t)(tempReg >> RADIX); \ ++ (sumOut) = (digit_t)tempReg; } ++ ++// Digit subtraction with borrow ++#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ ++ { uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ ++ (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1)); \ ++ (differenceOut) = (digit_t)tempReg; } ++ ++// Digit shift right ++#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ ++ (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); ++ ++// Digit shift left ++#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ ++ (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); ++ ++#endif ++ ++#endif // INTERNAL_H_ +diff --git a/third_party/sidh/src/sidh.c b/third_party/sidh/src/sidh.c +new file mode 100644 +index 000000000..fdbd87e28 +--- /dev/null ++++ b/third_party/sidh/src/sidh.c +@@ -0,0 +1,410 @@ ++/******************************************************************************************** ++* SIDH: an efficient supersingular isogeny cryptography library ++* ++* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) ++*********************************************************************************************/ ++ ++#include "openssl/bn.h" ++#include "openssl/base.h" ++ ++#include "sidh/def_p503.h" ++#include "P503_internal.h" ++ ++extern const struct params_t kP503Params; ++ ++// Returns private key bit size for type A (IsInitiator=true) or B (IsInitiator=false) type. ++static inline size_t PrvKeyBitSz(int IsInitiator) { ++ return IsInitiator?SIDHp503_PRV_A_BITSZ:SIDHp503_PRV_B_BITSZ; ++} ++ ++// Swap points. 
++// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P ++#if !defined(OPENSSL_X86_64) || defined(OPENSSL_NO_ASM) ++static void cswap(point_proj_t P, point_proj_t Q, const digit_t option) ++{ ++ digit_t temp; ++ unsigned int i; ++ ++ for (i = 0; i < NWORDS_FIELD; i++) { ++ temp = option & (P->X->c0[i] ^ Q->X->c0[i]); ++ P->X->c0[i] = temp ^ P->X->c0[i]; ++ Q->X->c0[i] = temp ^ Q->X->c0[i]; ++ temp = option & (P->Z->c0[i] ^ Q->Z->c0[i]); ++ P->Z->c0[i] = temp ^ P->Z->c0[i]; ++ Q->Z->c0[i] = temp ^ Q->Z->c0[i]; ++ temp = option & (P->X->c1[i] ^ Q->X->c1[i]); ++ P->X->c1[i] = temp ^ P->X->c1[i]; ++ Q->X->c1[i] = temp ^ Q->X->c1[i]; ++ temp = option & (P->Z->c1[i] ^ Q->Z->c1[i]); ++ P->Z->c1[i] = temp ^ P->Z->c1[i]; ++ Q->Z->c1[i] = temp ^ Q->Z->c1[i]; ++ } ++} ++#endif ++ ++// Swap points. ++// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P ++static inline void fp2cswap(point_proj_t P, point_proj_t Q, const digit_t option) ++{ ++#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) ++ cswap_asm(P, Q, option); ++#else ++ cswap(P, Q, option); ++#endif ++} ++ ++static void LADDER3PT(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const digit_t* m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t A) ++{ ++ point_proj_t R0 = POINT_PROJ_INIT, R2 = POINT_PROJ_INIT; ++ f2elm_t A24 = F2ELM_INIT; ++ digit_t mask; ++ int i, nbits, bit, swap, prevbit = 0; ++ ++ if (AliceOrBob == ALICE) { ++ nbits = OALICE_BITS; ++ } else { ++ nbits = OBOB_BITS; ++ } ++ ++ // Initializing constant ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, A24->c0); ++ fp2add(A24, A24, A24); ++ fp2add(A, A24, A24); ++ fp2div2(A24, A24); ++ fp2div2(A24, A24); // A24 = (A+2)/4 ++ ++ // Initializing points ++ fp2copy(xQ, R0->X); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (digit_t*)R0->Z); ++ fp2copy(xPQ, R2->X); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (digit_t*)R2->Z); ++ fp2copy(xP, R->X); ++ 
fpcopy((digit_t*)&kP503Params.Montgomery_one, (digit_t*)R->Z); ++ fpzero((digit_t*)(R->Z)->c1); ++ ++ // Main loop ++ for (i = 0; i < nbits; i++) { ++ bit = (m[i >> LOG2RADIX] >> (i & (RADIX-1))) & 1; ++ swap = bit ^ prevbit; ++ prevbit = bit; ++ mask = 0 - (digit_t)swap; ++ ++ fp2cswap(R, R2, mask); ++ xDBLADD(R0, R2, R->X, A24); ++ fp2mul_mont(R2->X, R->Z, R2->X); ++ } ++} ++ ++// Initialization of basis points ++static void init_basis(digit_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) ++{ ++ ++ fpcopy(gen, XP->c0); ++ fpcopy(gen + NWORDS_FIELD, XP->c1); ++ fpcopy(gen + 2*NWORDS_FIELD, XQ->c0); ++ fpzero(XQ->c1); ++ fpcopy(gen + 3*NWORDS_FIELD, XR->c0); ++ fpcopy(gen + 4*NWORDS_FIELD, XR->c1); ++} ++ ++// Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes ++static void fp2_encode(const f2elm_t x, unsigned char *enc) ++{ ++ unsigned int i; ++ f2elm_t t; ++ ++ from_fp2mont(x, t); ++ for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { ++ enc[i] = ((unsigned char*)t)[i]; ++ enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char*)t)[i + MAXBITS_FIELD / 8]; ++ } ++} ++ ++// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation ++static void fp2_decode(const unsigned char *enc, f2elm_t x) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < 2*(MAXBITS_FIELD / 8); i++) ((unsigned char *)x)[i] = 0; ++ for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { ++ ((unsigned char*)x)[i] = enc[i]; ++ ((unsigned char*)x)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; ++ } ++ to_fp2mont(x, x); ++} ++ ++int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA) ++{ ++ point_proj_t R, phiP = POINT_PROJ_INIT, phiQ = POINT_PROJ_INIT, phiR = POINT_PROJ_INIT, pts[MAX_INT_POINTS_ALICE]; ++ f2elm_t XPA, XQA, XRA, coeff[3], A24plus = F2ELM_INIT, C24 = F2ELM_INIT, A = F2ELM_INIT; ++ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; ++ ++ // 
Initialize basis points ++ init_basis((digit_t*)kP503Params.A_gen, XPA, XQA, XRA); ++ init_basis((digit_t*)kP503Params.B_gen, phiP->X, phiQ->X, phiR->X); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (phiP->Z)->c0); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (phiQ->Z)->c0); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (phiR->Z)->c0); ++ ++ // Initialize constants ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, A24plus->c0); ++ fp2add(A24plus, A24plus, C24); ++ ++ // Retrieve kernel point ++ LADDER3PT(XPA, XQA, XRA, (digit_t*)PrivateKeyA, ALICE, R, A); ++ ++ // Traverse tree ++ index = 0; ++ for (row = 1; row < MAX_Alice; row++) { ++ while (index < MAX_Alice-row) { ++ fp2copy(R->X, pts[npts]->X); ++ fp2copy(R->Z, pts[npts]->Z); ++ pts_index[npts++] = index; ++ m = kP503Params.strat_Alice[ii++]; ++ xDBLe(R, R, A24plus, C24, (int)(2*m)); ++ index += m; ++ } ++ get_4_isog(R, A24plus, C24, coeff); ++ ++ for (i = 0; i < npts; i++) { ++ eval_4_isog(pts[i], coeff); ++ } ++ eval_4_isog(phiP, coeff); ++ eval_4_isog(phiQ, coeff); ++ eval_4_isog(phiR, coeff); ++ ++ fp2copy(pts[npts-1]->X, R->X); ++ fp2copy(pts[npts-1]->Z, R->Z); ++ index = pts_index[npts-1]; ++ npts -= 1; ++ } ++ ++ get_4_isog(R, A24plus, C24, coeff); ++ eval_4_isog(phiP, coeff); ++ eval_4_isog(phiQ, coeff); ++ eval_4_isog(phiR, coeff); ++ ++ inv_3_way(phiP->Z, phiQ->Z, phiR->Z); ++ fp2mul_mont(phiP->X, phiP->Z, phiP->X); ++ fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); ++ fp2mul_mont(phiR->X, phiR->Z, phiR->X); ++ ++ // Format public key ++ fp2_encode(phiP->X, PublicKeyA); ++ fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); ++ fp2_encode(phiR->X, PublicKeyA + 2*FP2_ENCODED_BYTES); ++ ++ return 0; ++} ++ ++ ++int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB) ++{ ++ point_proj_t R, phiP = POINT_PROJ_INIT, phiQ = POINT_PROJ_INIT, phiR = POINT_PROJ_INIT, pts[MAX_INT_POINTS_BOB]; ++ f2elm_t XPB, XQB, XRB, coeff[3], A24plus = F2ELM_INIT, A24minus = F2ELM_INIT, A = 
F2ELM_INIT; ++ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; ++ ++ // Initialize basis points ++ init_basis((digit_t*)kP503Params.B_gen, XPB, XQB, XRB); ++ init_basis((digit_t*)kP503Params.A_gen, phiP->X, phiQ->X, phiR->X); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (phiP->Z)->c0); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (phiQ->Z)->c0); ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, (phiR->Z)->c0); ++ ++ // Initialize constants ++ fpcopy((digit_t*)&kP503Params.Montgomery_one, A24plus->c0); ++ fp2add(A24plus, A24plus, A24plus); ++ fp2copy(A24plus, A24minus); ++ fp2neg(A24minus); ++ ++ // Retrieve kernel point ++ LADDER3PT(XPB, XQB, XRB, (digit_t*)PrivateKeyB, BOB, R, A); ++ ++ // Traverse tree ++ index = 0; ++ for (row = 1; row < MAX_Bob; row++) { ++ while (index < MAX_Bob-row) { ++ fp2copy(R->X, pts[npts]->X); ++ fp2copy(R->Z, pts[npts]->Z); ++ pts_index[npts++] = index; ++ m = kP503Params.strat_Bob[ii++]; ++ xTPLe(R, R, A24minus, A24plus, (int)m); ++ index += m; ++ } ++ get_3_isog(R, A24minus, A24plus, coeff); ++ ++ for (i = 0; i < npts; i++) { ++ eval_3_isog(pts[i], coeff); ++ } ++ eval_3_isog(phiP, coeff); ++ eval_3_isog(phiQ, coeff); ++ eval_3_isog(phiR, coeff); ++ ++ fp2copy(pts[npts-1]->X, R->X); ++ fp2copy(pts[npts-1]->Z, R->Z); ++ index = pts_index[npts-1]; ++ npts -= 1; ++ } ++ ++ get_3_isog(R, A24minus, A24plus, coeff); ++ eval_3_isog(phiP, coeff); ++ eval_3_isog(phiQ, coeff); ++ eval_3_isog(phiR, coeff); ++ ++ inv_3_way(phiP->Z, phiQ->Z, phiR->Z); ++ fp2mul_mont(phiP->X, phiP->Z, phiP->X); ++ fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); ++ fp2mul_mont(phiR->X, phiR->Z, phiR->X); ++ ++ // Format public key ++ fp2_encode(phiP->X, PublicKeyB); ++ fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); ++ fp2_encode(phiR->X, PublicKeyB + 2*FP2_ENCODED_BYTES); ++ ++ return 0; ++} ++ ++ ++int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA) 
++{ ++ point_proj_t R, pts[MAX_INT_POINTS_ALICE]; ++ f2elm_t coeff[3], PKB[3], jinv; ++ f2elm_t A24plus = F2ELM_INIT, C24 = F2ELM_INIT, A = F2ELM_INIT; ++ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; ++ ++ // Initialize images of Bob's basis ++ fp2_decode(PublicKeyB, PKB[0]); ++ fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, PKB[1]); ++ fp2_decode(PublicKeyB + 2*FP2_ENCODED_BYTES, PKB[2]); ++ ++ // Initialize constants ++ get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? ++ fpadd((digit_t*)&kP503Params.Montgomery_one, (digit_t*)&kP503Params.Montgomery_one, C24->c0); ++ fp2add(A, C24, A24plus); ++ fpadd(C24->c0, C24->c0, C24->c0); ++ ++ // Retrieve kernel point ++ LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t*)PrivateKeyA, ALICE, R, A); ++ ++ // Traverse tree ++ index = 0; ++ for (row = 1; row < MAX_Alice; row++) { ++ while (index < MAX_Alice-row) { ++ fp2copy(R->X, pts[npts]->X); ++ fp2copy(R->Z, pts[npts]->Z); ++ pts_index[npts++] = index; ++ m = kP503Params.strat_Alice[ii++]; ++ xDBLe(R, R, A24plus, C24, (int)(2*m)); ++ index += m; ++ } ++ get_4_isog(R, A24plus, C24, coeff); ++ ++ for (i = 0; i < npts; i++) { ++ eval_4_isog(pts[i], coeff); ++ } ++ ++ fp2copy(pts[npts-1]->X, R->X); ++ fp2copy(pts[npts-1]->Z, R->Z); ++ index = pts_index[npts-1]; ++ npts -= 1; ++ } ++ ++ get_4_isog(R, A24plus, C24, coeff); ++ fp2div2(C24, C24); ++ fp2sub(A24plus, C24, A24plus); ++ fp2div2(C24, C24); ++ j_inv(A24plus, C24, jinv); ++ fp2_encode(jinv, SharedSecretA); // Format shared secret ++ ++ return 0; ++} ++ ++int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB) ++{ ++ point_proj_t R, pts[MAX_INT_POINTS_BOB]; ++ f2elm_t coeff[3], PKB[3], jinv; ++ f2elm_t A24plus = F2ELM_INIT, A24minus = F2ELM_INIT, A = F2ELM_INIT; ++ unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; ++ ++ // Initialize images of Alice's basis ++ 
fp2_decode(PublicKeyA, PKB[0]); ++ fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, PKB[1]); ++ fp2_decode(PublicKeyA + 2*FP2_ENCODED_BYTES, PKB[2]); ++ ++ // Initialize constants ++ get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? ++ fpadd((digit_t*)&kP503Params.Montgomery_one, (digit_t*)&kP503Params.Montgomery_one, A24minus->c0); ++ fp2add(A, A24minus, A24plus); ++ fp2sub(A, A24minus, A24minus); ++ ++ // Retrieve kernel point ++ LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t*)PrivateKeyB, BOB, R, A); ++ ++ // Traverse tree ++ index = 0; ++ for (row = 1; row < MAX_Bob; row++) { ++ while (index < MAX_Bob-row) { ++ fp2copy(R->X, pts[npts]->X); ++ fp2copy(R->Z, pts[npts]->Z); ++ pts_index[npts++] = index; ++ m = kP503Params.strat_Bob[ii++]; ++ xTPLe(R, R, A24minus, A24plus, (int)m); ++ index += m; ++ } ++ get_3_isog(R, A24minus, A24plus, coeff); ++ ++ for (i = 0; i < npts; i++) { ++ eval_3_isog(pts[i], coeff); ++ } ++ ++ fp2copy(pts[npts-1]->X, R->X); ++ fp2copy(pts[npts-1]->Z, R->Z); ++ index = pts_index[npts-1]; ++ npts -= 1; ++ } ++ ++ get_3_isog(R, A24minus, A24plus, coeff); ++ fp2add(A24plus, A24minus, A); ++ fp2add(A, A, A); ++ fp2sub(A24plus, A24minus, A24plus); ++ j_inv(A, A24plus, jinv); ++ fp2_encode(jinv, SharedSecretB); // Format shared secret ++ ++ return 0; ++} ++ ++int EphemeralKeyPair_SIDHp503(unsigned char* PrivateKey, unsigned char* PublicKey, int IsInitiator) { ++ int ret = -1; ++ ++ BN_CTX *ctx = BN_CTX_new(); ++ if (!ctx) { ++ goto end; ++ } ++ ++ // Calculate private key for Alice. 
Needs to be in range [0, 2^0xFA - 1] and < 250 bits ++ BIGNUM *bn_sidh_prv = BN_CTX_get(ctx); ++ if (!bn_sidh_prv) { ++ goto end; ++ } ++ ++ if (!BN_rand(bn_sidh_prv, PrvKeyBitSz(IsInitiator), BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY)) { ++ goto end; ++ } ++ ++ // Convert to little endian ++ if (!BN_bn2le_padded(PrivateKey, NBITS_TO_NBYTES(PrvKeyBitSz(IsInitiator)), bn_sidh_prv)) { ++ goto end; ++ } ++ ++ // Never fails ++ IsInitiator ++ ?(void)EphemeralKeyGeneration_A_SIDHp503(PrivateKey, PublicKey) ++ :(void)EphemeralKeyGeneration_B_SIDHp503(PrivateKey, PublicKey); ++ ++ // All good ++ ret = 0; ++ ++end: ++ BN_CTX_free(ctx); ++ return ret; ++} +diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt +index 7f340171d..6a796cc48 100644 +--- a/tool/CMakeLists.txt ++++ b/tool/CMakeLists.txt +@@ -1,4 +1,4 @@ +-include_directories(../include) ++include_directories(../include ../third_party/sidh/include) + + add_executable( + bssl +diff --git a/tool/speed.cc b/tool/speed.cc +index 2175baa24..4d86441ea 100644 +--- a/tool/speed.cc ++++ b/tool/speed.cc +@@ -50,6 +50,10 @@ OPENSSL_MSVC_PRAGMA(warning(pop)) + #include "internal.h" + + ++#include "sidh/def_p503.h" ++#include "sidh/P503_api.h" ++ ++ + // TimeResults represents the results of benchmarking a function. + struct TimeResults { + // num_calls is the number of function calls done in the time period. +@@ -282,6 +286,78 @@ static bool SpeedRSAKeyGen(const std::string &selected) { + return true; + } + ++static bool SpeedSIDHP503KeyGen(bool is_initiator) { ++ uint8_t public_SIDH[SIDHp503_PUB_BYTESZ] = {0}; ++ uint8_t private_SIDH[SIDHp503_PRV_KEY_BYTESZ_MAX] = {0}; ++ // Key generation function to be benchmarked ++ std::function keygen = ++ is_initiator? 
++ EphemeralKeyGeneration_A_SIDHp503: ++ EphemeralKeyGeneration_B_SIDHp503; ++ ++ // Generate private key to be used for public key generation ++ if (EphemeralKeyPair_SIDHp503(private_SIDH, public_SIDH, is_initiator)) { ++ return false; ++ } ++ ++ TimeResults results; ++ TimeFunction(&results, ++ [keygen, &private_SIDH, &public_SIDH]() -> bool { ++ // Never fails ++ (void)keygen(private_SIDH, public_SIDH); ++ return true; ++ }); ++ ++ results.Print(std::string("SIDH/P503 KeyGen ") + std::string(is_initiator?"A":"B")); ++ return true; ++} ++ ++static bool SpeedSIDHP503Kex(bool is_initiator) { ++ uint8_t public_SIDH[SIDHp503_PUB_BYTESZ] = {0}; ++ uint8_t private_SIDH[SIDHp503_PRV_KEY_BYTESZ_MAX] = {0}; ++ uint8_t tmp[SIDHp503_PRV_KEY_BYTESZ_MAX] = {0}; ++ uint8_t ss[SIDHp503_SS_BYTESZ] = {0}; ++ ++ // Key agreement function to be benchmarked ++ std::function kex = ++ is_initiator? ++ EphemeralSecretAgreement_A_SIDHp503: ++ EphemeralSecretAgreement_B_SIDHp503; ++ ++ // Generate private key for one side ++ if (EphemeralKeyPair_SIDHp503(private_SIDH, public_SIDH, is_initiator)) { ++ return false; ++ } ++ ++ // Generate public key for other side ++ memset(public_SIDH, 0, sizeof(public_SIDH)); ++ if (EphemeralKeyPair_SIDHp503(tmp, public_SIDH, !is_initiator)) { ++ return false; ++ } ++ ++ TimeResults results; ++ TimeFunction(&results, ++ [kex, &private_SIDH, &public_SIDH, &ss]() -> bool { ++ // Never fails ++ (void)kex(private_SIDH, public_SIDH, ss); ++ return true; ++ }); ++ ++ results.Print(std::string("SIDH/P503 KEX ") + std::string(is_initiator?"A":"B")); ++ return true; ++} ++ ++static bool SpeedSIDHP503(const std::string &selected) { ++ if (!selected.empty() && selected.find("SIDH") == std::string::npos) { ++ return true; ++ } ++ return ++ SpeedSIDHP503KeyGen(true) && ++ SpeedSIDHP503KeyGen(false) && ++ SpeedSIDHP503Kex(true) && ++ SpeedSIDHP503Kex(false); ++} ++ + static uint8_t *align(uint8_t *in, unsigned alignment) { + return reinterpret_cast( + 
(reinterpret_cast(in) + alignment) & +@@ -815,6 +891,7 @@ bool Speed(const std::vector &args) { + !SpeedECDH(selected) || + !SpeedECDSA(selected) || + !Speed25519(selected) || ++ !SpeedSIDHP503(selected) || + !SpeedSPAKE2(selected) || + !SpeedScrypt(selected) || + !SpeedRSAKeyGen(selected)) { diff --git a/_dev/interop_test_runner b/_dev/interop_test_runner index cdb1da5..60dce37 100755 --- a/_dev/interop_test_runner +++ b/_dev/interop_test_runner @@ -20,7 +20,7 @@ RE_PATTERN_ALPN = "ALPN protocol: npn_proto$" # Successful TLS establishement from TRIS RE_TRIS_ALL_PASSED = ".*All handshakes passed.*" # TLS handshake from BoringSSL with SIDH/P503-X25519 -RE_BORINGSSL_P503 = "ECDHE curve: x25519sidh503" +RE_BORINGSSL_P503 = "ECDHE curve: X25519-SIDHp503" class Docker(object): ''' Utility class used for starting/stoping servers and clients during tests''' @@ -227,7 +227,7 @@ class InteropServer_BoringSSL(InteropServer, ServerNominalMixin, ServerClientAut ''' Connects to TRIS server listening on 7443 and tries to perform key agreement with SIDH/P503-X25519 ''' - res = self.d.run_client(self.CLIENT_NAME, self.server_ip+":7443 "+'-curves x25519sidh503') + res = self.d.run_client(self.CLIENT_NAME, self.server_ip+":7443 "+'-curves X25519-SIDHp503') self.assertEqual(res[0], 0) self.assertIsNotNone(re.search(RE_BORINGSSL_P503, res[1], re.MULTILINE)) self.assertIsNotNone(re.search(RE_PATTERN_HELLO_TLS_13_NORESUME, res[1], re.MULTILINE)) @@ -257,7 +257,7 @@ class InteropClient_BoringSSL(InteropClient, ClientNominalMixin, ClientClientAut ''' Connects to BoringSSL server listening on 7443 and tries to perform key agreement with SIDH/P503-X25519 ''' - res = self.d.run_client(self.CLIENT_NAME, '-rsa=false -ecdsa=true -qr SIDH-P503-X25519 ' + self.server_ip+":7443") + res = self.d.run_client(self.CLIENT_NAME, '-rsa=false -ecdsa=true -qr X25519-SIDHp503 ' + self.server_ip+":7443") self.assertEqual(res[0], 0) self.assertIsNotNone(re.search(RE_TRIS_ALL_PASSED, res[1], re.MULTILINE)) @@ 
-278,7 +278,7 @@ class InteropServer_TRIS(ClientNominalMixin, InteropServer, unittest.TestCase): self.assertEqual(res[0], 0) def test_SIDH(self): - res = self.d.run_client(self.CLIENT_NAME, '-rsa=false -ecdsa=true -qr SIDH-P503-X25519 '+self.server_ip+":7443") + res = self.d.run_client(self.CLIENT_NAME, '-rsa=false -ecdsa=true -qr X25519-SIDHp503 '+self.server_ip+":7443") self.assertEqual(res[0], 0) def test_server_doesnt_support_SIDH(self): diff --git a/_dev/tris-localserver/server.go b/_dev/tris-localserver/server.go index 6298b5b..f2ee889 100644 --- a/_dev/tris-localserver/server.go +++ b/_dev/tris-localserver/server.go @@ -56,7 +56,7 @@ func NewServer() *server { } func enableQR(s *server, enableDefault bool) { - var sidhCurves = []tls.CurveID{tls.HybridSidhP503Curve25519} + var sidhCurves = []tls.CurveID{tls.HybridSIDHp503Curve25519} if enableDefault { var defaultCurvePreferences = []tls.CurveID{tls.X25519, tls.CurveP256, tls.CurveP384, tls.CurveP521} s.TLS.CurvePreferences = append(s.TLS.CurvePreferences, defaultCurvePreferences...) 
diff --git a/_dev/tris-testclient/client.go b/_dev/tris-testclient/client.go index 56532fe..aa2a5ca 100644 --- a/_dev/tris-testclient/client.go +++ b/_dev/tris-testclient/client.go @@ -53,10 +53,8 @@ func (c *Client) setMinMaxTLS(ver uint16) { func getQrAlgoId(qr string) tls.CurveID { switch qr { - case "SIDH-P503-X25519": - return tls.HybridSidhP503Curve25519 - //case "SIDH-P751-X448": - // return tls.HybridSidhP751Curve448 + case "X25519-SIDHp503": + return tls.HybridSIDHp503Curve25519 default: return 0 } @@ -110,7 +108,7 @@ func main() { flag.BoolVar(&enable_rsa, "rsa", true, "Whether to enable RSA cipher suites") flag.BoolVar(&enable_ecdsa, "ecdsa", true, "Whether to enable ECDSA cipher suites") flag.BoolVar(&client_auth, "cliauth", false, "Whether to enable client authentication") - flag.StringVar(&qrAlgoName, "qr", "", "Specifies qr algorithm from following list:\n[SIDH-P503-X25519, SIDH-P751-X448]") + flag.StringVar(&qrAlgoName, "qr", "", "Specifies qr algorithm from following list:\n[X25519-SIDHp503]") flag.Parse() if flag.NArg() != 1 { flag.Usage() diff --git a/common.go b/common.go index 7f5021a..459b1ef 100644 --- a/common.go +++ b/common.go @@ -123,12 +123,7 @@ const ( X25519 CurveID = 29 // Experimental KEX - HybridSidhP503Curve25519 CurveID = 0x0105 + (sidhP503 & 0xFF) // HybridSIDH: X25519 + P503 - // HybridSidhP751Curve448 CurveID = 0x0105 + (sidhP751 & 0xFF) // HybridSIDH: X448 + P751 - - // Internal usage. 
Deliberately not exported - sidhP503 CurveID = 0xFE00 - sidhP751 CurveID = 0xFE01 + HybridSIDHp503Curve25519 CurveID = 0xFE30 ) // TLS 1.3 Key Share diff --git a/handshake_messages.go b/handshake_messages.go index 8181f27..91f0ca8 100644 --- a/handshake_messages.go +++ b/handshake_messages.go @@ -658,7 +658,7 @@ func (m *clientHelloMsg) unmarshal(data []byte) alert { return alertDecodeError } case extensionKeyShare: - // https://tools.ietf.org/html/draft-ietf-tls-tls13-18#section-4.2.5 + // https://tools.ietf.org/html/rfc8446#section-4.2.8 if length < 2 { return alertDecodeError }