boringssl/third_party/fiat/curve25519.c
David Benjamin 32e59d2d32 Switch to new fiat pipeline.
This new version makes it much easier to tell which code is handwritten
and which is verified. For some reason, it also is *dramatically* faster
for 32-bit x86 GCC. Clang x86_64, however, does take a small hit.
Benchmarks below.

x86, GCC 7.3.0, OPENSSL_SMALL
(For some reason, GCC used to be really bad at compiling the 32-bit curve25519
code. The new one fixes this. I'm not sure what changed.)
Before:
Did 17135 Ed25519 key generation operations in 10026402us (1709.0 ops/sec)
Did 17170 Ed25519 signing operations in 10074192us (1704.4 ops/sec)
Did 9180 Ed25519 verify operations in 10034025us (914.9 ops/sec)
Did 17271 Curve25519 base-point multiplication operations in 10050837us (1718.4 ops/sec)
Did 10605 Curve25519 arbitrary point multiplication operations in 10047714us (1055.5 ops/sec)
Did 7800 ECDH P-256 operations in 10018331us (778.6 ops/sec)
Did 24308 ECDSA P-256 signing operations in 10019241us (2426.1 ops/sec)
Did 9191 ECDSA P-256 verify operations in 10081639us (911.7 ops/sec)
After:
Did 99873 Ed25519 key generation operations in 10021810us (9965.6 ops/sec) [+483.1%]
Did 99960 Ed25519 signing operations in 10052236us (9944.1 ops/sec) [+483.4%]
Did 53676 Ed25519 verify operations in 10009078us (5362.7 ops/sec) [+486.2%]
Did 102000 Curve25519 base-point multiplication operations in 10039764us (10159.6 ops/sec) [+491.2%]
Did 60802 Curve25519 arbitrary point multiplication operations in 10056897us (6045.8 ops/sec) [+472.8%]
Did 7900 ECDH P-256 operations in 10054509us (785.7 ops/sec) [+0.9%]
Did 24926 ECDSA P-256 signing operations in 10050919us (2480.0 ops/sec) [+2.2%]
Did 9494 ECDSA P-256 verify operations in 10064659us (943.3 ops/sec) [+3.5%]

x86, Clang 8.0.0 trunk 349417, OPENSSL_SMALL
Before:
Did 82750 Ed25519 key generation operations in 10051177us (8232.9 ops/sec)
Did 82400 Ed25519 signing operations in 10035806us (8210.6 ops/sec)
Did 41511 Ed25519 verify operations in 10048919us (4130.9 ops/sec)
Did 83300 Curve25519 base-point multiplication operations in 10044283us (8293.3 ops/sec)
Did 49700 Curve25519 arbitrary point multiplication operations in 10007005us (4966.5 ops/sec)
Did 14039 ECDH P-256 operations in 10093929us (1390.8 ops/sec)
Did 40950 ECDSA P-256 signing operations in 10006757us (4092.2 ops/sec)
Did 16068 ECDSA P-256 verify operations in 10095996us (1591.5 ops/sec)
After:
Did 80476 Ed25519 key generation operations in 10048648us (8008.6 ops/sec) [-2.7%]
Did 79050 Ed25519 signing operations in 10049180us (7866.3 ops/sec) [-4.2%]
Did 40501 Ed25519 verify operations in 10048347us (4030.6 ops/sec) [-2.4%]
Did 81300 Curve25519 base-point multiplication operations in 10017480us (8115.8 ops/sec) [-2.1%]
Did 48278 Curve25519 arbitrary point multiplication operations in 10092500us (4783.6 ops/sec) [-3.7%]
Did 15402 ECDH P-256 operations in 10096705us (1525.4 ops/sec) [+9.7%]
Did 44200 ECDSA P-256 signing operations in 10037715us (4403.4 ops/sec) [+7.6%]
Did 17000 ECDSA P-256 verify operations in 10008813us (1698.5 ops/sec) [+6.7%]

x86_64, GCC 7.3.0
(Note these P-256 numbers are not affected by this change. Included to get a
sense of noise.)
Before:
Did 557000 Ed25519 key generation operations in 10011721us (55634.8 ops/sec)
Did 550000 Ed25519 signing operations in 10016449us (54909.7 ops/sec)
Did 190000 Ed25519 verify operations in 10014565us (18972.4 ops/sec)
Did 587000 Curve25519 base-point multiplication operations in 10015402us (58609.7 ops/sec)
Did 230000 Curve25519 arbitrary point multiplication operations in 10023827us (22945.3 ops/sec)
Did 179000 ECDH P-256 operations in 10016294us (17870.9 ops/sec)
Did 557000 ECDSA P-256 signing operations in 10014158us (55621.3 ops/sec)
Did 198000 ECDSA P-256 verify operations in 10036694us (19727.6 ops/sec)
After:
Did 569000 Ed25519 key generation operations in 10004965us (56871.8 ops/sec) [+2.2%]
Did 563000 Ed25519 signing operations in 10000064us (56299.6 ops/sec) [+2.5%]
Did 196000 Ed25519 verify operations in 10025650us (19549.9 ops/sec) [+3.0%]
Did 596000 Curve25519 base-point multiplication operations in 10008666us (59548.4 ops/sec) [+1.6%]
Did 229000 Curve25519 arbitrary point multiplication operations in 10028921us (22834.0 ops/sec) [-0.5%]
Did 182910 ECDH P-256 operations in 10014905us (18263.8 ops/sec) [+2.2%]
Did 562000 ECDSA P-256 signing operations in 10011944us (56133.0 ops/sec) [+0.9%]
Did 202000 ECDSA P-256 verify operations in 10046901us (20105.7 ops/sec) [+1.9%]

x86_64, GCC 7.3.0, OPENSSL_SMALL
Before:
Did 350000 Ed25519 key generation operations in 10002540us (34991.1 ops/sec)
Did 344000 Ed25519 signing operations in 10010420us (34364.2 ops/sec)
Did 197000 Ed25519 verify operations in 10030593us (19639.9 ops/sec)
Did 362000 Curve25519 base-point multiplication operations in 10004615us (36183.3 ops/sec)
Did 235000 Curve25519 arbitrary point multiplication operations in 10025951us (23439.2 ops/sec)
Did 32032 ECDH P-256 operations in 10056486us (3185.2 ops/sec)
Did 96354 ECDSA P-256 signing operations in 10007297us (9628.4 ops/sec)
Did 37774 ECDSA P-256 verify operations in 10044892us (3760.5 ops/sec)
After:
Did 343000 Ed25519 key generation operations in 10025108us (34214.1 ops/sec) [-2.2%]
Did 340000 Ed25519 signing operations in 10014870us (33949.5 ops/sec) [-1.2%]
Did 192000 Ed25519 verify operations in 10025082us (19152.0 ops/sec) [-2.5%]
Did 355000 Curve25519 base-point multiplication operations in 10013220us (35453.1 ops/sec) [-2.0%]
Did 231000 Curve25519 arbitrary point multiplication operations in 10010775us (23075.1 ops/sec) [-1.6%]
Did 31540 ECDH P-256 operations in 10009664us (3151.0 ops/sec) [-1.1%]
Did 99012 ECDSA P-256 signing operations in 10090296us (9812.6 ops/sec) [+1.9%]
Did 37695 ECDSA P-256 verify operations in 10092859us (3734.8 ops/sec) [-0.7%]

x86_64, Clang 8.0.0 trunk 349417
(Note these P-256 numbers are not affected by this change. Included to get a
sense of noise.)
Before:
Did 600000 Ed25519 key generation operations in 10000278us (59998.3 ops/sec)
Did 595000 Ed25519 signing operations in 10010375us (59438.3 ops/sec)
Did 184000 Ed25519 verify operations in 10013984us (18374.3 ops/sec)
Did 636000 Curve25519 base-point multiplication operations in 10005250us (63566.6 ops/sec)
Did 229000 Curve25519 arbitrary point multiplication operations in 10006059us (22886.1 ops/sec)
Did 179250 ECDH P-256 operations in 10026354us (17877.9 ops/sec)
Did 547000 ECDSA P-256 signing operations in 10017585us (54604.0 ops/sec)
Did 197000 ECDSA P-256 verify operations in 10013020us (19674.4 ops/sec)
After:
Did 560000 Ed25519 key generation operations in 10009295us (55948.0 ops/sec) [-6.8%]
Did 548000 Ed25519 signing operations in 10007912us (54756.7 ops/sec) [-7.9%]
Did 170000 Ed25519 verify operations in 10056948us (16903.7 ops/sec) [-8.0%]
Did 592000 Curve25519 base-point multiplication operations in 10016818us (59100.6 ops/sec) [-7.0%]
Did 214000 Curve25519 arbitrary point multiplication operations in 10043918us (21306.4 ops/sec) [-6.9%]
Did 180000 ECDH P-256 operations in 10026019us (17953.3 ops/sec) [+0.4%]
Did 550000 ECDSA P-256 signing operations in 10004943us (54972.8 ops/sec) [+0.7%]
Did 198000 ECDSA P-256 verify operations in 10021714us (19757.1 ops/sec) [+0.4%]

x86_64, Clang 8.0.0 trunk 349417, OPENSSL_SMALL
Before:
Did 326000 Ed25519 key generation operations in 10003266us (32589.4 ops/sec)
Did 322000 Ed25519 signing operations in 10026783us (32114.0 ops/sec)
Did 181000 Ed25519 verify operations in 10015635us (18071.7 ops/sec)
Did 335000 Curve25519 base-point multiplication operations in 10000359us (33498.8 ops/sec)
Did 224000 Curve25519 arbitrary point multiplication operations in 10027245us (22339.1 ops/sec)
Did 68552 ECDH P-256 operations in 10018900us (6842.3 ops/sec)
Did 184000 ECDSA P-256 signing operations in 10014516us (18373.3 ops/sec)
Did 76020 ECDSA P-256 verify operations in 10016891us (7589.2 ops/sec)
After:
Did 310000 Ed25519 key generation operations in 10022086us (30931.7 ops/sec) [-5.1%]
Did 308000 Ed25519 signing operations in 10007543us (30776.8 ops/sec) [-4.2%]
Did 173000 Ed25519 verify operations in 10005829us (17289.9 ops/sec) [-4.3%]
Did 321000 Curve25519 base-point multiplication operations in 10027058us (32013.4 ops/sec) [-4.4%]
Did 212000 Curve25519 arbitrary point multiplication operations in 10015203us (21167.8 ops/sec) [-5.2%]
Did 64059 ECDH P-256 operations in 10042781us (6378.6 ops/sec) [-6.8%]
Did 170000 ECDSA P-256 signing operations in 10030896us (16947.6 ops/sec) [-7.8%]
Did 72176 ECDSA P-256 verify operations in 10075369us (7163.6 ops/sec) [-5.6%]

Bug: 254
Change-Id: Ib04c773f01b542bcb8611cceb582466bfa6f6d52
Reviewed-on: https://boringssl-review.googlesource.com/c/34306
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
2019-01-18 00:24:03 +00:00

2162 lines
55 KiB
C

// The MIT License (MIT)
//
// Copyright (c) 2015-2016 the fiat-crypto authors (see the AUTHORS file).
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
// public domain but parts have been replaced with code generated by Fiat
// (https://github.com/mit-plv/fiat-crypto), which is MIT licensed.
//
// The field functions are shared by Ed25519 and X25519 where possible.
#include <openssl/curve25519.h>
#include <assert.h>
#include <string.h>
#include <openssl/cpu.h>
#include <openssl/mem.h>
#include <openssl/rand.h>
#include <openssl/sha.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../crypto/internal.h"
// Various pre-computed constants.
#include "./curve25519_tables.h"
#if defined(BORINGSSL_CURVE25519_64BIT)
#include "./curve25519_64.c"
#else
#include "./curve25519_32.c"
#endif // BORINGSSL_CURVE25519_64BIT
// Low-level intrinsic operations
// load_3 reads the three bytes at |in| as a little-endian integer and returns
// it zero-extended to 64 bits.
static uint64_t load_3(const uint8_t *in) {
  const uint64_t lo = in[0];
  const uint64_t mid = in[1];
  const uint64_t hi = in[2];
  return lo | (mid << 8) | (hi << 16);
}
// load_4 reads the four bytes at |in| as a little-endian integer and returns
// it zero-extended to 64 bits (the top byte is never sign-extended).
static uint64_t load_4(const uint8_t *in) {
  uint64_t value = 0;
  for (unsigned byte = 0; byte < 4; byte++) {
    value |= ((uint64_t)in[byte]) << (8 * byte);
  }
  return value;
}
// Field operations.
#if defined(BORINGSSL_CURVE25519_64BIT)
// When BORINGSSL_CURVE25519_64BIT is defined, a limb is a uint64_t and
// FE_NUM_LIMBS is 5. A static assertion after this #if/#else section requires
// sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS.
typedef uint64_t fe_limb_t;
#define FE_NUM_LIMBS 5
// assert_fe asserts that |f| satisfies bounds:
//
// [[0x0 ~> 0x8cccccccccccc],
// [0x0 ~> 0x8cccccccccccc],
// [0x0 ~> 0x8cccccccccccc],
// [0x0 ~> 0x8cccccccccccc],
// [0x0 ~> 0x8cccccccccccc]]
//
// See comments in curve25519_64.c for which functions use these bounds for
// inputs or outputs.
//
// |f| is parenthesized in the expansion so that any pointer-valued expression
// (for example |p + 1|) may be passed, not only a plain identifier. |f| is
// evaluated once per limb, so it must not have side effects.
#define assert_fe(f)                                                    \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      assert((f)[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));           \
    }                                                                   \
  } while (0)
// assert_fe_loose asserts that |f| satisfies bounds:
//
// [[0x0 ~> 0x1a666666666664],
// [0x0 ~> 0x1a666666666664],
// [0x0 ~> 0x1a666666666664],
// [0x0 ~> 0x1a666666666664],
// [0x0 ~> 0x1a666666666664]]
//
// See comments in curve25519_64.c for which functions use these bounds for
// inputs or outputs.
//
// |f| is parenthesized in the expansion so that any pointer-valued expression
// (for example |p + 1|) may be passed, not only a plain identifier. |f| is
// evaluated once per limb, so it must not have side effects.
#define assert_fe_loose(f)                                              \
  do {                                                                  \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
      assert((f)[_assert_fe_i] <= UINT64_C(0x1a666666666664));          \
    }                                                                   \
  } while (0)
#else
// When BORINGSSL_CURVE25519_64BIT is not defined, a limb is a uint32_t and
// FE_NUM_LIMBS is 10. A static assertion after this #if/#else section requires
// sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS.
typedef uint32_t fe_limb_t;
#define FE_NUM_LIMBS 10
// assert_fe asserts that |f| satisfies bounds:
//
// [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
//
// See comments in curve25519_32.c for which functions use these bounds for
// inputs or outputs.
//
// Even-indexed limbs get the larger bound and odd-indexed limbs the smaller
// one. |f| is parenthesized in the expansion so that any pointer-valued
// expression (for example |p + 1|) may be passed, not only a plain identifier.
// |f| is evaluated once per limb, so it must not have side effects.
#define assert_fe(f)                                                     \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      assert((f)[_assert_fe_i] <=                                        \
             ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
    }                                                                    \
  } while (0)
// assert_fe_loose asserts that |f| satisfies bounds:
//
// [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
//
// See comments in curve25519_32.c for which functions use these bounds for
// inputs or outputs.
//
// Even-indexed limbs get the larger bound and odd-indexed limbs the smaller
// one. |f| is parenthesized in the expansion so that any pointer-valued
// expression (for example |p + 1|) may be passed, not only a plain identifier.
// |f| is evaluated once per limb, so it must not have side effects.
#define assert_fe_loose(f)                                               \
  do {                                                                   \
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
      assert((f)[_assert_fe_i] <=                                        \
             ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
    }                                                                    \
  } while (0)
#endif // BORINGSSL_CURVE25519_64BIT
// Compile-time check that |fe| is exactly FE_NUM_LIMBS limbs of type
// |fe_limb_t|, so loops over v[0..FE_NUM_LIMBS-1] in this file cover a whole
// element for whichever limb width was selected above.
OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
"fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
// fe_frombytes_strict sets |h| from the 32 bytes at |s| using
// |fiat_25519_from_bytes|. The caller must ensure the top bit of s[31] is
// clear; this is only checked with assert. The result is asserted to satisfy
// the |assert_fe| bounds.
static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
// |fiat_25519_from_bytes| requires the top-most bit be clear.
assert((s[31] & 0x80) == 0);
fiat_25519_from_bytes(h->v, s);
assert_fe(h->v);
}
// fe_frombytes sets |h| from the 32 bytes at |s|, ignoring the top bit of
// s[31]. |s| itself is not modified: the bit is cleared in a local copy, which
// is then handed to |fe_frombytes_strict|.
static void fe_frombytes(fe *h, const uint8_t s[32]) {
  uint8_t masked[32];
  OPENSSL_memcpy(masked, s, sizeof(masked));
  masked[31] &= 0x7f;
  fe_frombytes_strict(h, masked);
}
// fe_tobytes writes the 32-byte encoding of |f| to |s| using
// |fiat_25519_to_bytes|. |f| must satisfy the |assert_fe| bounds.
static void fe_tobytes(uint8_t s[32], const fe *f) {
assert_fe(f->v);
fiat_25519_to_bytes(s, f->v);
}
// h = 0
//
// Clears every limb of |h|.
static void fe_0(fe *h) {
  OPENSSL_memset(h, 0, sizeof(*h));
}
// h = 0, for a |fe_loose| destination. Clears every limb of |h|.
static void fe_loose_0(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(*h));
}
// h = 1
//
// Clears every limb of |h| and then sets the lowest limb to one.
static void fe_1(fe *h) {
  OPENSSL_memset(h, 0, sizeof(*h));
  h->v[0] = 1;
}
// h = 1, for a |fe_loose| destination. Clears every limb of |h| and then sets
// the lowest limb to one.
static void fe_loose_1(fe_loose *h) {
  OPENSSL_memset(h, 0, sizeof(*h));
  h->v[0] = 1;
}
// h = f + g
// Can overlap h with f or g.
//
// |f| and |g| must satisfy the |assert_fe| bounds. The sum is only asserted to
// satisfy the wider |assert_fe_loose| bounds, which is why |h| is a |fe_loose|.
static void fe_add(fe_loose *h, const fe *f, const fe *g) {
assert_fe(f->v);
assert_fe(g->v);
fiat_25519_add(h->v, f->v, g->v);
assert_fe_loose(h->v);
}
// h = f - g
// Can overlap h with f or g.
//
// |f| and |g| must satisfy the |assert_fe| bounds. The difference is only
// asserted to satisfy the wider |assert_fe_loose| bounds, which is why |h| is a
// |fe_loose|.
static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
assert_fe(f->v);
assert_fe(g->v);
fiat_25519_sub(h->v, f->v, g->v);
assert_fe_loose(h->v);
}
// fe_carry converts the loose element |f| into the tight element |h| using
// |fiat_25519_carry|. |f| must satisfy the |assert_fe_loose| bounds; |h| is
// asserted to satisfy the |assert_fe| bounds.
static void fe_carry(fe *h, const fe_loose* f) {
assert_fe_loose(f->v);
fiat_25519_carry(h->v, f->v);
assert_fe(h->v);
}
// fe_mul_impl computes out = in1 * in2 with |fiat_25519_carry_mul|, working on
// raw limb arrays. Both inputs need only satisfy the |assert_fe_loose| bounds;
// the output is asserted to satisfy the |assert_fe| bounds. The typed
// |fe_mul_*| wrappers all forward to this function.
static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
const fe_limb_t in1[FE_NUM_LIMBS],
const fe_limb_t in2[FE_NUM_LIMBS]) {
assert_fe_loose(in1);
assert_fe_loose(in2);
fiat_25519_carry_mul(out, in1, in2);
assert_fe(out);
}
// fe_mul_ltt computes h = f * g via |fe_mul_impl|. The suffix gives the types
// of |h|, |f| and |g| in that order: 'l' is |fe_loose| and 't' is |fe|.
static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}
// fe_mul_llt computes h = f * g via |fe_mul_impl|. The suffix gives the types
// of |h|, |f| and |g| in that order: 'l' is |fe_loose| and 't' is |fe|.
static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}
// fe_mul_ttt computes h = f * g via |fe_mul_impl|. The suffix gives the types
// of |h|, |f| and |g| in that order: 'l' is |fe_loose| and 't' is |fe|.
static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}
// fe_mul_tlt computes h = f * g via |fe_mul_impl|. The suffix gives the types
// of |h|, |f| and |g| in that order: 'l' is |fe_loose| and 't' is |fe|.
static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
fe_mul_impl(h->v, f->v, g->v);
}
// fe_mul_ttl computes h = f * g via |fe_mul_impl|. The suffix gives the types
// of |h|, |f| and |g| in that order: 'l' is |fe_loose| and 't' is |fe|.
static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
fe_mul_impl(h->v, f->v, g->v);
}
// fe_mul_tll computes h = f * g via |fe_mul_impl|. The suffix gives the types
// of |h|, |f| and |g| in that order: 'l' is |fe_loose| and 't' is |fe|.
static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
fe_mul_impl(h->v, f->v, g->v);
}
// fe_sq_tl computes h = f^2 with |fiat_25519_carry_square|, taking a loose
// input and producing a tight output. |f| must satisfy the |assert_fe_loose|
// bounds; |h| is asserted to satisfy the |assert_fe| bounds.
static void fe_sq_tl(fe *h, const fe_loose *f) {
assert_fe_loose(f->v);
fiat_25519_carry_square(h->v, f->v);
assert_fe(h->v);
}
// fe_sq_tt computes h = f^2 with |fiat_25519_carry_square|, taking a tight
// input and producing a tight output. |h| is asserted to satisfy the
// |assert_fe| bounds.
//
// NOTE(review): although |f| is a tight |fe|, only the wider
// |assert_fe_loose| bounds are asserted on it here.
static void fe_sq_tt(fe *h, const fe *f) {
assert_fe_loose(f->v);
fiat_25519_carry_square(h->v, f->v);
assert_fe(h->v);
}
// Conditionally swap |f| and |g|:
//   b == 1: (f,g) becomes (g,f);
//   b == 0: (f,g) is left unchanged.
//
// Preconditions: b in {0,1}.
//
// The swap is done with a mask instead of a branch: 0 - b is all-ones when
// b == 1 and zero when b == 0, and every limb is processed in both cases.
static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
  const fe_limb_t mask = 0 - b;
  for (unsigned limb = 0; limb < FE_NUM_LIMBS; limb++) {
    const fe_limb_t delta = (f->v[limb] ^ g->v[limb]) & mask;
    f->v[limb] ^= delta;
    g->v[limb] ^= delta;
  }
}
// fe_mul121666 sets |h| to the result of |fiat_25519_carry_scmul_121666| on
// |f| (by its name, multiplication by the constant 121666; the routine itself
// is defined in the included fiat file). |f| must satisfy the
// |assert_fe_loose| bounds; |h| is asserted to satisfy the |assert_fe| bounds.
static void fe_mul121666(fe *h, const fe_loose *f) {
assert_fe_loose(f->v);
fiat_25519_carry_scmul_121666(h->v, f->v);
assert_fe(h->v);
}
// h = -f
//
// Computed with |fiat_25519_opp|. |f| must satisfy the |assert_fe| bounds; the
// result is only asserted to satisfy the |assert_fe_loose| bounds, so callers
// wanting a tight value follow this with |fe_carry|.
static void fe_neg(fe_loose *h, const fe *f) {
assert_fe(f->v);
fiat_25519_opp(h->v, f->v);
assert_fe_loose(h->v);
}
// Conditionally copy |g| into |f|:
//   b == 1: (f,g) becomes (g,g);
//   b == 0: (f,g) is left unchanged.
//
// Preconditions: b in {0,1}.
//
// The copy is done with a mask instead of a branch: 0 - b is all-ones when
// b == 1 and zero when b == 0, and every limb is processed in both cases.
static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
  // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
  // calling convention the rest of this code wants, so implement it by hand.
  //
  // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
  // different one.
  (void)fiat_25519_selectznz;

  const fe_limb_t mask = 0 - b;
  for (unsigned limb = 0; limb < FE_NUM_LIMBS; limb++) {
    const fe_limb_t delta = (f->v[limb] ^ g->v[limb]) & mask;
    f->v[limb] ^= delta;
  }
}
// h = f
//
// |h| and |f| may be the same object or overlap, since the copy uses
// |OPENSSL_memmove|.
static void fe_copy(fe *h, const fe *f) {
  OPENSSL_memmove(h, f, sizeof(*h));
}
// h = f, copying a tight |fe| into a |fe_loose|. The static assertion checks
// that the two types have the same size, so the copy is a plain byte move.
static void fe_copy_lt(fe_loose *h, const fe *f) {
  OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
                        "fe and fe_loose mismatch");
  OPENSSL_memmove(h, f, sizeof(*f));
}
#if !defined(OPENSSL_SMALL)
// h = f, for two |fe_loose| values. Only compiled when OPENSSL_SMALL is not
// defined.
static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
  OPENSSL_memmove(h, f, sizeof(*h));
}
#endif // !defined(OPENSSL_SMALL)
// fe_loose_invert sets |out| to z^(2^255 - 21) using a fixed chain of
// squarings and multiplications. Since 2^255 - 21 = p - 2 for p = 2^255 - 19,
// this is the inverse of |z| modulo p by Fermat's little theorem (and 0 when
// z is 0).
//
// Each "fe_sq_tt; for (i = 1; i < N; ++i) fe_sq_tt" group performs N squarings
// in total. The comments below give the power of z held after each step.
static void fe_loose_invert(fe *out, const fe_loose *z) {
fe t0;
fe t1;
fe t2;
fe t3;
int i;
// t0 = z^2
fe_sq_tl(&t0, z);
// t1 = z^8 (two squarings of t0)
fe_sq_tt(&t1, &t0);
for (i = 1; i < 2; ++i) {
fe_sq_tt(&t1, &t1);
}
// t1 = z^9
fe_mul_tlt(&t1, z, &t1);
// t0 = z^11
fe_mul_ttt(&t0, &t0, &t1);
// t2 = z^22
fe_sq_tt(&t2, &t0);
// t1 = z^31 = z^(2^5 - 1)
fe_mul_ttt(&t1, &t1, &t2);
// t2 = z^(2^10 - 2^5)
fe_sq_tt(&t2, &t1);
for (i = 1; i < 5; ++i) {
fe_sq_tt(&t2, &t2);
}
// t1 = z^(2^10 - 1)
fe_mul_ttt(&t1, &t2, &t1);
// t2 = z^(2^20 - 2^10)
fe_sq_tt(&t2, &t1);
for (i = 1; i < 10; ++i) {
fe_sq_tt(&t2, &t2);
}
// t2 = z^(2^20 - 1)
fe_mul_ttt(&t2, &t2, &t1);
// t3 = z^(2^40 - 2^20)
fe_sq_tt(&t3, &t2);
for (i = 1; i < 20; ++i) {
fe_sq_tt(&t3, &t3);
}
// t2 = z^(2^40 - 1)
fe_mul_ttt(&t2, &t3, &t2);
// t2 = z^(2^50 - 2^10)
fe_sq_tt(&t2, &t2);
for (i = 1; i < 10; ++i) {
fe_sq_tt(&t2, &t2);
}
// t1 = z^(2^50 - 1)
fe_mul_ttt(&t1, &t2, &t1);
// t2 = z^(2^100 - 2^50)
fe_sq_tt(&t2, &t1);
for (i = 1; i < 50; ++i) {
fe_sq_tt(&t2, &t2);
}
// t2 = z^(2^100 - 1)
fe_mul_ttt(&t2, &t2, &t1);
// t3 = z^(2^200 - 2^100)
fe_sq_tt(&t3, &t2);
for (i = 1; i < 100; ++i) {
fe_sq_tt(&t3, &t3);
}
// t2 = z^(2^200 - 1)
fe_mul_ttt(&t2, &t3, &t2);
// t2 = z^(2^250 - 2^50)
fe_sq_tt(&t2, &t2);
for (i = 1; i < 50; ++i) {
fe_sq_tt(&t2, &t2);
}
// t1 = z^(2^250 - 1)
fe_mul_ttt(&t1, &t2, &t1);
// t1 = z^(2^255 - 2^5)
fe_sq_tt(&t1, &t1);
for (i = 1; i < 5; ++i) {
fe_sq_tt(&t1, &t1);
}
// out = z^(2^255 - 32 + 11) = z^(2^255 - 21)
fe_mul_ttt(out, &t1, &t0);
}
// fe_invert is |fe_loose_invert| for a tight input: it copies |z| into a
// |fe_loose| temporary and inverts that, writing the result to |out|.
static void fe_invert(fe *out, const fe *z) {
fe_loose l;
fe_copy_lt(&l, z);
fe_loose_invert(out, &l);
}
// return 0 if f == 0
// return 1 if f != 0
//
// |f| is carried to a tight element and serialized with |fe_tobytes|; the 32
// output bytes are then compared against all-zero bytes with |CRYPTO_memcmp|.
static int fe_isnonzero(const fe_loose *f) {
  static const uint8_t kZero[32] = {0};
  fe reduced;
  uint8_t encoded[32];

  fe_carry(&reduced, f);
  fe_tobytes(encoded, &reduced);
  return CRYPTO_memcmp(encoded, kZero, sizeof(kZero)) != 0;
}
// return 1 if f is in {1,3,5,...,q-2}
// return 0 if f is in {0,2,4,...,q-1}
//
// The answer is the least significant bit of the first byte that |fe_tobytes|
// writes for |f|.
static int fe_isnegative(const fe *f) {
uint8_t s[32];
fe_tobytes(s, f);
return s[0] & 1;
}
// fe_sq2_tt computes h = 2 * f^2. The doubling goes through a |fe_loose|
// temporary because |fe_add| produces a loose result, which is then carried
// back into the tight |h|.
static void fe_sq2_tt(fe *h, const fe *f) {
// h = f^2
fe_sq_tt(h, f);
// h = h + h
fe_loose tmp;
fe_add(&tmp, h, h);
fe_carry(h, &tmp);
}
// fe_pow22523 sets |out| to z^(2^252 - 3) using a fixed chain of squarings and
// multiplications. For q = 2^255 - 19 this exponent equals (q - 5) / 8, which
// is how |x25519_ge_frombytes_vartime| uses it.
//
// Each "fe_sq_tt; for (i = 1; i < N; ++i) fe_sq_tt" group performs N squarings
// in total. The comments below give the power of z held after each step.
static void fe_pow22523(fe *out, const fe *z) {
fe t0;
fe t1;
fe t2;
int i;
// t0 = z^2
fe_sq_tt(&t0, z);
// t1 = z^8 (two squarings of t0)
fe_sq_tt(&t1, &t0);
for (i = 1; i < 2; ++i) {
fe_sq_tt(&t1, &t1);
}
// t1 = z^9
fe_mul_ttt(&t1, z, &t1);
// t0 = z^11
fe_mul_ttt(&t0, &t0, &t1);
// t0 = z^22
fe_sq_tt(&t0, &t0);
// t0 = z^31 = z^(2^5 - 1)
fe_mul_ttt(&t0, &t1, &t0);
// t1 = z^(2^10 - 2^5)
fe_sq_tt(&t1, &t0);
for (i = 1; i < 5; ++i) {
fe_sq_tt(&t1, &t1);
}
// t0 = z^(2^10 - 1)
fe_mul_ttt(&t0, &t1, &t0);
// t1 = z^(2^20 - 2^10)
fe_sq_tt(&t1, &t0);
for (i = 1; i < 10; ++i) {
fe_sq_tt(&t1, &t1);
}
// t1 = z^(2^20 - 1)
fe_mul_ttt(&t1, &t1, &t0);
// t2 = z^(2^40 - 2^20)
fe_sq_tt(&t2, &t1);
for (i = 1; i < 20; ++i) {
fe_sq_tt(&t2, &t2);
}
// t1 = z^(2^40 - 1)
fe_mul_ttt(&t1, &t2, &t1);
// t1 = z^(2^50 - 2^10)
fe_sq_tt(&t1, &t1);
for (i = 1; i < 10; ++i) {
fe_sq_tt(&t1, &t1);
}
// t0 = z^(2^50 - 1)
fe_mul_ttt(&t0, &t1, &t0);
// t1 = z^(2^100 - 2^50)
fe_sq_tt(&t1, &t0);
for (i = 1; i < 50; ++i) {
fe_sq_tt(&t1, &t1);
}
// t1 = z^(2^100 - 1)
fe_mul_ttt(&t1, &t1, &t0);
// t2 = z^(2^200 - 2^100)
fe_sq_tt(&t2, &t1);
for (i = 1; i < 100; ++i) {
fe_sq_tt(&t2, &t2);
}
// t1 = z^(2^200 - 1)
fe_mul_ttt(&t1, &t2, &t1);
// t1 = z^(2^250 - 2^50)
fe_sq_tt(&t1, &t1);
for (i = 1; i < 50; ++i) {
fe_sq_tt(&t1, &t1);
}
// t0 = z^(2^250 - 1)
fe_mul_ttt(&t0, &t1, &t0);
// t0 = z^(2^252 - 4)
fe_sq_tt(&t0, &t0);
for (i = 1; i < 2; ++i) {
fe_sq_tt(&t0, &t0);
}
// out = z^(2^252 - 3)
fe_mul_ttt(out, &t0, z);
}
// Group operations.
// x25519_ge_tobytes writes a 32-byte encoding of the |ge_p2| point |h| to |s|.
// It computes x = X/Z and y = Y/Z, serializes y with |fe_tobytes|, and XORs
// |fe_isnegative(x)| into the top bit of s[31].
void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
fe recip;
fe x;
fe y;
fe_invert(&recip, &h->Z);
fe_mul_ttt(&x, &h->X, &recip);
fe_mul_ttt(&y, &h->Y, &recip);
fe_tobytes(s, &y);
s[31] ^= fe_isnegative(&x) << 7;
}
// ge_p3_tobytes writes a 32-byte encoding of the |ge_p3| point |h| to |s|. It
// performs the same steps as |x25519_ge_tobytes|: x = X/Z and y = Y/Z are
// computed, y is serialized with |fe_tobytes|, and |fe_isnegative(x)| is XORed
// into the top bit of s[31]. The T coordinate is not used.
static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
fe recip;
fe x;
fe y;
fe_invert(&recip, &h->Z);
fe_mul_ttt(&x, &h->X, &recip);
fe_mul_ttt(&y, &h->Y, &recip);
fe_tobytes(s, &y);
s[31] ^= fe_isnegative(&x) << 7;
}
// x25519_ge_frombytes_vartime decodes the 32-byte encoding |s| into |h|. It
// returns 1 on success and 0 if neither candidate x satisfies the check below,
// in which case |h| has been partially written.
//
// Y is read from |s| (|fe_frombytes| ignores the top bit of s[31]) and Z is
// set to 1. With u = y^2 - 1 and v = d*y^2 + 1, a candidate
// x = u*v^3 * (u*v^7)^((q-5)/8) is computed and checked against v*x^2 == u or
// v*x^2 == -u; in the latter case x is multiplied by |sqrtm1|. x is then
// negated if its sign, per |fe_isnegative|, differs from the top bit of s[31].
// Finally T = X*Y.
//
// As the name says, this function branches on values derived from |s|.
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
fe u;
fe_loose v;
fe v3;
fe vxx;
fe_loose check;
fe_frombytes(&h->Y, s);
fe_1(&h->Z);
// v3 temporarily holds y^2, and vxx holds d*y^2.
fe_sq_tt(&v3, &h->Y);
fe_mul_ttt(&vxx, &v3, &d);
// The difference is formed in the loose |v| and carried into |u|.
fe_sub(&v, &v3, &h->Z); // u = y^2-1
fe_carry(&u, &v);
fe_add(&v, &vxx, &h->Z); // v = dy^2+1
fe_sq_tl(&v3, &v);
fe_mul_ttl(&v3, &v3, &v); // v3 = v^3
fe_sq_tt(&h->X, &v3);
fe_mul_ttl(&h->X, &h->X, &v);
fe_mul_ttt(&h->X, &h->X, &u); // x = uv^7
fe_pow22523(&h->X, &h->X); // x = (uv^7)^((q-5)/8)
fe_mul_ttt(&h->X, &h->X, &v3);
fe_mul_ttt(&h->X, &h->X, &u); // x = uv^3(uv^7)^((q-5)/8)
// vxx = v*x^2, compared first against u and then against -u.
fe_sq_tt(&vxx, &h->X);
fe_mul_ttl(&vxx, &vxx, &v);
fe_sub(&check, &vxx, &u);
if (fe_isnonzero(&check)) {
fe_add(&check, &vxx, &u);
if (fe_isnonzero(&check)) {
return 0;
}
fe_mul_ttt(&h->X, &h->X, &sqrtm1);
}
// Select the x whose sign matches the top bit of the encoding.
if (fe_isnegative(&h->X) != (s[31] >> 7)) {
fe_loose t;
fe_neg(&t, &h->X);
fe_carry(&h->X, &t);
}
fe_mul_ttt(&h->T, &h->X, &h->Y);
return 1;
}
// ge_p2_0 sets |h| to (X, Y, Z) = (0, 1, 1). This is the value accumulators
// are initialised to before additions and doublings (see
// |x25519_ge_scalarmult|).
static void ge_p2_0(ge_p2 *h) {
fe_0(&h->X);
fe_1(&h->Y);
fe_1(&h->Z);
}
// ge_p3_0 sets |h| to (X, Y, Z, T) = (0, 1, 1, 0). This is the value
// accumulators are initialised to before additions (see
// |x25519_ge_scalarmult_small_precomp|).
static void ge_p3_0(ge_p3 *h) {
fe_0(&h->X);
fe_1(&h->Y);
fe_1(&h->Z);
fe_0(&h->T);
}
// ge_cached_0 sets |h| to (YplusX, YminusX, Z, T2d) = (1, 1, 1, 0), which is
// what |x25519_ge_p3_to_cached| would produce for X = 0, Y = 1, Z = 1, T = 0.
static void ge_cached_0(ge_cached *h) {
fe_loose_1(&h->YplusX);
fe_loose_1(&h->YminusX);
fe_loose_1(&h->Z);
fe_loose_0(&h->T2d);
}
// ge_precomp_0 sets |h| to (yplusx, yminusx, xy2d) = (1, 1, 0). The table
// selection code uses this as the starting value, which is kept when no table
// entry is selected.
static void ge_precomp_0(ge_precomp *h) {
fe_loose_1(&h->yplusx);
fe_loose_1(&h->yminusx);
fe_loose_0(&h->xy2d);
}
// r = p
//
// Converts a |ge_p3| to a |ge_p2| by copying X, Y and Z; T is dropped.
static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
fe_copy(&r->X, &p->X);
fe_copy(&r->Y, &p->Y);
fe_copy(&r->Z, &p->Z);
}
// r = p
//
// Converts a |ge_p3| to the |ge_cached| form consumed by |x25519_ge_add| and
// |x25519_ge_sub|: YplusX = Y + X, YminusX = Y - X, Z = Z and T2d = T * d2.
void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
fe_add(&r->YplusX, &p->Y, &p->X);
fe_sub(&r->YminusX, &p->Y, &p->X);
fe_copy_lt(&r->Z, &p->Z);
fe_mul_ltt(&r->T2d, &p->T, &d2);
}
// r = p
//
// Converts a |ge_p1p1| to a |ge_p2| with three multiplications:
// X = p.X * p.T, Y = p.Y * p.Z and Z = p.Z * p.T.
void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
fe_mul_tll(&r->X, &p->X, &p->T);
fe_mul_tll(&r->Y, &p->Y, &p->Z);
fe_mul_tll(&r->Z, &p->Z, &p->T);
}
// r = p
//
// Converts a |ge_p1p1| to a |ge_p3| with four multiplications:
// X = p.X * p.T, Y = p.Y * p.Z, Z = p.Z * p.T and T = p.X * p.Y.
void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
fe_mul_tll(&r->X, &p->X, &p->T);
fe_mul_tll(&r->Y, &p->Y, &p->Z);
fe_mul_tll(&r->Z, &p->Z, &p->T);
fe_mul_tll(&r->T, &p->X, &p->Y);
}
// r = p
//
// Converts a |ge_p1p1| to a |ge_cached| by going through a |ge_p3| temporary.
static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
ge_p3 t;
x25519_ge_p1p1_to_p3(&t, p);
x25519_ge_p3_to_cached(r, &t);
}
// r = 2 * p
//
// With (X, Y, Z) = p, the outputs are:
//   r->X = (X + Y)^2 - (Y^2 + X^2)
//   r->Y = Y^2 + X^2
//   r->Z = Y^2 - X^2
//   r->T = 2*Z^2 - (Y^2 - X^2)
//
// |r->Y| is first used as scratch for X + Y, and |trZ| is reused to hold the
// carried (tight) copies of |r->Y| and then |r->Z|, so the statement order
// matters.
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
fe trX, trZ, trT;
fe t0;
fe_sq_tt(&trX, &p->X);
fe_sq_tt(&trZ, &p->Y);
fe_sq2_tt(&trT, &p->Z);
fe_add(&r->Y, &p->X, &p->Y);
fe_sq_tl(&t0, &r->Y);
fe_add(&r->Y, &trZ, &trX);
fe_sub(&r->Z, &trZ, &trX);
fe_carry(&trZ, &r->Y);
fe_sub(&r->X, &t0, &trZ);
fe_carry(&trZ, &r->Z);
fe_sub(&r->T, &trT, &trZ);
}
// r = 2 * p
//
// Doubles a |ge_p3| by converting it to a |ge_p2| (dropping T) and calling
// |ge_p2_dbl|.
static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
ge_p2 q;
ge_p3_to_p2(&q, p);
ge_p2_dbl(r, &q);
}
// r = p + q
//
// Adds the |ge_precomp| point |q| to the |ge_p3| point |p|. With
//   A = (p.Y + p.X) * q.yplusx,  B = (p.Y - p.X) * q.yminusx,
//   C = q.xy2d * p.T,            D = 2 * p.Z,
// the outputs are r->X = A - B, r->Y = A + B, r->Z = D + C, r->T = D - C.
//
// Fields of |r| are used as scratch for the loose intermediate sums before
// being overwritten with the results, so the statement order matters.
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
fe trY, trZ, trT;
fe_add(&r->X, &p->Y, &p->X);
fe_sub(&r->Y, &p->Y, &p->X);
fe_mul_tll(&trZ, &r->X, &q->yplusx);
fe_mul_tll(&trY, &r->Y, &q->yminusx);
fe_mul_tlt(&trT, &q->xy2d, &p->T);
fe_add(&r->T, &p->Z, &p->Z);
fe_sub(&r->X, &trZ, &trY);
fe_add(&r->Y, &trZ, &trY);
fe_carry(&trZ, &r->T);
fe_add(&r->Z, &trZ, &trT);
fe_sub(&r->T, &trZ, &trT);
}
// r = p - q
//
// Subtracts the |ge_precomp| point |q| from the |ge_p3| point |p|. Compared
// with |ge_madd|, the roles of q.yplusx and q.yminusx are exchanged and the
// sign of the xy2d term is flipped. With
//   A = (p.Y + p.X) * q.yminusx,  B = (p.Y - p.X) * q.yplusx,
//   C = q.xy2d * p.T,             D = 2 * p.Z,
// the outputs are r->X = A - B, r->Y = A + B, r->Z = D - C, r->T = D + C.
//
// Fields of |r| are used as scratch for the loose intermediate sums before
// being overwritten with the results, so the statement order matters.
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
fe trY, trZ, trT;
fe_add(&r->X, &p->Y, &p->X);
fe_sub(&r->Y, &p->Y, &p->X);
fe_mul_tll(&trZ, &r->X, &q->yminusx);
fe_mul_tll(&trY, &r->Y, &q->yplusx);
fe_mul_tlt(&trT, &q->xy2d, &p->T);
fe_add(&r->T, &p->Z, &p->Z);
fe_sub(&r->X, &trZ, &trY);
fe_add(&r->Y, &trZ, &trY);
fe_carry(&trZ, &r->T);
fe_sub(&r->Z, &trZ, &trT);
fe_add(&r->T, &trZ, &trT);
}
// r = p + q
//
// Adds the |ge_cached| point |q| to the |ge_p3| point |p|. With
//   A = (p.Y + p.X) * q.YplusX,  B = (p.Y - p.X) * q.YminusX,
//   C = q.T2d * p.T,             D = 2 * p.Z * q.Z,
// the outputs are r->X = A - B, r->Y = A + B, r->Z = D + C, r->T = D - C.
//
// Fields of |r| are used as scratch for the loose intermediate sums before
// being overwritten with the results, so the statement order matters.
void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
fe trX, trY, trZ, trT;
fe_add(&r->X, &p->Y, &p->X);
fe_sub(&r->Y, &p->Y, &p->X);
fe_mul_tll(&trZ, &r->X, &q->YplusX);
fe_mul_tll(&trY, &r->Y, &q->YminusX);
fe_mul_tlt(&trT, &q->T2d, &p->T);
fe_mul_ttl(&trX, &p->Z, &q->Z);
fe_add(&r->T, &trX, &trX);
fe_sub(&r->X, &trZ, &trY);
fe_add(&r->Y, &trZ, &trY);
fe_carry(&trZ, &r->T);
fe_add(&r->Z, &trZ, &trT);
fe_sub(&r->T, &trZ, &trT);
}
// r = p - q
//
// Subtracts the |ge_cached| point |q| from the |ge_p3| point |p|. Compared
// with |x25519_ge_add|, the roles of q.YplusX and q.YminusX are exchanged and
// the sign of the T2d term is flipped. With
//   A = (p.Y + p.X) * q.YminusX,  B = (p.Y - p.X) * q.YplusX,
//   C = q.T2d * p.T,              D = 2 * p.Z * q.Z,
// the outputs are r->X = A - B, r->Y = A + B, r->Z = D - C, r->T = D + C.
//
// Fields of |r| are used as scratch for the loose intermediate sums before
// being overwritten with the results, so the statement order matters.
void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
fe trX, trY, trZ, trT;
fe_add(&r->X, &p->Y, &p->X);
fe_sub(&r->Y, &p->Y, &p->X);
fe_mul_tll(&trZ, &r->X, &q->YminusX);
fe_mul_tll(&trY, &r->Y, &q->YplusX);
fe_mul_tlt(&trT, &q->T2d, &p->T);
fe_mul_ttl(&trX, &p->Z, &q->Z);
fe_add(&r->T, &trX, &trX);
fe_sub(&r->X, &trZ, &trY);
fe_add(&r->Y, &trZ, &trY);
fe_carry(&trZ, &r->T);
fe_sub(&r->Z, &trZ, &trT);
fe_add(&r->T, &trZ, &trT);
}
// equal returns 1 if |b| and |c| hold the same 8-bit value and 0 otherwise,
// without branching on either argument.
static uint8_t equal(signed char b, signed char c) {
  // diff is 0 when the bytes match and in 1..255 otherwise.
  const uint8_t diff = (uint8_t)((uint8_t)b ^ (uint8_t)c);
  // Subtracting one in 32 bits wraps to 0xffffffff only for diff == 0; every
  // other value stays within 0..254, so bit 31 is set exactly on a match.
  const uint32_t wrapped = (uint32_t)diff - 1;
  return (uint8_t)(wrapped >> 31);
}
// cmov conditionally copies the |ge_precomp| value |u| into |t|: all three
// fields are copied when b == 1 and |t| is left unchanged when b == 0. Each
// field goes through |fe_cmov|, which requires b in {0,1}.
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
fe_cmov(&t->yplusx, &u->yplusx, b);
fe_cmov(&t->yminusx, &u->yminusx, b);
fe_cmov(&t->xy2d, &u->xy2d, b);
}
// x25519_ge_scalarmult_small_precomp computes |h| from the 32-byte scalar |a|
// and a table of 15 precomputed points. Each table entry is 64 bytes: a
// 32-byte x value followed by a 32-byte y value, both with the top bit clear.
//
// For each bit position i from 63 down to 0, a 4-bit index is assembled from
// bits i, i+64, i+128 and i+192 of |a|. The accumulator is doubled and then
// the table entry numbered index-1 is added (or the |ge_precomp_0| value when
// the index is 0). Every table entry is passed through |cmov| on each
// iteration rather than being indexed directly.
void x25519_ge_scalarmult_small_precomp(
ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
// precomp_table is first expanded into matching |ge_precomp|
// elements.
ge_precomp multiples[15];
unsigned i;
for (i = 0; i < 15; i++) {
// The precomputed table is assumed to already clear the top bit, so
// |fe_frombytes_strict| may be used directly.
const uint8_t *bytes = &precomp_table[i*(2 * 32)];
fe x, y;
fe_frombytes_strict(&x, bytes);
fe_frombytes_strict(&y, bytes + 32);
ge_precomp *out = &multiples[i];
// yplusx = y + x, yminusx = y - x, xy2d = x * y * d2.
fe_add(&out->yplusx, &y, &x);
fe_sub(&out->yminusx, &y, &x);
fe_mul_ltt(&out->xy2d, &x, &y);
fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
}
// See the comment above |k25519SmallPrecomp| about the structure of the
// precomputed elements. This loop does 64 additions and 64 doublings to
// calculate the result.
ge_p3_0(h);
// |i| is unsigned, so decrementing past zero wraps to a large value and the
// |i < 64| test ends the loop after the i == 0 iteration.
for (i = 63; i < 64; i--) {
unsigned j;
signed char index = 0;
// Bit j of |index| is bit (64 * j + i) of the scalar.
for (j = 0; j < 4; j++) {
const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
index |= (bit << j);
}
ge_precomp e;
ge_precomp_0(&e);
for (j = 1; j < 16; j++) {
cmov(&e, &multiples[j-1], equal(index, j));
}
ge_cached cached;
ge_p1p1 r;
// Double |h| by adding its own cached form to it, then add |e|.
x25519_ge_p3_to_cached(&cached, h);
x25519_ge_add(&r, h, &cached);
x25519_ge_p1p1_to_p3(h, &r);
ge_madd(&r, h, &e);
x25519_ge_p1p1_to_p3(h, &r);
}
}
#if defined(OPENSSL_SMALL)
// With OPENSSL_SMALL defined, x25519_ge_scalarmult_base is implemented by
// |x25519_ge_scalarmult_small_precomp| using the |k25519SmallPrecomp| table.
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
}
#else
// negative returns 1 if |b| is less than zero and 0 otherwise, without
// branching: converting |b| to 32 bits sets the top bit exactly for negative
// values, and that bit is shifted down into the result.
static uint8_t negative(signed char b) {
  const uint32_t widened = (uint32_t)b;
  return (uint8_t)(widened >> 31);
}
// table_select sets |t| according to the signed digit |b|:
//   b == 0: |t| is the |ge_precomp_0| value;
//   b != 0: |t| is k25519Precomp[pos][|b| - 1], negated when b < 0.
// Only |b| with absolute value at most 8 selects a table entry.
//
// All eight entries of row |pos| are passed through |cmov| regardless of |b|,
// and the negated form is always computed and then conditionally moved in, so
// the table is never indexed by |b| directly.
static void table_select(ge_precomp *t, int pos, signed char b) {
ge_precomp minust;
uint8_t bnegative = negative(b);
// babs = |b|: when b is negative, 2*b is subtracted from b (modulo 256).
uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
ge_precomp_0(t);
cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
// The negated entry swaps yplusx with yminusx and negates xy2d.
fe_copy_ll(&minust.yplusx, &t->yminusx);
fe_copy_ll(&minust.yminusx, &t->yplusx);
// NOTE: the input table is canonical, but types don't encode it
fe tmp;
fe_carry(&tmp, &t->xy2d);
fe_neg(&minust.xy2d, &tmp);
cmov(t, &minust, bnegative);
}
// h = a * B
// where a = a[0]+256*a[1]+...+256^31 a[31]
// B is the Ed25519 base point (x,4/5) with x positive.
//
// Preconditions:
// a[31] <= 127
//
// The scalar is split into 64 four-bit digits, which are recoded into signed
// digits. The odd-numbered digits are accumulated first using
// |table_select| and |ge_madd|, the accumulator is doubled four times, and
// then the even-numbered digits are accumulated the same way.
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
signed char e[64];
signed char carry;
ge_p1p1 r;
ge_p2 s;
ge_precomp t;
int i;
// e[2i] is the low nibble of a[i] and e[2i+1] is the high nibble.
for (i = 0; i < 32; ++i) {
e[2 * i + 0] = (a[i] >> 0) & 15;
e[2 * i + 1] = (a[i] >> 4) & 15;
}
// each e[i] is between 0 and 15
// e[63] is between 0 and 7
// Recode to signed digits: a digit of 8 or more (after the incoming carry)
// has 16 subtracted and carries one into the next digit.
carry = 0;
for (i = 0; i < 63; ++i) {
e[i] += carry;
carry = e[i] + 8;
carry >>= 4;
e[i] -= carry << 4;
}
e[63] += carry;
// each e[i] is between -8 and 8
ge_p3_0(h);
// Odd-numbered digits; digit i uses table row i / 2.
for (i = 1; i < 64; i += 2) {
table_select(&t, i / 2, e[i]);
ge_madd(&r, h, &t);
x25519_ge_p1p1_to_p3(h, &r);
}
// Four doublings, i.e. multiply the accumulator by 16.
ge_p3_dbl(&r, h);
x25519_ge_p1p1_to_p2(&s, &r);
ge_p2_dbl(&r, &s);
x25519_ge_p1p1_to_p2(&s, &r);
ge_p2_dbl(&r, &s);
x25519_ge_p1p1_to_p2(&s, &r);
ge_p2_dbl(&r, &s);
x25519_ge_p1p1_to_p3(h, &r);
// Even-numbered digits; digit i uses table row i / 2.
for (i = 0; i < 64; i += 2) {
table_select(&t, i / 2, e[i]);
ge_madd(&r, h, &t);
x25519_ge_p1p1_to_p3(h, &r);
}
}
#endif
// cmov_cached conditionally copies the |ge_cached| value |u| into |t|: all
// four fields are copied when b == 1 and |t| is left unchanged when b == 0.
// Each field goes through |fe_cmov|, which requires b in {0,1}.
//
// |u| is only read, so it is taken by pointer-to-const, matching |cmov|.
static void cmov_cached(ge_cached *t, const ge_cached *u, uint8_t b) {
  fe_cmov(&t->YplusX, &u->YplusX, b);
  fe_cmov(&t->YminusX, &u->YminusX, b);
  fe_cmov(&t->Z, &u->Z, b);
  fe_cmov(&t->T2d, &u->T2d, b);
}
// r = scalar * A.
// where scalar = scalar[0]+256*scalar[1]+...+256^31 scalar[31].
//
// A table Ai[j] = j * A for j = 0..15 is built first. The scalar is then
// consumed four bits at a time, starting with the high nibble of scalar[31]:
// the accumulator is doubled four times and the table entry for the nibble is
// added. All 16 table entries are passed through |cmov_cached| for every
// nibble rather than indexing the table with the nibble.
void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
// Ai_p2[k] holds k * A in |ge_p2| form for k = 1..7; entry 0 is unused. These
// are the inputs to the doublings that produce the even multiples.
ge_p2 Ai_p2[8];
ge_cached Ai[16];
ge_p1p1 t;
ge_cached_0(&Ai[0]);
x25519_ge_p3_to_cached(&Ai[1], A);
ge_p3_to_p2(&Ai_p2[1], A);
unsigned i;
for (i = 2; i < 16; i += 2) {
// Ai[i] = 2 * Ai_p2[i / 2].
ge_p2_dbl(&t, &Ai_p2[i / 2]);
ge_p1p1_to_cached(&Ai[i], &t);
if (i < 8) {
x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
}
// Ai[i + 1] = A + Ai[i].
x25519_ge_add(&t, A, &Ai[i]);
ge_p1p1_to_cached(&Ai[i + 1], &t);
if (i < 7) {
x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
}
}
ge_p2_0(r);
ge_p3 u;
for (i = 0; i < 256; i += 4) {
// Four doublings; the last result is kept in |ge_p3| form for the addition.
ge_p2_dbl(&t, r);
x25519_ge_p1p1_to_p2(r, &t);
ge_p2_dbl(&t, r);
x25519_ge_p1p1_to_p2(r, &t);
ge_p2_dbl(&t, r);
x25519_ge_p1p1_to_p2(r, &t);
ge_p2_dbl(&t, r);
x25519_ge_p1p1_to_p3(&u, &t);
// When i is a multiple of 8 the shift is 4 (high nibble of the byte);
// otherwise it is 0 (low nibble).
uint8_t index = scalar[31 - i/8];
index >>= 4 - (i & 4);
index &= 0xf;
unsigned j;
ge_cached selected;
ge_cached_0(&selected);
for (j = 0; j < 16; j++) {
cmov_cached(&selected, &Ai[j], equal(j, index));
}
x25519_ge_add(&t, &u, &selected);
x25519_ge_p1p1_to_p2(r, &t);
}
}
// slide recodes the 256-bit little-endian value in the 32 bytes at |a| into
// 256 signed digits r[0..255] such that sum(r[i] * 2^i) equals the input.
// Each digit is either zero or odd with absolute value at most 15.
//
// The bits are expanded one per digit and then, scanning from the bottom,
// each non-zero digit absorbs set bits up to six positions above it. A bit is
// either added in (if the digit stays <= 15) or subtracted (if the digit stays
// >= -15), in which case a carry of one is propagated upward from that bit.
static void slide(signed char *r, const uint8_t *a) {
  for (int bit = 0; bit < 256; bit++) {
    r[bit] = (signed char)((a[bit / 8] >> (bit % 8)) & 1);
  }

  for (int pos = 0; pos < 256; pos++) {
    if (r[pos] == 0) {
      continue;
    }

    for (int off = 1; off <= 6 && pos + off < 256; off++) {
      const int next = pos + off;
      if (r[next] == 0) {
        continue;
      }

      const int term = r[next] << off;
      if (r[pos] + term <= 15) {
        // Fold the higher bit into this digit.
        r[pos] = (signed char)(r[pos] + term);
        r[next] = 0;
        continue;
      }
      if (r[pos] - term < -15) {
        // Neither adding nor subtracting fits; this digit is finished.
        break;
      }

      // Subtract the higher bit here and carry one upward: clear the run of
      // set digits starting at |next| and set the first zero digit after it.
      r[pos] = (signed char)(r[pos] - term);
      int k = next;
      while (k < 256 && r[k] != 0) {
        r[k] = 0;
        k++;
      }
      if (k < 256) {
        r[k] = 1;
      }
    }
  }
}
// r = a * A + b * B
// where a = a[0]+256*a[1]+...+256^31 a[31].
// and b = b[0]+256*b[1]+...+256^31 b[31].
// B is the Ed25519 base point (x,4/5) with x positive.
//
// Both scalars are recoded with |slide| into signed odd digits. Odd multiples
// of A are computed into a local table, while multiples of B come from the
// |Bi| table. The digits are then processed from the most significant
// non-zero position downward with one doubling per position. As the name
// says, the control flow depends on the scalar digits.
static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
                                         const ge_p3 *A, const uint8_t *b) {
  signed char aslide[256];
  signed char bslide[256];
  ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
  ge_p1p1 t;
  ge_p3 u;
  ge_p3 A2;
  int i;

  slide(aslide, a);
  slide(bslide, b);

  // Ai[0] = A; each following entry is the previous one plus 2A.
  x25519_ge_p3_to_cached(&Ai[0], A);
  ge_p3_dbl(&t, A);
  x25519_ge_p1p1_to_p3(&A2, &t);
  for (i = 1; i < 8; i++) {
    x25519_ge_add(&t, &A2, &Ai[i - 1]);
    x25519_ge_p1p1_to_p3(&u, &t);
    x25519_ge_p3_to_cached(&Ai[i], &u);
  }

  ge_p2_0(r);

  // Skip leading positions where both digits are zero.
  i = 255;
  while (i >= 0 && !aslide[i] && !bslide[i]) {
    --i;
  }

  while (i >= 0) {
    const int a_digit = aslide[i];
    const int b_digit = bslide[i];

    ge_p2_dbl(&t, r);

    if (a_digit > 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      x25519_ge_add(&t, &u, &Ai[a_digit / 2]);
    } else if (a_digit < 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      x25519_ge_sub(&t, &u, &Ai[(-a_digit) / 2]);
    }

    if (b_digit > 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      ge_madd(&t, &u, &Bi[b_digit / 2]);
    } else if (b_digit < 0) {
      x25519_ge_p1p1_to_p3(&u, &t);
      ge_msub(&t, &u, &Bi[(-b_digit) / 2]);
    }

    x25519_ge_p1p1_to_p2(r, &t);
    --i;
  }
}
// The set of scalars is \Z/l
// where l = 2^252 + 27742317777372353535851937790883648493.

// x25519_sc_reduce reduces a 64-byte little-endian integer modulo l.
//
// Input:
//   s[0]+256*s[1]+...+256^63*s[63] = s
//
// Output:
//   s[0]+256*s[1]+...+256^31*s[31] = s mod l
//   where l = 2^252 + 27742317777372353535851937790883648493.
//   Overwrites s in place.
//
// The input is split into twenty-four 21-bit limbs s0..s23 (2097151 is
// 2^21 - 1). Each limb at index 12 or above is folded into the six limbs
// starting twelve positions below it using the fixed multipliers 666643,
// 470296, 654183, -997805, 136657 and -683901, with carry passes in between
// to keep the limbs small.
//
// Intermediate limbs and carries may be negative. Limb corrections therefore
// multiply the carry by 2^21 instead of left-shifting it, because
// left-shifting a negative signed value is undefined behavior in C.
void x25519_sc_reduce(uint8_t s[64]) {
  int64_t s0 = 2097151 & load_3(s);
  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
  int64_t s8 = 2097151 & load_3(s + 21);
  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
  int64_t s16 = 2097151 & load_3(s + 42);
  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
  // The top limb is not masked: it takes all remaining high bits.
  int64_t s23 = (load_4(s + 60) >> 3);
  int64_t carry0;
  int64_t carry1;
  int64_t carry2;
  int64_t carry3;
  int64_t carry4;
  int64_t carry5;
  int64_t carry6;
  int64_t carry7;
  int64_t carry8;
  int64_t carry9;
  int64_t carry10;
  int64_t carry11;
  int64_t carry12;
  int64_t carry13;
  int64_t carry14;
  int64_t carry15;
  int64_t carry16;

  // Fold limbs 23..18 down into limbs 6..16.
  s11 += s23 * 666643;
  s12 += s23 * 470296;
  s13 += s23 * 654183;
  s14 -= s23 * 997805;
  s15 += s23 * 136657;
  s16 -= s23 * 683901;
  s23 = 0;

  s10 += s22 * 666643;
  s11 += s22 * 470296;
  s12 += s22 * 654183;
  s13 -= s22 * 997805;
  s14 += s22 * 136657;
  s15 -= s22 * 683901;
  s22 = 0;

  s9 += s21 * 666643;
  s10 += s21 * 470296;
  s11 += s21 * 654183;
  s12 -= s21 * 997805;
  s13 += s21 * 136657;
  s14 -= s21 * 683901;
  s21 = 0;

  s8 += s20 * 666643;
  s9 += s20 * 470296;
  s10 += s20 * 654183;
  s11 -= s20 * 997805;
  s12 += s20 * 136657;
  s13 -= s20 * 683901;
  s20 = 0;

  s7 += s19 * 666643;
  s8 += s19 * 470296;
  s9 += s19 * 654183;
  s10 -= s19 * 997805;
  s11 += s19 * 136657;
  s12 -= s19 * 683901;
  s19 = 0;

  s6 += s18 * 666643;
  s7 += s18 * 470296;
  s8 += s18 * 654183;
  s9 -= s18 * 997805;
  s10 += s18 * 136657;
  s11 -= s18 * 683901;
  s18 = 0;

  // Rounded carries (note the added 2^20): even limbs first, then odd limbs.
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);
  carry12 = (s12 + (1 << 20)) >> 21;
  s13 += carry12;
  s12 -= carry12 * (1 << 21);
  carry14 = (s14 + (1 << 20)) >> 21;
  s15 += carry14;
  s14 -= carry14 * (1 << 21);
  carry16 = (s16 + (1 << 20)) >> 21;
  s17 += carry16;
  s16 -= carry16 * (1 << 21);

  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);
  carry13 = (s13 + (1 << 20)) >> 21;
  s14 += carry13;
  s13 -= carry13 * (1 << 21);
  carry15 = (s15 + (1 << 20)) >> 21;
  s16 += carry15;
  s15 -= carry15 * (1 << 21);

  // Fold limbs 17..12 down into limbs 0..10.
  s5 += s17 * 666643;
  s6 += s17 * 470296;
  s7 += s17 * 654183;
  s8 -= s17 * 997805;
  s9 += s17 * 136657;
  s10 -= s17 * 683901;
  s17 = 0;

  s4 += s16 * 666643;
  s5 += s16 * 470296;
  s6 += s16 * 654183;
  s7 -= s16 * 997805;
  s8 += s16 * 136657;
  s9 -= s16 * 683901;
  s16 = 0;

  s3 += s15 * 666643;
  s4 += s15 * 470296;
  s5 += s15 * 654183;
  s6 -= s15 * 997805;
  s7 += s15 * 136657;
  s8 -= s15 * 683901;
  s15 = 0;

  s2 += s14 * 666643;
  s3 += s14 * 470296;
  s4 += s14 * 654183;
  s5 -= s14 * 997805;
  s6 += s14 * 136657;
  s7 -= s14 * 683901;
  s14 = 0;

  s1 += s13 * 666643;
  s2 += s13 * 470296;
  s3 += s13 * 654183;
  s4 -= s13 * 997805;
  s5 += s13 * 136657;
  s6 -= s13 * 683901;
  s13 = 0;

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Rounded carries over limbs 0..11; the carry out of limb 11 lands in s12.
  carry0 = (s0 + (1 << 20)) >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry2 = (s2 + (1 << 20)) >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry4 = (s4 + (1 << 20)) >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);

  carry1 = (s1 + (1 << 20)) >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry3 = (s3 + (1 << 20)) >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry5 = (s5 + (1 << 20)) >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Sequential carry chain without rounding.
  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);
  carry11 = s11 >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Final carry chain; this one stops at limb 11.
  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);

  // Repack the twelve 21-bit limbs into 32 little-endian bytes.
  s[0] = s0 >> 0;
  s[1] = s0 >> 8;
  s[2] = (s0 >> 16) | (s1 << 5);
  s[3] = s1 >> 3;
  s[4] = s1 >> 11;
  s[5] = (s1 >> 19) | (s2 << 2);
  s[6] = s2 >> 6;
  s[7] = (s2 >> 14) | (s3 << 7);
  s[8] = s3 >> 1;
  s[9] = s3 >> 9;
  s[10] = (s3 >> 17) | (s4 << 4);
  s[11] = s4 >> 4;
  s[12] = s4 >> 12;
  s[13] = (s4 >> 20) | (s5 << 1);
  s[14] = s5 >> 7;
  s[15] = (s5 >> 15) | (s6 << 6);
  s[16] = s6 >> 2;
  s[17] = s6 >> 10;
  s[18] = (s6 >> 18) | (s7 << 3);
  s[19] = s7 >> 5;
  s[20] = s7 >> 13;
  s[21] = s8 >> 0;
  s[22] = s8 >> 8;
  s[23] = (s8 >> 16) | (s9 << 5);
  s[24] = s9 >> 3;
  s[25] = s9 >> 11;
  s[26] = (s9 >> 19) | (s10 << 2);
  s[27] = s10 >> 6;
  s[28] = (s10 >> 14) | (s11 << 7);
  s[29] = s11 >> 1;
  s[30] = s11 >> 9;
  s[31] = s11 >> 17;
}
// sc_muladd computes (a*b + c) mod l for 32-byte little-endian scalars.
//
// Input:
//   a[0]+256*a[1]+...+256^31*a[31] = a
//   b[0]+256*b[1]+...+256^31*b[31] = b
//   c[0]+256*c[1]+...+256^31*c[31] = c
//
// Output:
//   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
//   where l = 2^252 + 27742317777372353535851937790883648493.
//
// Each input is split into twelve 21-bit limbs (2097151 is 2^21 - 1). The
// schoolbook product plus |c| gives limbs s0..s22, which are carried and then
// folded down with the fixed multipliers 666643, 470296, 654183, -997805,
// 136657 and -683901.
//
// Intermediate limbs and carries may be negative. Limb corrections therefore
// multiply the carry by 2^21 instead of left-shifting it, because
// left-shifting a negative signed value is undefined behavior in C.
static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
                      const uint8_t *c) {
  int64_t a0 = 2097151 & load_3(a);
  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
  int64_t a8 = 2097151 & load_3(a + 21);
  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
  // The top limb of each input is not masked: it takes the remaining bits.
  int64_t a11 = (load_4(a + 28) >> 7);
  int64_t b0 = 2097151 & load_3(b);
  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
  int64_t b8 = 2097151 & load_3(b + 21);
  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
  int64_t b11 = (load_4(b + 28) >> 7);
  int64_t c0 = 2097151 & load_3(c);
  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
  int64_t c8 = 2097151 & load_3(c + 21);
  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
  int64_t c11 = (load_4(c + 28) >> 7);
  int64_t s0;
  int64_t s1;
  int64_t s2;
  int64_t s3;
  int64_t s4;
  int64_t s5;
  int64_t s6;
  int64_t s7;
  int64_t s8;
  int64_t s9;
  int64_t s10;
  int64_t s11;
  int64_t s12;
  int64_t s13;
  int64_t s14;
  int64_t s15;
  int64_t s16;
  int64_t s17;
  int64_t s18;
  int64_t s19;
  int64_t s20;
  int64_t s21;
  int64_t s22;
  int64_t s23;
  int64_t carry0;
  int64_t carry1;
  int64_t carry2;
  int64_t carry3;
  int64_t carry4;
  int64_t carry5;
  int64_t carry6;
  int64_t carry7;
  int64_t carry8;
  int64_t carry9;
  int64_t carry10;
  int64_t carry11;
  int64_t carry12;
  int64_t carry13;
  int64_t carry14;
  int64_t carry15;
  int64_t carry16;
  int64_t carry17;
  int64_t carry18;
  int64_t carry19;
  int64_t carry20;
  int64_t carry21;
  int64_t carry22;

  // Schoolbook product: s_k = c_k + sum of a_i * b_j over i + j = k.
  s0 = c0 + a0 * b0;
  s1 = c1 + a0 * b1 + a1 * b0;
  s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
  s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
  s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
  s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
  s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
  s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
       a6 * b1 + a7 * b0;
  s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
       a6 * b2 + a7 * b1 + a8 * b0;
  s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
       a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
  s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
        a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
  s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
        a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
  s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
        a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
  s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
        a9 * b4 + a10 * b3 + a11 * b2;
  s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
        a10 * b4 + a11 * b3;
  s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
        a11 * b4;
  s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
  s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
  s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
  s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
  s20 = a9 * b11 + a10 * b10 + a11 * b9;
  s21 = a10 * b11 + a11 * b10;
  s22 = a11 * b11;
  s23 = 0;

  // Rounded carries (note the added 2^20): even limbs first, then odd limbs.
  carry0 = (s0 + (1 << 20)) >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry2 = (s2 + (1 << 20)) >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry4 = (s4 + (1 << 20)) >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);
  carry12 = (s12 + (1 << 20)) >> 21;
  s13 += carry12;
  s12 -= carry12 * (1 << 21);
  carry14 = (s14 + (1 << 20)) >> 21;
  s15 += carry14;
  s14 -= carry14 * (1 << 21);
  carry16 = (s16 + (1 << 20)) >> 21;
  s17 += carry16;
  s16 -= carry16 * (1 << 21);
  carry18 = (s18 + (1 << 20)) >> 21;
  s19 += carry18;
  s18 -= carry18 * (1 << 21);
  carry20 = (s20 + (1 << 20)) >> 21;
  s21 += carry20;
  s20 -= carry20 * (1 << 21);
  carry22 = (s22 + (1 << 20)) >> 21;
  s23 += carry22;
  s22 -= carry22 * (1 << 21);

  carry1 = (s1 + (1 << 20)) >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry3 = (s3 + (1 << 20)) >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry5 = (s5 + (1 << 20)) >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);
  carry13 = (s13 + (1 << 20)) >> 21;
  s14 += carry13;
  s13 -= carry13 * (1 << 21);
  carry15 = (s15 + (1 << 20)) >> 21;
  s16 += carry15;
  s15 -= carry15 * (1 << 21);
  carry17 = (s17 + (1 << 20)) >> 21;
  s18 += carry17;
  s17 -= carry17 * (1 << 21);
  carry19 = (s19 + (1 << 20)) >> 21;
  s20 += carry19;
  s19 -= carry19 * (1 << 21);
  carry21 = (s21 + (1 << 20)) >> 21;
  s22 += carry21;
  s21 -= carry21 * (1 << 21);

  // Fold limbs 23..18 down into limbs 6..16.
  s11 += s23 * 666643;
  s12 += s23 * 470296;
  s13 += s23 * 654183;
  s14 -= s23 * 997805;
  s15 += s23 * 136657;
  s16 -= s23 * 683901;
  s23 = 0;

  s10 += s22 * 666643;
  s11 += s22 * 470296;
  s12 += s22 * 654183;
  s13 -= s22 * 997805;
  s14 += s22 * 136657;
  s15 -= s22 * 683901;
  s22 = 0;

  s9 += s21 * 666643;
  s10 += s21 * 470296;
  s11 += s21 * 654183;
  s12 -= s21 * 997805;
  s13 += s21 * 136657;
  s14 -= s21 * 683901;
  s21 = 0;

  s8 += s20 * 666643;
  s9 += s20 * 470296;
  s10 += s20 * 654183;
  s11 -= s20 * 997805;
  s12 += s20 * 136657;
  s13 -= s20 * 683901;
  s20 = 0;

  s7 += s19 * 666643;
  s8 += s19 * 470296;
  s9 += s19 * 654183;
  s10 -= s19 * 997805;
  s11 += s19 * 136657;
  s12 -= s19 * 683901;
  s19 = 0;

  s6 += s18 * 666643;
  s7 += s18 * 470296;
  s8 += s18 * 654183;
  s9 -= s18 * 997805;
  s10 += s18 * 136657;
  s11 -= s18 * 683901;
  s18 = 0;

  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);
  carry12 = (s12 + (1 << 20)) >> 21;
  s13 += carry12;
  s12 -= carry12 * (1 << 21);
  carry14 = (s14 + (1 << 20)) >> 21;
  s15 += carry14;
  s14 -= carry14 * (1 << 21);
  carry16 = (s16 + (1 << 20)) >> 21;
  s17 += carry16;
  s16 -= carry16 * (1 << 21);

  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);
  carry13 = (s13 + (1 << 20)) >> 21;
  s14 += carry13;
  s13 -= carry13 * (1 << 21);
  carry15 = (s15 + (1 << 20)) >> 21;
  s16 += carry15;
  s15 -= carry15 * (1 << 21);

  // Fold limbs 17..12 down into limbs 0..10.
  s5 += s17 * 666643;
  s6 += s17 * 470296;
  s7 += s17 * 654183;
  s8 -= s17 * 997805;
  s9 += s17 * 136657;
  s10 -= s17 * 683901;
  s17 = 0;

  s4 += s16 * 666643;
  s5 += s16 * 470296;
  s6 += s16 * 654183;
  s7 -= s16 * 997805;
  s8 += s16 * 136657;
  s9 -= s16 * 683901;
  s16 = 0;

  s3 += s15 * 666643;
  s4 += s15 * 470296;
  s5 += s15 * 654183;
  s6 -= s15 * 997805;
  s7 += s15 * 136657;
  s8 -= s15 * 683901;
  s15 = 0;

  s2 += s14 * 666643;
  s3 += s14 * 470296;
  s4 += s14 * 654183;
  s5 -= s14 * 997805;
  s6 += s14 * 136657;
  s7 -= s14 * 683901;
  s14 = 0;

  s1 += s13 * 666643;
  s2 += s13 * 470296;
  s3 += s13 * 654183;
  s4 -= s13 * 997805;
  s5 += s13 * 136657;
  s6 -= s13 * 683901;
  s13 = 0;

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Rounded carries over limbs 0..11; the carry out of limb 11 lands in s12.
  carry0 = (s0 + (1 << 20)) >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry2 = (s2 + (1 << 20)) >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry4 = (s4 + (1 << 20)) >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry6 = (s6 + (1 << 20)) >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry8 = (s8 + (1 << 20)) >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry10 = (s10 + (1 << 20)) >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);

  carry1 = (s1 + (1 << 20)) >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry3 = (s3 + (1 << 20)) >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry5 = (s5 + (1 << 20)) >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry7 = (s7 + (1 << 20)) >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry9 = (s9 + (1 << 20)) >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry11 = (s11 + (1 << 20)) >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Sequential carry chain without rounding.
  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);
  carry11 = s11 >> 21;
  s12 += carry11;
  s11 -= carry11 * (1 << 21);

  s0 += s12 * 666643;
  s1 += s12 * 470296;
  s2 += s12 * 654183;
  s3 -= s12 * 997805;
  s4 += s12 * 136657;
  s5 -= s12 * 683901;
  s12 = 0;

  // Final carry chain; this one stops at limb 11.
  carry0 = s0 >> 21;
  s1 += carry0;
  s0 -= carry0 * (1 << 21);
  carry1 = s1 >> 21;
  s2 += carry1;
  s1 -= carry1 * (1 << 21);
  carry2 = s2 >> 21;
  s3 += carry2;
  s2 -= carry2 * (1 << 21);
  carry3 = s3 >> 21;
  s4 += carry3;
  s3 -= carry3 * (1 << 21);
  carry4 = s4 >> 21;
  s5 += carry4;
  s4 -= carry4 * (1 << 21);
  carry5 = s5 >> 21;
  s6 += carry5;
  s5 -= carry5 * (1 << 21);
  carry6 = s6 >> 21;
  s7 += carry6;
  s6 -= carry6 * (1 << 21);
  carry7 = s7 >> 21;
  s8 += carry7;
  s7 -= carry7 * (1 << 21);
  carry8 = s8 >> 21;
  s9 += carry8;
  s8 -= carry8 * (1 << 21);
  carry9 = s9 >> 21;
  s10 += carry9;
  s9 -= carry9 * (1 << 21);
  carry10 = s10 >> 21;
  s11 += carry10;
  s10 -= carry10 * (1 << 21);

  // Repack the twelve 21-bit limbs into 32 little-endian bytes.
  s[0] = s0 >> 0;
  s[1] = s0 >> 8;
  s[2] = (s0 >> 16) | (s1 << 5);
  s[3] = s1 >> 3;
  s[4] = s1 >> 11;
  s[5] = (s1 >> 19) | (s2 << 2);
  s[6] = s2 >> 6;
  s[7] = (s2 >> 14) | (s3 << 7);
  s[8] = s3 >> 1;
  s[9] = s3 >> 9;
  s[10] = (s3 >> 17) | (s4 << 4);
  s[11] = s4 >> 4;
  s[12] = s4 >> 12;
  s[13] = (s4 >> 20) | (s5 << 1);
  s[14] = s5 >> 7;
  s[15] = (s5 >> 15) | (s6 << 6);
  s[16] = s6 >> 2;
  s[17] = s6 >> 10;
  s[18] = (s6 >> 18) | (s7 << 3);
  s[19] = s7 >> 5;
  s[20] = s7 >> 13;
  s[21] = s8 >> 0;
  s[22] = s8 >> 8;
  s[23] = (s8 >> 16) | (s9 << 5);
  s[24] = s9 >> 3;
  s[25] = s9 >> 11;
  s[26] = (s9 >> 19) | (s10 << 2);
  s[27] = s10 >> 6;
  s[28] = (s10 >> 14) | (s11 << 7);
  s[29] = s11 >> 1;
  s[30] = s11 >> 9;
  s[31] = s11 >> 17;
}
// ED25519_keypair fills a 32-byte seed from RAND_bytes and derives the key
// pair from it with ED25519_keypair_from_seed.
void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
  uint8_t random_seed[32];
  RAND_bytes(random_seed, sizeof(random_seed));
  ED25519_keypair_from_seed(out_public_key, out_private_key, random_seed);
}
// ED25519_sign writes a 64-byte signature of |message| to |out_sig|. The first
// 32 bytes of |private_key| are hashed to obtain the secret scalar and the
// nonce prefix; the last 32 bytes are hashed in as the public key. It always
// returns one.
int ED25519_sign(uint8_t out_sig[64], const uint8_t *message,
                 size_t message_len, const uint8_t private_key[64]) {
  // NOTE: The documentation on this function says that it returns zero on
  // allocation failure. While that can't happen with the current
  // implementation, we want to reserve the ability to allocate in this
  // implementation in the future.

  // expanded[0..31] becomes the clamped secret scalar and expanded[32..63] is
  // the prefix that is hashed with the message to derive the nonce.
  uint8_t expanded[SHA512_DIGEST_LENGTH];
  SHA512(private_key, 32, expanded);
  expanded[0] &= 248;
  expanded[31] = (expanded[31] & 63) | 64;

  uint8_t nonce[SHA512_DIGEST_LENGTH];
  uint8_t hram[SHA512_DIGEST_LENGTH];
  SHA512_CTX hash_ctx;

  // nonce = SHA-512(prefix || message), reduced by x25519_sc_reduce.
  SHA512_Init(&hash_ctx);
  SHA512_Update(&hash_ctx, expanded + 32, 32);
  SHA512_Update(&hash_ctx, message, message_len);
  SHA512_Final(nonce, &hash_ctx);
  x25519_sc_reduce(nonce);

  // The first half of the signature is the encoding of nonce * base point.
  ge_p3 R;
  x25519_ge_scalarmult_base(&R, nonce);
  ge_p3_tobytes(out_sig, &R);

  // hram = SHA-512(R || public key || message), reduced by x25519_sc_reduce.
  SHA512_Init(&hash_ctx);
  SHA512_Update(&hash_ctx, out_sig, 32);
  SHA512_Update(&hash_ctx, private_key + 32, 32);
  SHA512_Update(&hash_ctx, message, message_len);
  SHA512_Final(hram, &hash_ctx);
  x25519_sc_reduce(hram);

  // The second half is hram * secret scalar + nonce, as computed by sc_muladd.
  sc_muladd(out_sig + 32, hram, expanded, nonce);

  return 1;
}
// ED25519_verify returns one if |signature| is a valid signature of |message|
// under |public_key| and zero otherwise. The signature is R (32 bytes)
// followed by the scalar S (32 bytes, little-endian).
int ED25519_verify(const uint8_t *message, size_t message_len,
                   const uint8_t signature[64], const uint8_t public_key[32]) {
  ge_p3 A;
  // Reject an S with any of its top three bits set, and public keys that
  // x25519_ge_frombytes_vartime refuses to decode.
  if ((signature[63] & 224) != 0 ||
      !x25519_ge_frombytes_vartime(&A, public_key)) {
    return 0;
  }

  // Negate the X and T coordinates of A, so the double scalar multiplication
  // below works with -A.
  fe_loose t;
  fe_neg(&t, &A.X);
  fe_carry(&A.X, &t);
  fe_neg(&t, &A.T);
  fe_carry(&A.T, &t);

  uint8_t rcopy[32];
  OPENSSL_memcpy(rcopy, signature, 32);
  uint8_t scopy[32];
  OPENSSL_memcpy(scopy, signature + 32, 32);

  // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
  // the range [0, order) in order to prevent signature malleability.

  // kOrder is the group order l = 2^252 +
  // 27742317777372353535851937790883648493 as four 64-bit words, least
  // significant word first.
  static const uint64_t kOrder[4] = {
    UINT64_C(0x5812631a5cf5d3ed),
    UINT64_C(0x14def9dea2f79cd6),
    0,
    UINT64_C(0x1000000000000000),
  };
  // Compare S against kOrder from the most significant word down. Each word
  // is assembled explicitly from little-endian bytes so the check does not
  // depend on the byte order of the host.
  for (size_t i = 3;; i--) {
    uint64_t word = 0;
    for (size_t j = 0; j < 8; j++) {
      word |= (uint64_t)scopy[8 * i + j] << (8 * j);
    }
    if (word > kOrder[i]) {
      return 0;
    } else if (word < kOrder[i]) {
      break;
    } else if (i == 0) {
      // Every word matched, so S equals the order: out of range.
      return 0;
    }
  }

  // h = SHA-512(R || public key || message), reduced by x25519_sc_reduce.
  SHA512_CTX hash_ctx;
  SHA512_Init(&hash_ctx);
  SHA512_Update(&hash_ctx, signature, 32);
  SHA512_Update(&hash_ctx, public_key, 32);
  SHA512_Update(&hash_ctx, message, message_len);
  uint8_t h[SHA512_DIGEST_LENGTH];
  SHA512_Final(h, &hash_ctx);

  x25519_sc_reduce(h);

  // With A negated above, this computes S*B - h*A; the signature is accepted
  // when its encoding equals the R carried in the signature.
  ge_p2 R;
  ge_double_scalarmult_vartime(&R, h, &A, scopy);

  uint8_t rcheck[32];
  x25519_ge_tobytes(rcheck, &R);

  return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
}
// ED25519_keypair_from_seed derives an Ed25519 key pair from |seed|. The
// public key is the encoding of the base point multiplied by the clamped
// SHA-512 hash of the seed; the private key is the seed followed by the
// public key.
void ED25519_keypair_from_seed(uint8_t out_public_key[32],
                               uint8_t out_private_key[64],
                               const uint8_t seed[32]) {
  uint8_t expanded[SHA512_DIGEST_LENGTH];
  SHA512(seed, 32, expanded);

  // Clamp: clear the low three bits and bit 255, set bit 254.
  expanded[0] &= 248;
  expanded[31] = (expanded[31] & 127) | 64;

  ge_p3 public_point;
  x25519_ge_scalarmult_base(&public_point, expanded);
  ge_p3_tobytes(out_public_key, &public_point);

  OPENSSL_memcpy(out_private_key, seed, 32);
  OPENSSL_memcpy(out_private_key + 32, out_public_key, 32);
}
// x25519_scalar_mult_generic runs a 255-step Montgomery ladder over a clamped
// copy of |scalar|, starting from the field element decoded from |point|, and
// writes the encoding of the resulting x2/z2 to |out|.
static void x25519_scalar_mult_generic(uint8_t out[32],
const uint8_t scalar[32],
const uint8_t point[32]) {
fe x1, x2, z2, x3, z3, tmp0, tmp1;
fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
// Work on a private copy of the scalar, then clamp it: clear the low three
// bits, clear bit 255 and set bit 254.
uint8_t e[32];
OPENSSL_memcpy(e, scalar, 32);
e[0] &= 248;
e[31] &= 127;
e[31] |= 64;
// The following implementation was transcribed to Coq and proven to
// correspond to unary scalar multiplication in affine coordinates given that
// x1 != 0 is the x coordinate of some point on the curve. It was also checked
// in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
// = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
// underlying field, so it applies to Curve25519 itself and the quadratic
// twist of Curve25519. It was not proven in Coq that prime-field arithmetic
// correctly simulates extension-field arithmetic on prime-field values.
// The decoding of the byte array representation of e was not considered.
// Specification of Montgomery curves in affine coordinates:
// <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
// Proof that these form a group that is isomorphic to a Weierstrass curve:
// <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
// Coq transcription and correctness proof of the loop (where scalarbits=255):
// <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
// <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
// preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
// Initial ladder state: (x2 : z2) = (1 : 0) and (x3 : z3) = (x1 : 1).
fe_frombytes(&x1, point);
fe_1(&x2);
fe_0(&z2);
fe_copy(&x3, &x1);
fe_1(&z3);
unsigned swap = 0;
int pos;
// Process bits 254 down to 0 of the clamped scalar.
for (pos = 254; pos >= 0; --pos) {
// loop invariant as of right before the test, for the case where x1 != 0:
// pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
// let r := e >> (pos+1) in the following equalities of projective points:
// to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
// to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
// x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
// Swapping is deferred: the pairs are exchanged only when this bit differs
// from the previous one, and the current bit is remembered in |swap|.
unsigned b = 1 & (e[pos / 8] >> (pos & 7));
swap ^= b;
fe_cswap(&x2, &x3, swap);
fe_cswap(&z2, &z3, swap);
swap = b;
// Coq transcription of ladderstep formula (called from transcribed loop):
// <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
// <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
// x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
// x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
// NOTE(review): the statement order below mirrors the Coq transcription
// referenced above; do not reorder without re-checking against it.
fe_sub(&tmp0l, &x3, &z3);
fe_sub(&tmp1l, &x2, &z2);
fe_add(&x2l, &x2, &z2);
fe_add(&z2l, &x3, &z3);
fe_mul_tll(&z3, &tmp0l, &x2l);
fe_mul_tll(&z2, &z2l, &tmp1l);
fe_sq_tl(&tmp0, &tmp1l);
fe_sq_tl(&tmp1, &x2l);
fe_add(&x3l, &z3, &z2);
fe_sub(&z2l, &z3, &z2);
fe_mul_ttt(&x2, &tmp1, &tmp0);
fe_sub(&tmp1l, &tmp1, &tmp0);
fe_sq_tl(&z2, &z2l);
fe_mul121666(&z3, &tmp1l);
fe_sq_tl(&x3, &x3l);
fe_add(&tmp0l, &tmp0, &z3);
fe_mul_ttt(&z3, &x1, &z2);
fe_mul_tll(&z2, &tmp1l, &tmp0l);
}
// here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
// Apply the swap still pending from the last bit.
fe_cswap(&x2, &x3, swap);
fe_cswap(&z2, &z3, swap);
// Leave projective form: x2 <- x2 * z2^-1, then serialize it.
fe_invert(&z2, &z2);
fe_mul_ttt(&x2, &x2, &z2);
fe_tobytes(out, &x2);
}
// x25519_scalar_mult dispatches to the NEON implementation when it is compiled
// in and CRYPTO_is_NEON_capable() reports support, and to the generic
// implementation otherwise.
static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
                               const uint8_t point[32]) {
#if defined(BORINGSSL_X25519_NEON)
  if (!CRYPTO_is_NEON_capable()) {
    x25519_scalar_mult_generic(out, scalar, point);
    return;
  }
  x25519_NEON(out, scalar, point);
#else
  x25519_scalar_mult_generic(out, scalar, point);
#endif
}
// X25519_keypair fills |out_private_key| with random bytes, deliberately
// "anti-clamps" them, and writes the matching public value to
// |out_public_value|.
void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
  RAND_bytes(out_private_key, 32);

  // Every X25519 implementation is supposed to mask scalars while decoding
  // them (see https://tools.ietf.org/html/rfc7748#section-5). One that
  // forgets to would still interoperate with random keys some fraction of the
  // time, whenever a key happens to be correctly formed already.
  //
  // So the private key is forced into the opposite of the masked form: the
  // low three bits and the top bit are set and bit 6 of the last byte is
  // cleared. Private keys are then never correctly masked and, hopefully, any
  // incorrect implementation is deterministically broken.
  //
  // No security is lost. The entropy thrown away here lives in exactly the
  // bits that a valid scalarmult implementation discards anyway.
  out_private_key[0] |= 0x07;
  out_private_key[31] &= 0xbf;
  out_private_key[31] |= 0x80;

  X25519_public_from_private(out_public_value, out_private_key);
}
// X25519 writes the scalar multiplication of |peer_public_value| by
// |private_key| to |out_shared_key|. It returns one on success and zero if
// the result is all zeros.
int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
           const uint8_t peer_public_value[32]) {
  static const uint8_t kAllZero[32] = {0};

  x25519_scalar_mult(out_shared_key, private_key, peer_public_value);

  // An all-zero output results when the input is a point of small order.
  const int is_all_zero = CRYPTO_memcmp(kAllZero, out_shared_key, 32) == 0;
  return !is_all_zero;
}
// X25519_public_from_private writes to |out_public_value| the X25519 public
// value for |private_key|. Without NEON it multiplies the Edwards base point
// by the clamped scalar and maps the result to its Montgomery u-coordinate.
void X25519_public_from_private(uint8_t out_public_value[32],
                                const uint8_t private_key[32]) {
#if defined(BORINGSSL_X25519_NEON)
  if (CRYPTO_is_NEON_capable()) {
    static const uint8_t kMontgomeryBasePoint[32] = {9};
    x25519_NEON(out_public_value, private_key, kMontgomeryBasePoint);
    return;
  }
#endif

  // Clamp a private copy of the scalar: clear the low three bits and bit 255,
  // set bit 254.
  uint8_t clamped[32];
  OPENSSL_memcpy(clamped, private_key, 32);
  clamped[0] &= 248;
  clamped[31] = (clamped[31] & 127) | 64;

  ge_p3 point;
  x25519_ge_scalarmult_base(&point, clamped);

  // Only the u-coordinate of the curve25519 point is needed. The map is
  // u = (y+1)/(1-y); substituting y = Y/Z gives u = (Z+Y)/(Z-Y).
  fe_loose numerator, denominator;
  fe u;
  fe_add(&numerator, &point.Z, &point.Y);
  fe_sub(&denominator, &point.Z, &point.Y);
  // |u| first holds 1/(Z-Y) and is then multiplied in place by Z+Y.
  fe_loose_invert(&u, &denominator);
  fe_mul_tlt(&u, &numerator, &u);
  fe_tobytes(out_public_value, &u);
}