Add a tuned variable-time P-256 multiplication function.

This reuses wnaf.c's window scheduling, but has access to the tuned field
arithmetic and the precomputed base-point table. Unlike wnaf.c, we do not make
the points affine, as it's not worth it for a single table. (We already
precomputed the base-point table.)

Annoyingly, 32-bit x86 gets a bit slower, but the other platforms are faster.
My guess is that the generic code gets to use the bn_mul_mont assembly and the
compiler, faced with the increased 32-bit register pressure and the extremely
register-poor x86, is making bad decisions on the otherwise P-256-tuned C code.
The three platforms that see much larger gains are significantly more important
than 32-bit x86 at this point, so go with this change.

armv7a (Nexus 5X) before/after [+14.4%]:
  Did 2703 ECDSA P-256 verify operations in 5034539us (536.9 ops/sec)
  Did 3127 ECDSA P-256 verify operations in 5091379us (614.2 ops/sec)

aarch64 (Nexus 5X) before/after [+9.2%]:
  Did 6783 ECDSA P-256 verify operations in 5031324us (1348.2 ops/sec)
  Did 7410 ECDSA P-256 verify operations in 5033291us (1472.2 ops/sec)

x86 before/after [-2.7%]:
  Did 8961 ECDSA P-256 verify operations in 10075901us (889.3 ops/sec)
  Did 8568 ECDSA P-256 verify operations in 10003001us (856.5 ops/sec)

x86_64 before/after [+8.6%]:
  Did 29808 ECDSA P-256 verify operations in 10008662us (2978.2 ops/sec)
  Did 32528 ECDSA P-256 verify operations in 10057137us (3234.3 ops/sec)

Change-Id: I5fa643149f5bfbbda9533e3008baadfee9979b93
Reviewed-on: https://boringssl-review.googlesource.com/25684
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
parent 6e4ff114fc
commit 638a408cd2
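For readers unfamiliar with the recoding that wnaf.c's window scheduling is built on, here is a minimal standalone sketch of a textbook width-(w+1) NAF recoder, matching the digit invariant documented in the header comment below (each digit is zero or odd with absolute value less than 2^w). This is an illustration only, not BoringSSL's implementation: it omits the "modified" most-significant-digit tweak and works on a plain uint32_t rather than an EC_SCALAR.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Toy width-(w+1) NAF recoder: each digit of |out| is zero or odd with
// absolute value less than 2^w, and scalar = \sum_j out[j] * 2^j.
static void toy_wNAF(uint32_t scalar, int w, int8_t *out, size_t len) {
  uint32_t window = 1u << (w + 1);
  for (size_t j = 0; j < len; j++) {
    out[j] = 0;
    if (scalar & 1) {
      // Take scalar mod 2^(w+1), mapped into the odd range (-2^w, 2^w).
      int digit = (int)(scalar & (window - 1));
      if (digit >= (int)(window / 2)) {
        digit -= (int)window;
      }
      out[j] = (int8_t)digit;
      scalar -= (uint32_t)digit;  // Wraps correctly for negative digits.
    }
    scalar >>= 1;
  }
}

int main(void) {
  int8_t digits[33] = {0};  // |bits| + 1 entries for a 32-bit scalar.
  toy_wNAF(0xbeef, 4, digits, 33);
  int64_t sum = 0;
  for (int j = 32; j >= 0; j--) {
    sum = 2 * sum + digits[j];  // Horner: sum = \sum_j digits[j] * 2^j.
  }
  assert(sum == 0xbeef);
  printf("wNAF round-trip ok\n");
  return 0;
}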
@@ -208,6 +208,17 @@ int ec_point_mul_scalar_public(const EC_GROUP *group, EC_POINT *r,
                                const EC_SCALAR *g_scalar, const EC_POINT *p,
                                const EC_SCALAR *p_scalar, BN_CTX *ctx);
 
+// ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
+// |scalar| to |out| and returns one on success or zero on internal error. |out|
+// must have room for |bits| + 1 elements, each of which will be either zero or
+// odd with an absolute value less than 2^w satisfying
+//     scalar = \sum_j out[j]*2^j
+// where at most one of any w+1 consecutive digits is non-zero
+// with the exception that the most significant digit may be only
+// w-1 zeros away from that next non-zero digit.
+int ec_compute_wNAF(const EC_GROUP *group, int8_t *out, const EC_SCALAR *scalar,
+                    size_t bits, int w);
+
 int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
                 const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
@@ -84,16 +84,8 @@
 // http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13
 // http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf
 
-// compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
-// |scalar| to |out| and returns one on success or zero on internal error. |out|
-// must have room for |bits| + 1 elements, each of which will be either zero or
-// odd with an absolute value less than 2^w satisfying
-//     scalar = \sum_j out[j]*2^j
-// where at most one of any w+1 consecutive digits is non-zero
-// with the exception that the most significant digit may be only
-// w-1 zeros away from that next non-zero digit.
-static int compute_wNAF(const EC_GROUP *group, int8_t *out,
-                        const EC_SCALAR *scalar, size_t bits, int w) {
+int ec_compute_wNAF(const EC_GROUP *group, int8_t *out, const EC_SCALAR *scalar,
+                    size_t bits, int w) {
   // 'int8_t' can represent integers with absolute values less than 2^7.
   if (w <= 0 || w > 7 || bits == 0) {
     OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
@@ -277,7 +269,7 @@ int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
     }
     g_precomp = precomp_storage + total_precomp;
     total_precomp += precomp_len;
-    if (!compute_wNAF(group, g_wNAF, g_scalar, bits, wsize) ||
+    if (!ec_compute_wNAF(group, g_wNAF, g_scalar, bits, wsize) ||
         !compute_precomp(group, g_precomp, g, precomp_len, ctx)) {
       goto err;
     }
@@ -286,7 +278,7 @@ int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
   if (p_scalar != NULL) {
     p_precomp = precomp_storage + total_precomp;
     total_precomp += precomp_len;
-    if (!compute_wNAF(group, p_wNAF, p_scalar, bits, wsize) ||
+    if (!ec_compute_wNAF(group, p_wNAF, p_scalar, bits, wsize) ||
         !compute_precomp(group, p_precomp, p, precomp_len, ctx)) {
       goto err;
     }
third_party/fiat/p256.c (vendored, 100 lines changed)
@@ -1718,6 +1718,95 @@ static int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
   return 1;
 }
 
+static int ec_GFp_nistp256_point_mul_public(const EC_GROUP *group, EC_POINT *r,
+                                            const EC_SCALAR *g_scalar,
+                                            const EC_POINT *p,
+                                            const EC_SCALAR *p_scalar,
+                                            BN_CTX *unused_ctx) {
+#define P256_WSIZE_PUBLIC 4
+  // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|.
+  fe p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3];
+  if (!BN_to_fe(p_pre_comp[0][0], &p->X) ||
+      !BN_to_fe(p_pre_comp[0][1], &p->Y) ||
+      !BN_to_fe(p_pre_comp[0][2], &p->Z)) {
+    return 0;
+  }
+  fe p2[3];
+  point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0], p_pre_comp[0][1],
+               p_pre_comp[0][2]);
+  for (size_t i = 1; i < OPENSSL_ARRAY_SIZE(p_pre_comp); i++) {
+    point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2],
+              p_pre_comp[i - 1][0], p_pre_comp[i - 1][1], p_pre_comp[i - 1][2],
+              0 /* not mixed */, p2[0], p2[1], p2[2]);
+  }
+
+  // Set up the coefficients for |p_scalar|.
+  int8_t p_wNAF[257];
+  if (!ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC)) {
+    return 0;
+  }
+
+  // Set |ret| to the point at infinity.
+  int skip = 1;  // Save some point operations.
+  fe ret[3] = {{0}, {0}, {0}};
+  for (int i = 256; i >= 0; i--) {
+    if (!skip) {
+      point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]);
+    }
+
+    // For the |g_scalar|, we use the precomputed table without the
+    // constant-time lookup.
+    if (i <= 31) {
+      // First, look 32 bits upwards.
+      uint64_t bits = get_bit(g_scalar->bytes, i + 224) << 3;
+      bits |= get_bit(g_scalar->bytes, i + 160) << 2;
+      bits |= get_bit(g_scalar->bytes, i + 96) << 1;
+      bits |= get_bit(g_scalar->bytes, i + 32);
+      point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], 1 /* mixed */,
+                g_pre_comp[1][bits][0], g_pre_comp[1][bits][1],
+                g_pre_comp[1][bits][2]);
+      skip = 0;
+
+      // Second, look at the current position.
+      bits = get_bit(g_scalar->bytes, i + 192) << 3;
+      bits |= get_bit(g_scalar->bytes, i + 128) << 2;
+      bits |= get_bit(g_scalar->bytes, i + 64) << 1;
+      bits |= get_bit(g_scalar->bytes, i);
+      point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], 1 /* mixed */,
+                g_pre_comp[0][bits][0], g_pre_comp[0][bits][1],
+                g_pre_comp[0][bits][2]);
+    }
+
+    int digit = p_wNAF[i];
+    if (digit != 0) {
+      assert(digit & 1);
+      int idx = digit < 0 ? (-digit) >> 1 : digit >> 1;
+      fe *y = &p_pre_comp[idx][1], tmp;
+      if (digit < 0) {
+        fe_opp(tmp, p_pre_comp[idx][1]);
+        y = &tmp;
+      }
+      if (!skip) {
+        point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
+                  0 /* not mixed */, p_pre_comp[idx][0], *y, p_pre_comp[idx][2]);
+      } else {
+        fe_copy(ret[0], p_pre_comp[idx][0]);
+        fe_copy(ret[1], *y);
+        fe_copy(ret[2], p_pre_comp[idx][2]);
+        skip = 0;
+      }
+    }
+  }
+
+  if (!fe_to_BN(&r->X, ret[0]) ||
+      !fe_to_BN(&r->Y, ret[1]) ||
+      !fe_to_BN(&r->Z, ret[2])) {
+    OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
+    return 0;
+  }
+  return 1;
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
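The precomputation loop at the top of ec_GFp_nistp256_point_mul_public builds the odd multiples P, 3P, 5P, ..., 15P by repeatedly adding 2P. A toy integer analogue (integers standing in for points, + for point_add, *2 for point_double) checks the p_pre_comp[i] = (2*i+1) * |p| invariant claimed in the comment:

#include <assert.h>

int main(void) {
  // table[0] = P; table[i] = table[i-1] + 2P, mirroring the point_add loop.
  const int P = 7;          // Toy stand-in for the point |p|.
  const int P2 = 2 * P;     // point_double(p) in the real code.
  int table[1 << (4 - 1)];  // 1 << (P256_WSIZE_PUBLIC - 1) entries.
  table[0] = P;
  for (int i = 1; i < 8; i++) {
    table[i] = table[i - 1] + P2;
  }
  // The invariant from the comment: p_pre_comp[i] is (2*i+1) * |p|.
  for (int i = 0; i < 8; i++) {
    assert(table[i] == (2 * i + 1) * P);
  }
  return 0;
}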
@@ -1725,16 +1814,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) {
   out->point_get_affine_coordinates =
       ec_GFp_nistp256_point_get_affine_coordinates;
   out->mul = ec_GFp_nistp256_points_mul;
-  // The variable-time wNAF point multiplication uses fewer field operations than
-  // the constant-time implementation here, but the 64-bit field arithmetic in
-  // this file is much faster than the generic BIGNUM-based field arithmetic used
-  // by wNAF. For 32-bit, the wNAF code is overall ~60% faster on non-precomputed
-  // points, so we use it for public inputs.
-#if defined(BORINGSSL_NISTP256_64BIT)
-  out->mul_public = ec_GFp_nistp256_points_mul;
-#else
-  out->mul_public = ec_wNAF_mul;
-#endif
+  out->mul_public = ec_GFp_nistp256_point_mul_public;
   out->field_mul = ec_GFp_mont_field_mul;
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
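The fixed-base half of the main loop in ec_GFp_nistp256_point_mul_public gathers one bit from each of eight evenly spaced positions of |g_scalar| per iteration, four for each table. Assuming the table layout the offsets imply (g_pre_comp[0][bits] combining the multiples 2^(64k) * G selected by |bits|, and g_pre_comp[1] the same shifted up by 32 bits), a scaled-down toy model with 32-bit scalars and stride 8 instead of 64 shows why the two 4-bit lookups per iteration cover every bit of the scalar:

#include <assert.h>
#include <stdint.h>

// Returns bit |i| of |s|, a toy uint32_t version of get_bit in p256.c.
static uint32_t get_bit_toy(uint32_t s, int i) { return (s >> i) & 1u; }

int main(void) {
  // Build both comb tables over the toy "base point" G = 1: entry |bits| of
  // table 0 is \sum_k bits_k * 2^(8k); table 1 is the same shifted by 4.
  uint64_t table[2][16];
  for (uint32_t bits = 0; bits < 16; bits++) {
    uint64_t acc = 0;
    for (int k = 0; k < 4; k++) {
      if ((bits >> k) & 1u) {
        acc += (uint64_t)1 << (8 * k);
      }
    }
    table[0][bits] = acc;
    table[1][bits] = acc << 4;
  }

  const uint32_t scalar = 0xdeadbeef;
  uint64_t ret = 0;
  for (int i = 3; i >= 0; i--) {
    ret <<= 1;  // point_double in the real code.

    // First look 4 bits upwards (the offset comb)...
    uint32_t idx = get_bit_toy(scalar, i + 28) << 3;
    idx |= get_bit_toy(scalar, i + 20) << 2;
    idx |= get_bit_toy(scalar, i + 12) << 1;
    idx |= get_bit_toy(scalar, i + 4);
    ret += table[1][idx];  // point_add in the real code.

    // ...then at the current position.
    idx = get_bit_toy(scalar, i + 24) << 3;
    idx |= get_bit_toy(scalar, i + 16) << 2;
    idx |= get_bit_toy(scalar, i + 8) << 1;
    idx |= get_bit_toy(scalar, i);
    ret += table[0][idx];
  }
  // Every bit position 0..31 was consumed exactly once, so ret = scalar * G.
  assert(ret == (uint64_t)scalar);
  return 0;
}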