boringssl/crypto/fipsmodule/ec/internal.h

402 lines
17 KiB
C
Raw Normal View History

/* Originally written by Bodo Moeller for the OpenSSL project.
* ====================================================================
* Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/* ====================================================================
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
*
* Portions of the attached software ("Contribution") are developed by
* SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
*
* The Contribution is licensed pursuant to the OpenSSL open source
* license provided above.
*
* The elliptic curve binary polynomial software is originally written by
* Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
* Laboratories. */
#ifndef OPENSSL_HEADER_EC_INTERNAL_H
#define OPENSSL_HEADER_EC_INTERNAL_H
#include <openssl/base.h>
#include <openssl/bn.h>
#include <openssl/ex_data.h>
#include <openssl/thread.h>
#include <openssl/type_check.h>
#include "../bn/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
// Cap the size of all field elements and scalars, including custom curves, to
// 66 bytes, large enough to fit secp521r1 and brainpoolP512r1, which appear to
// be the largest fields anyone plausibly uses.
#define EC_MAX_SCALAR_BYTES 66
#define EC_MAX_SCALAR_WORDS ((66 + BN_BYTES - 1) / BN_BYTES)
OPENSSL_COMPILE_ASSERT(EC_MAX_SCALAR_WORDS <= BN_SMALL_MAX_WORDS,
bn_small_functions_applicable);
// An EC_SCALAR is an integer fully reduced modulo the order. Only the first
// |order->width| words are used. An |EC_SCALAR| is specific to an |EC_GROUP|
// and must not be mixed between groups.
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherented from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now. Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
typedef union {
// bytes is the representation of the scalar in little-endian order.
uint8_t bytes[EC_MAX_SCALAR_BYTES];
BN_ULONG words[EC_MAX_SCALAR_WORDS];
} EC_SCALAR;
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
// An EC_FELEM represents a field element. Only the first |field->width| words
// are used. An |EC_FELEM| is specific to an |EC_GROUP| and must not be mixed
// between groups. Additionally, the representation (whether or not elements are
// represented in Montgomery-form) may vary between |EC_METHOD|s.
typedef union {
// bytes is the representation of the field element in little-endian order.
uint8_t bytes[EC_MAX_SCALAR_BYTES];
BN_ULONG words[EC_MAX_SCALAR_WORDS];
} EC_FELEM;
struct ec_method_st {
int (*group_init)(EC_GROUP *);
void (*group_finish)(EC_GROUP *);
int (*group_set_curve)(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
const BIGNUM *b, BN_CTX *);
int (*point_get_affine_coordinates)(const EC_GROUP *, const EC_POINT *,
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
BIGNUM *x, BIGNUM *y);
// Computes |r = g_scalar*generator + p_scalar*p| if |g_scalar| and |p_scalar|
// are both non-null. Computes |r = g_scalar*generator| if |p_scalar| is null.
// Computes |r = p_scalar*p| if g_scalar is null. At least one of |g_scalar|
// and |p_scalar| must be non-null, and |p| must be non-null if |p_scalar| is
// non-null.
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherented from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now. Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
int (*mul)(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
ec/p256.c: fiat-crypto field arithmetic (64, 32) The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?) 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from varibale-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
// mul_public performs the same computation as mul. It further assumes that
// the inputs are public so there is no concern about leaking their values
// through timing.
int (*mul_public)(const EC_GROUP *group, EC_POINT *r,
const EC_SCALAR *g_scalar, const EC_POINT *p,
const EC_SCALAR *p_scalar, BN_CTX *ctx);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
// felem_mul and felem_sqr implement multiplication and squaring,
// respectively, so that the generic |EC_POINT_add| and |EC_POINT_dbl|
// implementations can work both with |EC_GFp_mont_method| and the tuned
// operations.
//
// TODO(davidben): This constrains |EC_FELEM|'s internal representation, adds
// many indirect calls in the middle of the generic code, and a bunch of
// conversions. If p224-64.c were easily convertable to Montgomery form, we
// could say |EC_FELEM| is always in Montgomery form. If we exposed the
// internal add and double implementations in each of the curves, we could
// give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is
// purely a |EC_GFp_mont_method| type.
void (*felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
const EC_FELEM *b);
void (*felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a);
int (*bignum_to_felem)(const EC_GROUP *group, EC_FELEM *out,
const BIGNUM *in);
int (*felem_to_bignum)(const EC_GROUP *group, BIGNUM *out,
const EC_FELEM *in);
// scalar_inv_mont sets |out| to |in|^-1, where both input and output are in
// Montgomery form.
void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
const EC_SCALAR *in);
} /* EC_METHOD */;
const EC_METHOD *EC_GFp_mont_method(void);
struct ec_group_st {
const EC_METHOD *meth;
// Unlike all other |EC_POINT|s, |generator| does not own |generator->group|
// to avoid a reference cycle.
EC_POINT *generator;
BIGNUM order;
int curve_name; // optional NID for named curve
BN_MONT_CTX *order_mont; // data for ECDSA inverse
// The following members are handled by the method functions,
// even if they appear generic
BIGNUM field; // For curves over GF(p), this is the modulus.
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
EC_FELEM a, b; // Curve coefficients.
int a_is_minus3; // enable optimized point arithmetics for special case
CRYPTO_refcount_t references;
BN_MONT_CTX *mont; // Montgomery structure.
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
EC_FELEM one; // The value one.
} /* EC_GROUP */;
struct ec_point_st {
// group is an owning reference to |group|, unless this is
// |group->generator|.
EC_GROUP *group;
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
// X, Y, and Z are Jacobian projective coordinates. They represent
// (X/Z^2, Y/Z^3) if Z != 0 and the point and infinite otherwise.
EC_FELEM X, Y, Z;
} /* EC_POINT */;
EC_GROUP *ec_group_new(const EC_METHOD *meth);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
// ec_bignum_to_felem converts |in| to an |EC_FELEM|. It returns one on success
// and zero if |in| is out of range.
int ec_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out, const BIGNUM *in);
// ec_felem_to_bignum converts |in| to a |BIGNUM|. It returns one on success and
// zero on allocation failure.
int ec_felem_to_bignum(const EC_GROUP *group, BIGNUM *out, const EC_FELEM *in);
// ec_felem_neg sets |out| to -|a|.
void ec_felem_neg(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a);
// ec_felem_add sets |out| to |a| + |b|.
void ec_felem_add(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a,
const EC_FELEM *b);
// ec_felem_add sets |out| to |a| - |b|.
void ec_felem_sub(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a,
const EC_FELEM *b);
// ec_felem_non_zero_mask returns all ones if |a| is non-zero and all zeros
// otherwise.
BN_ULONG ec_felem_non_zero_mask(const EC_GROUP *group, const EC_FELEM *a);
// ec_felem_select, in constant time, sets |out| to |a| if |mask| is all ones
// and |b| if |mask| is all zeros.
void ec_felem_select(const EC_GROUP *group, EC_FELEM *out, BN_ULONG mask,
const EC_FELEM *a, const EC_FELEM *b);
// ec_felem_equal returns one if |a| and |b| are equal and zero otherwise. It
// treats |a| and |b| as public and does *not* run in constant time.
int ec_felem_equal(const EC_GROUP *group, const EC_FELEM *a, const EC_FELEM *b);
// ec_bignum_to_scalar converts |in| to an |EC_SCALAR| and writes it to
// |*out|. It returns one on success and zero if |in| is out of range.
OPENSSL_EXPORT int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
const BIGNUM *in);
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherented from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now. Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
// ec_random_nonzero_scalar sets |out| to a uniformly selected random value from
// 1 to |group->order| - 1. It returns one on success and zero on error.
int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
const uint8_t additional_data[32]);
// ec_scalar_add sets |r| to |a| + |b|.
void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
const EC_SCALAR *b);
// ec_scalar_to_montgomery sets |r| to |a| in Montgomery form.
void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r,
const EC_SCALAR *a);
// ec_scalar_to_montgomery sets |r| to |a| converted from Montgomery form.
void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r,
const EC_SCALAR *a);
// ec_scalar_mul_montgomery sets |r| to |a| * |b| where inputs and outputs are
// in Montgomery form.
void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
const EC_SCALAR *a, const EC_SCALAR *b);
// ec_scalar_mul_montgomery sets |r| to |a|^-1 where inputs and outputs are in
// Montgomery form.
void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
const EC_SCALAR *a);
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherented from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now. Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
// ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| *
// |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and
// |p_scalar| need not be fully reduced. They need only contain as many bits as
// the order.
int ec_point_mul_scalar(const EC_GROUP *group, EC_POINT *r,
const EC_SCALAR *g_scalar, const EC_POINT *p,
const EC_SCALAR *p_scalar, BN_CTX *ctx);
ec/p256.c: fiat-crypto field arithmetic (64, 32) The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?) 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from varibale-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
// ec_point_mul_scalar_public performs the same computation as
// ec_point_mul_scalar. It further assumes that the inputs are public so
// there is no concern about leaking their values through timing.
OPENSSL_EXPORT int ec_point_mul_scalar_public(
const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
ec/p256.c: fiat-crypto field arithmetic (64, 32) The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?) 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from varibale-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
Add a tuned variable-time P-256 multiplication function. This reuses wnaf.c's window scheduling, but has access to the tuned field arithemetic and pre-computed base point table. Unlike wnaf.c, we do not make the points affine as it's not worth it for a single table. (We already precomputed the base point table.) Annoyingly, 32-bit x86 gets slower by a bit, but the other platforms are faster. My guess is that that the generic code gets to use the bn_mul_mont assembly and the compiler, faced with the increased 32-bit register pressure and the extremely register-poor x86, is making bad decisions on the otherwise P-256-tuned C code. The three platforms that see much larger gains are significantly more important than 32-bit x86 at this point, so go with this change. armv7a (Nexus 5X) before/after [+14.4%]: Did 2703 ECDSA P-256 verify operations in 5034539us (536.9 ops/sec) Did 3127 ECDSA P-256 verify operations in 5091379us (614.2 ops/sec) aarch64 (Nexus 5X) before/after [+9.2%]: Did 6783 ECDSA P-256 verify operations in 5031324us (1348.2 ops/sec) Did 7410 ECDSA P-256 verify operations in 5033291us (1472.2 ops/sec) x86 before/after [-2.7%]: Did 8961 ECDSA P-256 verify operations in 10075901us (889.3 ops/sec) Did 8568 ECDSA P-256 verify operations in 10003001us (856.5 ops/sec) x86_64 before/after [+8.6%]: Did 29808 ECDSA P-256 verify operations in 10008662us (2978.2 ops/sec) Did 32528 ECDSA P-256 verify operations in 10057137us (3234.3 ops/sec) Change-Id: I5fa643149f5bfbbda9533e3008baadfee9979b93 Reviewed-on: https://boringssl-review.googlesource.com/25684 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2018-02-02 23:24:09 +00:00
// ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
// |scalar| to |out| and returns one on success or zero on internal error. |out|
// must have room for |bits| + 1 elements, each of which will be either zero or
// odd with an absolute value less than 2^w satisfying
// scalar = \sum_j out[j]*2^j
// where at most one of any w+1 consecutive digits is non-zero
// with the exception that the most significant digit may be only
// w-1 zeros away from that next non-zero digit.
int ec_compute_wNAF(const EC_GROUP *group, int8_t *out, const EC_SCALAR *scalar,
size_t bits, int w);
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherented from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now. Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
// method functions in simple.c
int ec_GFp_simple_group_init(EC_GROUP *);
void ec_GFp_simple_group_finish(EC_GROUP *);
int ec_GFp_simple_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
const BIGNUM *b, BN_CTX *);
int ec_GFp_simple_group_get_curve(const EC_GROUP *, BIGNUM *p, BIGNUM *a,
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
BIGNUM *b);
unsigned ec_GFp_simple_group_get_degree(const EC_GROUP *);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
void ec_GFp_simple_point_init(EC_POINT *);
void ec_GFp_simple_point_copy(EC_POINT *, const EC_POINT *);
void ec_GFp_simple_point_set_to_infinity(const EC_GROUP *, EC_POINT *);
int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *, EC_POINT *,
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
const BIGNUM *x, const BIGNUM *y);
void ec_GFp_simple_add(const EC_GROUP *, EC_POINT *r, const EC_POINT *a,
const EC_POINT *b);
void ec_GFp_simple_dbl(const EC_GROUP *, EC_POINT *r, const EC_POINT *a);
void ec_GFp_simple_invert(const EC_GROUP *, EC_POINT *);
int ec_GFp_simple_is_at_infinity(const EC_GROUP *, const EC_POINT *);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
int ec_GFp_simple_is_on_curve(const EC_GROUP *, const EC_POINT *);
int ec_GFp_simple_cmp(const EC_GROUP *, const EC_POINT *a, const EC_POINT *b);
void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
const EC_SCALAR *a);
// method functions in montgomery.c
int ec_GFp_mont_group_init(EC_GROUP *);
int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
const BIGNUM *b, BN_CTX *);
void ec_GFp_mont_group_finish(EC_GROUP *);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
void ec_GFp_mont_felem_mul(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
const EC_FELEM *b);
void ec_GFp_mont_felem_sqr(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a);
int ec_GFp_mont_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out,
const BIGNUM *in);
int ec_GFp_mont_felem_to_bignum(const EC_GROUP *group, BIGNUM *out,
const EC_FELEM *in);
void ec_GFp_nistp_recode_scalar_bits(uint8_t *sign, uint8_t *digit, uint8_t in);
const EC_METHOD *EC_GFp_nistp224_method(void);
const EC_METHOD *EC_GFp_nistp256_method(void);
// EC_GFp_nistz256_method is a GFp method using montgomery multiplication, with
// x86-64 optimized P256. See http://eprint.iacr.org/2013/816.
const EC_METHOD *EC_GFp_nistz256_method(void);
// An EC_WRAPPED_SCALAR is an |EC_SCALAR| with a parallel |BIGNUM|
// representation. It exists to support the |EC_KEY_get0_private_key| API.
typedef struct {
BIGNUM bignum;
EC_SCALAR scalar;
} EC_WRAPPED_SCALAR;
struct ec_key_st {
EC_GROUP *group;
EC_POINT *pub_key;
EC_WRAPPED_SCALAR *priv_key;
// fixed_k may contain a specific value of 'k', to be used in ECDSA signing.
// This is only for the FIPS power-on tests.
BIGNUM *fixed_k;
unsigned int enc_flag;
point_conversion_form_t conv_form;
CRYPTO_refcount_t references;
ECDSA_METHOD *ecdsa_meth;
CRYPTO_EX_DATA ex_data;
} /* EC_KEY */;
struct built_in_curve {
int nid;
const uint8_t *oid;
uint8_t oid_len;
// comment is a human-readable string describing the curve.
const char *comment;
// param_len is the number of bytes needed to store a field element.
uint8_t param_len;
// params points to an array of 6*|param_len| bytes which hold the field
// elements of the following (in big-endian order): prime, a, b, generator x,
// generator y, order.
const uint8_t *params;
const EC_METHOD *method;
};
#define OPENSSL_NUM_BUILT_IN_CURVES 4
struct built_in_curves {
struct built_in_curve curves[OPENSSL_NUM_BUILT_IN_CURVES];
};
// OPENSSL_built_in_curves returns a pointer to static information about
// standard curves. The array is terminated with an entry where |nid| is
// |NID_undef|.
const struct built_in_curves *OPENSSL_built_in_curves(void);
#if defined(__cplusplus)
} // extern C
#endif
#endif // OPENSSL_HEADER_EC_INTERNAL_H