2014-06-20 20:00:00 +01:00
|
|
|
/* Originally written by Bodo Moeller for the OpenSSL project.
|
|
|
|
* ====================================================================
|
|
|
|
* Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in
|
|
|
|
* the documentation and/or other materials provided with the
|
|
|
|
* distribution.
|
|
|
|
*
|
|
|
|
* 3. All advertising materials mentioning features or use of this
|
|
|
|
* software must display the following acknowledgment:
|
|
|
|
* "This product includes software developed by the OpenSSL Project
|
|
|
|
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
|
|
|
|
*
|
|
|
|
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
|
|
|
|
* endorse or promote products derived from this software without
|
|
|
|
* prior written permission. For written permission, please contact
|
|
|
|
* openssl-core@openssl.org.
|
|
|
|
*
|
|
|
|
* 5. Products derived from this software may not be called "OpenSSL"
|
|
|
|
* nor may "OpenSSL" appear in their names without prior written
|
|
|
|
* permission of the OpenSSL Project.
|
|
|
|
*
|
|
|
|
* 6. Redistributions of any form whatsoever must retain the following
|
|
|
|
* acknowledgment:
|
|
|
|
* "This product includes software developed by the OpenSSL Project
|
|
|
|
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
|
|
|
|
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
|
|
|
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
|
|
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
|
|
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
|
|
|
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
* ====================================================================
|
|
|
|
*
|
|
|
|
* This product includes cryptographic software written by Eric Young
|
|
|
|
* (eay@cryptsoft.com). This product includes software written by Tim
|
|
|
|
* Hudson (tjh@cryptsoft.com).
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
/* ====================================================================
|
|
|
|
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
|
|
|
|
*
|
|
|
|
* Portions of the attached software ("Contribution") are developed by
|
|
|
|
* SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
|
|
|
|
*
|
|
|
|
* The Contribution is licensed pursuant to the OpenSSL open source
|
|
|
|
* license provided above.
|
|
|
|
*
|
|
|
|
* The elliptic curve binary polynomial software is originally written by
|
|
|
|
* Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
|
|
|
|
* Laboratories. */
|
|
|
|
|
|
|
|
#ifndef OPENSSL_HEADER_EC_INTERNAL_H
|
|
|
|
#define OPENSSL_HEADER_EC_INTERNAL_H
|
|
|
|
|
|
|
|
#include <openssl/base.h>
|
|
|
|
|
|
|
|
#include <openssl/bn.h>
|
|
|
|
#include <openssl/ex_data.h>
|
2015-05-15 20:49:30 +01:00
|
|
|
#include <openssl/thread.h>
|
2017-11-11 03:37:40 +00:00
|
|
|
#include <openssl/type_check.h>
|
|
|
|
|
|
|
|
#include "../bn/internal.h"
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2017-11-11 03:37:40 +00:00
|
|
|
// Cap the size of all field elements and scalars, including custom curves, to
|
|
|
|
// 66 bytes, large enough to fit secp521r1 and brainpoolP512r1, which appear to
|
|
|
|
// be the largest fields anyone plausibly uses.
|
2018-11-09 23:14:15 +00:00
|
|
|
#define EC_MAX_BYTES 66
|
|
|
|
// EC_MAX_WORDS is EC_MAX_BYTES rounded up to a whole number of
// BN_BYTES-sized words.
#define EC_MAX_WORDS ((EC_MAX_BYTES + BN_BYTES - 1) / BN_BYTES)
|
2017-11-11 03:37:40 +00:00
|
|
|
|
2018-11-09 23:14:15 +00:00
|
|
|
// Compile-time check that EC_MAX_WORDS does not exceed BN_SMALL_MAX_WORDS.
// NOTE(review): the assertion name suggests the bn_small_* helpers depend on
// this bound -- confirm in ../bn/internal.h.
OPENSSL_COMPILE_ASSERT(EC_MAX_WORDS <= BN_SMALL_MAX_WORDS,
|
2017-11-11 03:37:40 +00:00
|
|
|
bn_small_functions_applicable);
|
|
|
|
|
2017-11-30 21:05:36 +00:00
|
|
|
// An EC_SCALAR is an integer fully reduced modulo the order. Only the first
|
2018-01-15 10:23:24 +00:00
|
|
|
// |order->width| words are used. An |EC_SCALAR| is specific to an |EC_GROUP|
|
|
|
|
// and must not be mixed between groups.
|
Make ECDSA signing 10% faster and plug some timing leaks.
None of the asymmetric crypto we inherented from OpenSSL is
constant-time because of BIGNUM. BIGNUM chops leading zeros off the
front of everything, so we end up leaking information about the first
word, in theory. BIGNUM functions additionally tend to take the full
range of inputs and then call into BN_nnmod at various points.
All our secret values should be acted on in constant-time, but k in
ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an
attempt to mitigate the BIGNUM leaks, would add a couple copies of the
order.
This does not work at all. k is used to compute two values: k^-1 and kG.
The first operation when computing k^-1 is to call BN_nnmod if k is out
of range. The entry point to our tuned constant-time curve
implementations is to call BN_nnmod if the scalar has too many bits,
which this causes. The result is both corrections are immediately undone
but cause us to do more variable-time work in the meantime.
Replace all these computations around k with the word-based functions
added in the various preceding CLs. In doing so, replace the BN_mod_mul
calls (which internally call BN_nnmod) with Montgomery reduction. We can
avoid taking k^-1 out of Montgomery form, which combines nicely with
Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along
the way, we avoid some unnecessary mallocs.
BIGNUM still affects the private key itself, as well as the EC_POINTs.
But this should hopefully be much better now. Also it's 10% faster:
Before:
Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec)
Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec)
Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec)
After:
Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec)
Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec)
Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec)
Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1
Reviewed-on: https://boringssl-review.googlesource.com/23075
Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
|
|
|
typedef union {
|
|
|
|
// EC_SCALAR overlays a byte view and a word view of one fixed-size buffer.
//
// bytes is the representation of the scalar in little-endian order.
|
2018-11-09 23:14:15 +00:00
|
|
|
uint8_t bytes[EC_MAX_BYTES];
|
|
|
|
// words shares storage with |bytes| (both are members of this union) and
// exposes it as an array of EC_MAX_WORDS |BN_ULONG|s.
BN_ULONG words[EC_MAX_WORDS];
|
Make ECDSA signing 10% faster and plug some timing leaks.
None of the asymmetric crypto we inherented from OpenSSL is
constant-time because of BIGNUM. BIGNUM chops leading zeros off the
front of everything, so we end up leaking information about the first
word, in theory. BIGNUM functions additionally tend to take the full
range of inputs and then call into BN_nnmod at various points.
All our secret values should be acted on in constant-time, but k in
ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an
attempt to mitigate the BIGNUM leaks, would add a couple copies of the
order.
This does not work at all. k is used to compute two values: k^-1 and kG.
The first operation when computing k^-1 is to call BN_nnmod if k is out
of range. The entry point to our tuned constant-time curve
implementations is to call BN_nnmod if the scalar has too many bits,
which this causes. The result is both corrections are immediately undone
but cause us to do more variable-time work in the meantime.
Replace all these computations around k with the word-based functions
added in the various preceding CLs. In doing so, replace the BN_mod_mul
calls (which internally call BN_nnmod) with Montgomery reduction. We can
avoid taking k^-1 out of Montgomery form, which combines nicely with
Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along
the way, we avoid some unnecessary mallocs.
BIGNUM still affects the private key itself, as well as the EC_POINTs.
But this should hopefully be much better now. Also it's 10% faster:
Before:
Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec)
Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec)
Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec)
After:
Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec)
Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec)
Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec)
Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1
Reviewed-on: https://boringssl-review.googlesource.com/23075
Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
|
|
|
} EC_SCALAR;
|
|
|
|
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
// An EC_FELEM represents a field element. Only the first |field->width| words
|
|
|
|
// are used. An |EC_FELEM| is specific to an |EC_GROUP| and must not be mixed
|
|
|
|
// between groups. Additionally, the representation (whether or not elements are
|
|
|
|
// represented in Montgomery-form) may vary between |EC_METHOD|s.
|
|
|
|
typedef union {
|
|
|
|
// EC_FELEM overlays a byte view and a word view of one fixed-size buffer.
//
// bytes is the representation of the field element in little-endian order.
|
2018-11-09 23:14:15 +00:00
|
|
|
uint8_t bytes[EC_MAX_BYTES];
|
|
|
|
// words shares storage with |bytes| (both are members of this union) and
// exposes it as an array of EC_MAX_WORDS |BN_ULONG|s.
BN_ULONG words[EC_MAX_WORDS];
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
} EC_FELEM;
|
|
|
|
|
2018-04-25 03:53:07 +01:00
|
|
|
// An EC_RAW_POINT represents an elliptic curve point. Unlike |EC_POINT|, it is
|
|
|
|
// a plain struct which can be stack-allocated and needs no cleanup. It is
|
|
|
|
// specific to an |EC_GROUP| and must not be mixed between groups.
|
|
|
|
typedef struct {
|
|
|
|
EC_FELEM X, Y, Z;
|
|
|
|
// X, Y, and Z are Jacobian projective coordinates. They represent
|
|
|
|
// (X/Z^2, Y/Z^3) if Z != 0 and the point at infinity otherwise.
// Being |EC_FELEM|s, the coordinates use whatever field-element encoding the
// |EC_METHOD| of the group uses (it may or may not be Montgomery form).
|
|
|
|
} EC_RAW_POINT;
|
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
struct ec_method_st {
|
|
|
|
// An |EC_METHOD| is a table of function pointers for operations that take an
// |EC_GROUP|.
//
// NOTE(review): group_init, group_finish and group_set_curve carry no
// contract in this header. From the signatures, group_init and
// group_set_curve return int (presumably one on success and zero on error,
// matching the documented members below) and group_set_curve receives the
// curve parameters |p|, |a| and |b| plus a |BN_CTX| -- confirm against the
// implementations.
int (*group_init)(EC_GROUP *);
|
|
|
|
void (*group_finish)(EC_GROUP *);
|
|
|
|
int (*group_set_curve)(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
|
|
|
|
const BIGNUM *b, BN_CTX *);
|
2018-11-09 00:31:58 +00:00
|
|
|
|
|
|
|
// point_get_affine_coordinates sets |*x| and |*y| to the affine coordinates
|
|
|
|
// of |p|. Either |x| or |y| may be NULL to omit it. It returns one on success
|
|
|
|
// and zero if |p| is the point at infinity.
|
|
|
|
//
|
|
|
|
// Note: unlike |EC_FELEM|s used as intermediate values internal to the
|
|
|
|
// |EC_METHOD|, |*x| and |*y| are not encoded in Montgomery form.
|
|
|
|
int (*point_get_affine_coordinates)(const EC_GROUP *, const EC_RAW_POINT *p,
|
|
|
|
EC_FELEM *x, EC_FELEM *y);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2018-11-05 23:30:28 +00:00
|
|
|
// add sets |r| to |a| + |b|.
|
|
|
|
void (*add)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *a,
|
|
|
|
const EC_RAW_POINT *b);
|
|
|
|
// dbl sets |r| to |a| + |a|.
|
|
|
|
void (*dbl)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *a);
|
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// Computes |r = g_scalar*generator + p_scalar*p| if |g_scalar| and |p_scalar|
|
|
|
|
// are both non-null. Computes |r = g_scalar*generator| if |p_scalar| is null.
|
|
|
|
// Computes |r = p_scalar*p| if g_scalar is null. At least one of |g_scalar|
|
|
|
|
// and |p_scalar| must be non-null, and |p| must be non-null if |p_scalar| is
|
|
|
|
// non-null.
|
2018-04-25 03:53:07 +01:00
|
|
|
void (*mul)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_SCALAR *g_scalar,
|
|
|
|
const EC_RAW_POINT *p, const EC_SCALAR *p_scalar);
|
ec/p256.c: fiat-crypto field arithmetic (64, 32)
The fiat-crypto-generated code uses the Montgomery form implementation
strategy, for both 32-bit and 64-bit code.
64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?)
32-bit throughput has decreased significantly for ECDH (-40%). I am
attributing this to the change from varibale-time scalar multiplication
to constant-time scalar multiplication. Due to the same bottleneck,
ECDSA verification still uses the old code (otherwise there would have
been a 60% throughput decrease). On the other hand, ECDSA signing
throughput has increased slightly (+10%), perhaps due to the use of a
precomputed table of multiples of the base point.
64-bit benchmarks (Google Cloud Haswell):
with this change:
Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec)
Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec)
Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec)
master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8):
Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec)
Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec)
Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec)
benchmarks on ARMv7 (LG Nexus 4):
with this change:
Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec)
Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec)
Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec)
master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4):
Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec)
Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec)
Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec)
64-bit tables converted as follows:
import re, sys, math
p = 2**256 - 2**224 + 2**192 + 2**96 - 1
R = 2**256
def convert(t):
x0, s1, x1, s2, x2, s3, x3 = t.groups()
v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) ))
w = v*R%p
y0 = hex(w%(2**64))
y1 = hex((w>>64)%(2**64))
y2 = hex((w>>(2*64))%(2**64))
y3 = hex((w>>(3*64))%(2**64))
ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) ))
if ww != v*R%p:
print(x0,x1,x2,x3)
print(hex(v))
print(y0,y1,y2,y3)
print(hex(w))
print(hex(ww))
assert 0
return '{'+y0+s1+y1+s2+y2+s3+y3+'}'
fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}')
print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n'))
32-bit tables converted from 64-bit tables
Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799
Reviewed-on: https://boringssl-review.googlesource.com/23244
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
|
|
|
// mul_public performs the same computation as mul. It further assumes that
|
|
|
|
// the inputs are public so there is no concern about leaking their values
|
|
|
|
// through timing.
|
2018-04-25 03:53:07 +01:00
|
|
|
void (*mul_public)(const EC_GROUP *group, EC_RAW_POINT *r,
|
|
|
|
const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *p_scalar);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
// felem_mul and felem_sqr implement multiplication and squaring,
|
|
|
|
// respectively, so that the generic |EC_POINT_add| and |EC_POINT_dbl|
|
|
|
|
// implementations can work both with |EC_GFp_mont_method| and the tuned
|
|
|
|
// operations.
|
|
|
|
//
|
|
|
|
// TODO(davidben): This constrains |EC_FELEM|'s internal representation, adds
|
|
|
|
// many indirect calls in the middle of the generic code, and a bunch of
|
|
|
|
// conversions. If p224-64.c were easily convertible to Montgomery form, we
|
2018-11-05 23:30:28 +00:00
|
|
|
// could say |EC_FELEM| is always in Montgomery form. If we routed the rest of
|
|
|
|
// simple.c to |EC_METHOD|, we could give |EC_POINT| an |EC_METHOD|-specific
|
|
|
|
// representation and say |EC_FELEM| is purely a |EC_GFp_mont_method| type.
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
void (*felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
|
|
|
|
const EC_FELEM *b);
|
|
|
|
void (*felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a);
|
|
|
|
|
|
|
|
// NOTE(review): bignum_to_felem and felem_to_bignum are undocumented here.
// By name and signature they convert |in| to |out| between |BIGNUM| and
// |EC_FELEM|; the int return is presumably one on success and zero on
// error -- confirm against the implementations.
int (*bignum_to_felem)(const EC_GROUP *group, EC_FELEM *out,
|
|
|
|
const BIGNUM *in);
|
|
|
|
int (*felem_to_bignum)(const EC_GROUP *group, BIGNUM *out,
|
|
|
|
const EC_FELEM *in);
|
2018-04-07 00:43:29 +01:00
|
|
|
|
2018-11-06 23:18:56 +00:00
|
|
|
// scalar_inv_montgomery sets |out| to |in|^-1, where both input and output
|
|
|
|
// are in Montgomery form.
|
2018-04-07 00:43:29 +01:00
|
|
|
void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
|
|
|
|
const EC_SCALAR *in);
|
|
|
|
|
2018-11-06 23:18:56 +00:00
|
|
|
// scalar_inv_montgomery_vartime performs the same computation as
|
|
|
|
// |scalar_inv_montgomery|. It further assumes that the inputs are public so
|
|
|
|
// there is no concern about leaking their values through timing.
|
|
|
|
// NOTE(review): unlike |scalar_inv_montgomery|, this member returns int; the
// meaning of the return value is not stated here -- presumably one on
// success and zero on failure, confirm against the implementations.
int (*scalar_inv_montgomery_vartime)(const EC_GROUP *group, EC_SCALAR *out,
|
|
|
|
const EC_SCALAR *in);
|
|
|
|
|
|
|
|
// cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group
|
2018-11-09 01:07:42 +00:00
|
|
|
// order, with |r|. It returns one if the values match and zero if |p| is the
|
|
|
|
// point at infinity or the values do not match.
|
|
|
|
int (*cmp_x_coordinate)(const EC_GROUP *group, const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *r);
|
2014-06-20 20:00:00 +01:00
|
|
|
} /* EC_METHOD */;
|
|
|
|
|
2017-05-02 22:25:39 +01:00
|
|
|
// EC_GFp_mont_method returns a pointer to a constant |EC_METHOD|.
// NOTE(review): presumably the generic prime-field implementation that keeps
// field elements in Montgomery form -- confirm against its definition.
const EC_METHOD *EC_GFp_mont_method(void);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
struct ec_group_st {
|
|
|
|
const EC_METHOD *meth;
|
|
|
|
|
2017-10-26 20:48:18 +01:00
|
|
|
// Unlike all other |EC_POINT|s, |generator| does not own |generator->group|
|
|
|
|
// to avoid a reference cycle.
|
2016-02-06 00:12:04 +00:00
|
|
|
EC_POINT *generator;
|
2016-06-17 01:40:18 +01:00
|
|
|
BIGNUM order;
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
int curve_name; // optional NID for named curve
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-11-13 06:55:25 +00:00
|
|
|
BN_MONT_CTX *order_mont; // data for ECDSA inverse
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// The following members are handled by the method functions,
|
|
|
|
// even if they appear generic
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
BIGNUM field; // For curves over GF(p), this is the modulus.
|
2015-04-08 22:11:16 +01:00
|
|
|
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
EC_FELEM a, b; // Curve coefficients.
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2018-11-09 22:46:55 +00:00
|
|
|
// a_is_minus3 is one if |a| is -3 mod |field| and zero otherwise. Point
|
|
|
|
// arithmetic is optimized for -3.
|
|
|
|
int a_is_minus3;
|
|
|
|
|
|
|
|
// field_greater_than_order is one if |field| is greater than |order| and zero
|
|
|
|
// otherwise.
|
|
|
|
int field_greater_than_order;
|
|
|
|
|
|
|
|
// field_minus_order, if |field_greater_than_order| is true, is |field| minus
|
|
|
|
// |order| represented as an |EC_FELEM|. Otherwise, it is zero.
|
|
|
|
//
|
|
|
|
// Note: unlike |EC_FELEM|s used as intermediate values internal to the
|
|
|
|
// |EC_METHOD|, this value is not encoded in Montgomery form.
|
|
|
|
EC_FELEM field_minus_order;
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-10-26 19:53:29 +01:00
|
|
|
CRYPTO_refcount_t references;
|
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
BN_MONT_CTX *mont; // Montgomery structure.
|
2015-11-26 00:19:21 +00:00
|
|
|
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
EC_FELEM one; // The value one.
|
2014-06-20 20:00:00 +01:00
|
|
|
} /* EC_GROUP */;
|
|
|
|
|
|
|
|
struct ec_point_st {
|
2017-10-26 20:48:18 +01:00
|
|
|
// group is an owning reference to |group|, unless this is
|
|
|
|
// |group->generator|.
|
|
|
|
EC_GROUP *group;
|
2018-11-09 23:06:51 +00:00
|
|
|
// raw is the group-specific point data. Functions that take |EC_POINT|
|
|
|
|
// typically check consistency with |EC_GROUP| while functions that take
|
|
|
|
// |EC_RAW_POINT| do not. Thus accesses to this field should be externally
|
|
|
|
// checked for consistency.
|
2018-04-25 03:53:07 +01:00
|
|
|
EC_RAW_POINT raw;
|
2014-06-20 20:00:00 +01:00
|
|
|
} /* EC_POINT */;
|
|
|
|
|
|
|
|
EC_GROUP *ec_group_new(const EC_METHOD *meth);
|
|
|
|
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
genericly.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertable to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchamrks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
// ec_bignum_to_felem converts |in| to an |EC_FELEM|. It returns one on success
|
|
|
|
// and zero if |in| is out of range.
|
|
|
|
int ec_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out, const BIGNUM *in);
|
|
|
|
|
|
|
|
// ec_felem_to_bignum converts |in| to a |BIGNUM|. It returns one on success and
|
|
|
|
// zero on allocation failure.
|
|
|
|
int ec_felem_to_bignum(const EC_GROUP *group, BIGNUM *out, const EC_FELEM *in);
|
|
|
|
|
|
|
|
// ec_felem_neg sets |out| to -|a|.
|
|
|
|
void ec_felem_neg(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a);
|
|
|
|
|
|
|
|
// ec_felem_add sets |out| to |a| + |b|.
|
|
|
|
void ec_felem_add(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a,
|
|
|
|
const EC_FELEM *b);
|
|
|
|
|
|
|
|
// ec_felem_sub sets |out| to |a| - |b|.
|
|
|
|
void ec_felem_sub(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a,
|
|
|
|
const EC_FELEM *b);
|
|
|
|
|
|
|
|
// ec_felem_non_zero_mask returns all ones if |a| is non-zero and all zeros
|
|
|
|
// otherwise.
|
|
|
|
BN_ULONG ec_felem_non_zero_mask(const EC_GROUP *group, const EC_FELEM *a);
|
|
|
|
|
|
|
|
// ec_felem_select, in constant time, sets |out| to |a| if |mask| is all ones
|
|
|
|
// and |b| if |mask| is all zeros.
|
|
|
|
void ec_felem_select(const EC_GROUP *group, EC_FELEM *out, BN_ULONG mask,
|
|
|
|
const EC_FELEM *a, const EC_FELEM *b);
|
|
|
|
|
|
|
|
// ec_felem_equal returns one if |a| and |b| are equal and zero otherwise. It
|
|
|
|
// treats |a| and |b| as public and does *not* run in constant time.
|
|
|
|
int ec_felem_equal(const EC_GROUP *group, const EC_FELEM *a, const EC_FELEM *b);
|
|
|
|
|
2017-11-30 21:05:36 +00:00
|
|
|
// ec_bignum_to_scalar converts |in| to an |EC_SCALAR| and writes it to
|
|
|
|
// |*out|. It returns one on success and zero if |in| is out of range.
|
2018-03-23 18:00:44 +00:00
|
|
|
OPENSSL_EXPORT int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
|
|
|
|
const BIGNUM *in);
|
Make ECDSA signing 10% faster and plug some timing leaks.
None of the asymmetric crypto we inherented from OpenSSL is
constant-time because of BIGNUM. BIGNUM chops leading zeros off the
front of everything, so we end up leaking information about the first
word, in theory. BIGNUM functions additionally tend to take the full
range of inputs and then call into BN_nnmod at various points.
All our secret values should be acted on in constant-time, but k in
ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an
attempt to mitigate the BIGNUM leaks, would add a couple copies of the
order.
This does not work at all. k is used to compute two values: k^-1 and kG.
The first operation when computing k^-1 is to call BN_nnmod if k is out
of range. The entry point to our tuned constant-time curve
implementations is to call BN_nnmod if the scalar has too many bits,
which this causes. The result is both corrections are immediately undone
but cause us to do more variable-time work in the meantime.
Replace all these computations around k with the word-based functions
added in the various preceding CLs. In doing so, replace the BN_mod_mul
calls (which internally call BN_nnmod) with Montgomery reduction. We can
avoid taking k^-1 out of Montgomery form, which combines nicely with
Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along
the way, we avoid some unnecessary mallocs.
BIGNUM still affects the private key itself, as well as the EC_POINTs.
But this should hopefully be much better now. Also it's 10% faster:
Before:
Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec)
Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec)
Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec)
After:
Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec)
Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec)
Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec)
Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1
Reviewed-on: https://boringssl-review.googlesource.com/23075
Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
|
|
|
|
|
|
|
// ec_random_nonzero_scalar sets |out| to a uniformly selected random value from
|
|
|
|
// 1 to |group->order| - 1. It returns one on success and zero on error.
|
|
|
|
int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
|
|
|
|
const uint8_t additional_data[32]);
|
|
|
|
|
2018-11-09 01:07:42 +00:00
|
|
|
// ec_scalar_equal_vartime returns one if |a| and |b| are equal and zero
|
|
|
|
// otherwise. Both values are treated as public.
|
|
|
|
int ec_scalar_equal_vartime(const EC_GROUP *group, const EC_SCALAR *a,
|
|
|
|
const EC_SCALAR *b);
|
|
|
|
|
|
|
|
// ec_scalar_is_zero returns one if |a| is zero and zero otherwise.
|
|
|
|
int ec_scalar_is_zero(const EC_GROUP *group, const EC_SCALAR *a);
|
|
|
|
|
2018-04-05 04:36:15 +01:00
|
|
|
// ec_scalar_add sets |r| to |a| + |b|.
|
|
|
|
void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
|
|
|
|
const EC_SCALAR *b);
|
|
|
|
|
|
|
|
// ec_scalar_to_montgomery sets |r| to |a| in Montgomery form.
|
|
|
|
void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a);
|
|
|
|
|
|
|
|
// ec_scalar_from_montgomery sets |r| to |a| converted from Montgomery form.
|
|
|
|
void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a);
|
|
|
|
|
|
|
|
// ec_scalar_mul_montgomery sets |r| to |a| * |b| where inputs and outputs are
|
|
|
|
// in Montgomery form.
|
|
|
|
void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a, const EC_SCALAR *b);
|
|
|
|
|
|
|
|
// ec_scalar_inv_montgomery sets |r| to |a|^-1 where inputs and outputs are in
|
|
|
|
// Montgomery form.
|
|
|
|
void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a);
|
|
|
|
|
2018-11-06 23:18:56 +00:00
|
|
|
// ec_scalar_inv_montgomery_vartime performs the same actions as
|
|
|
|
// |ec_scalar_inv_montgomery|, but in variable time.
|
|
|
|
int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a);
|
|
|
|
|
Make ECDSA signing 10% faster and plug some timing leaks.
None of the asymmetric crypto we inherented from OpenSSL is
constant-time because of BIGNUM. BIGNUM chops leading zeros off the
front of everything, so we end up leaking information about the first
word, in theory. BIGNUM functions additionally tend to take the full
range of inputs and then call into BN_nnmod at various points.
All our secret values should be acted on in constant-time, but k in
ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an
attempt to mitigate the BIGNUM leaks, would add a couple copies of the
order.
This does not work at all. k is used to compute two values: k^-1 and kG.
The first operation when computing k^-1 is to call BN_nnmod if k is out
of range. The entry point to our tuned constant-time curve
implementations is to call BN_nnmod if the scalar has too many bits,
which this causes. The result is both corrections are immediately undone
but cause us to do more variable-time work in the meantime.
Replace all these computations around k with the word-based functions
added in the various preceding CLs. In doing so, replace the BN_mod_mul
calls (which internally call BN_nnmod) with Montgomery reduction. We can
avoid taking k^-1 out of Montgomery form, which combines nicely with
Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along
the way, we avoid some unnecessary mallocs.
BIGNUM still affects the private key itself, as well as the EC_POINTs.
But this should hopefully be much better now. Also it's 10% faster:
Before:
Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec)
Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec)
Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec)
After:
Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec)
Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec)
Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec)
Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1
Reviewed-on: https://boringssl-review.googlesource.com/23075
Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
|
|
|
// ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| *
|
|
|
|
// |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and
|
|
|
|
// |p_scalar| need not be fully reduced. They need only contain as many bits as
|
|
|
|
// the order.
|
2018-11-09 23:06:51 +00:00
|
|
|
int ec_point_mul_scalar(const EC_GROUP *group, EC_RAW_POINT *r,
|
|
|
|
const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
|
2018-11-09 18:24:18 +00:00
|
|
|
const EC_SCALAR *p_scalar);
|
Make ECDSA signing 10% faster and plug some timing leaks.
None of the asymmetric crypto we inherented from OpenSSL is
constant-time because of BIGNUM. BIGNUM chops leading zeros off the
front of everything, so we end up leaking information about the first
word, in theory. BIGNUM functions additionally tend to take the full
range of inputs and then call into BN_nnmod at various points.
All our secret values should be acted on in constant-time, but k in
ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an
attempt to mitigate the BIGNUM leaks, would add a couple copies of the
order.
This does not work at all. k is used to compute two values: k^-1 and kG.
The first operation when computing k^-1 is to call BN_nnmod if k is out
of range. The entry point to our tuned constant-time curve
implementations is to call BN_nnmod if the scalar has too many bits,
which this causes. The result is both corrections are immediately undone
but cause us to do more variable-time work in the meantime.
Replace all these computations around k with the word-based functions
added in the various preceding CLs. In doing so, replace the BN_mod_mul
calls (which internally call BN_nnmod) with Montgomery reduction. We can
avoid taking k^-1 out of Montgomery form, which combines nicely with
Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along
the way, we avoid some unnecessary mallocs.
BIGNUM still affects the private key itself, as well as the EC_POINTs.
But this should hopefully be much better now. Also it's 10% faster:
Before:
Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec)
Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec)
Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec)
After:
Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec)
Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec)
Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec)
Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec)
Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1
Reviewed-on: https://boringssl-review.googlesource.com/23075
Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
|
|
|
|
ec/p256.c: fiat-crypto field arithmetic (64, 32)
The fiat-crypto-generated code uses the Montgomery form implementation
strategy, for both 32-bit and 64-bit code.
64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?)
32-bit throughput has decreased significantly for ECDH (-40%). I am
attributing this to the change from varibale-time scalar multiplication
to constant-time scalar multiplication. Due to the same bottleneck,
ECDSA verification still uses the old code (otherwise there would have
been a 60% throughput decrease). On the other hand, ECDSA signing
throughput has increased slightly (+10%), perhaps due to the use of a
precomputed table of multiples of the base point.
64-bit benchmarks (Google Cloud Haswell):
with this change:
Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec)
Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec)
Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec)
master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8):
Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec)
Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec)
Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec)
benchmarks on ARMv7 (LG Nexus 4):
with this change:
Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec)
Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec)
Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec)
master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4):
Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec)
Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec)
Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec)
64-bit tables converted as follows:
import re, sys, math
p = 2**256 - 2**224 + 2**192 + 2**96 - 1
R = 2**256
def convert(t):
x0, s1, x1, s2, x2, s3, x3 = t.groups()
v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) ))
w = v*R%p
y0 = hex(w%(2**64))
y1 = hex((w>>64)%(2**64))
y2 = hex((w>>(2*64))%(2**64))
y3 = hex((w>>(3*64))%(2**64))
ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) ))
if ww != v*R%p:
print(x0,x1,x2,x3)
print(hex(v))
print(y0,y1,y2,y3)
print(hex(w))
print(hex(ww))
assert 0
return '{'+y0+s1+y1+s2+y2+s3+y3+'}'
fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}')
print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n'))
32-bit tables converted from 64-bit tables
Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799
Reviewed-on: https://boringssl-review.googlesource.com/23244
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
|
|
|
// ec_point_mul_scalar_public performs the same computation as
|
|
|
|
// ec_point_mul_scalar. It further assumes that the inputs are public so
|
|
|
|
// there is no concern about leaking their values through timing.
|
2018-11-09 23:06:51 +00:00
|
|
|
OPENSSL_EXPORT int ec_point_mul_scalar_public(const EC_GROUP *group,
|
|
|
|
EC_RAW_POINT *r,
|
|
|
|
const EC_SCALAR *g_scalar,
|
|
|
|
const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *p_scalar);
|
ec/p256.c: fiat-crypto field arithmetic (64, 32)
The fiat-crypto-generated code uses the Montgomery form implementation
strategy, for both 32-bit and 64-bit code.
64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?)
32-bit throughput has decreased significantly for ECDH (-40%). I am
attributing this to the change from varibale-time scalar multiplication
to constant-time scalar multiplication. Due to the same bottleneck,
ECDSA verification still uses the old code (otherwise there would have
been a 60% throughput decrease). On the other hand, ECDSA signing
throughput has increased slightly (+10%), perhaps due to the use of a
precomputed table of multiples of the base point.
64-bit benchmarks (Google Cloud Haswell):
with this change:
Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec)
Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec)
Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec)
master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8):
Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec)
Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec)
Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec)
benchmarks on ARMv7 (LG Nexus 4):
with this change:
Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec)
Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec)
Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec)
master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4):
Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec)
Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec)
Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec)
64-bit tables converted as follows:
import re, sys, math
p = 2**256 - 2**224 + 2**192 + 2**96 - 1
R = 2**256
def convert(t):
x0, s1, x1, s2, x2, s3, x3 = t.groups()
v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) ))
w = v*R%p
y0 = hex(w%(2**64))
y1 = hex((w>>64)%(2**64))
y2 = hex((w>>(2*64))%(2**64))
y3 = hex((w>>(3*64))%(2**64))
ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) ))
if ww != v*R%p:
print(x0,x1,x2,x3)
print(hex(v))
print(y0,y1,y2,y3)
print(hex(w))
print(hex(ww))
assert 0
return '{'+y0+s1+y1+s2+y2+s3+y3+'}'
fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}')
print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n'))
32-bit tables converted from 64-bit tables
Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799
Reviewed-on: https://boringssl-review.googlesource.com/23244
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
|
|
|
|
2018-11-09 01:07:42 +00:00
|
|
|
// ec_cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group
|
|
|
|
// order, with |r|. It returns one if the values match and zero if |p| is the
|
|
|
|
// point at infinity or the values do not match.
|
|
|
|
int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *r);
|
|
|
|
|
|
|
|
// ec_get_x_coordinate_as_scalar sets |*out| to |p|'s x-coordinate, modulo
|
|
|
|
// |group->order|. It returns one on success and zero if |p| is the point at
|
|
|
|
// infinity.
|
|
|
|
int ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out,
|
|
|
|
const EC_RAW_POINT *p);
|
2018-11-06 23:18:56 +00:00
|
|
|
|
2018-11-09 23:36:12 +00:00
|
|
|
// ec_point_get_affine_coordinate_bytes writes |p|'s affine coordinates to
|
|
|
|
// |out_x| and |out_y|, each of which must have at least |max_out| bytes. It sets
|
|
|
|
// |*out_len| to the number of bytes written in each buffer. Coordinates are
|
|
|
|
// written big-endian and zero-padded to the size of the field.
|
|
|
|
//
|
|
|
|
// Either of |out_x| or |out_y| may be NULL to omit that coordinate. This
|
|
|
|
// function returns one on success and zero on failure.
|
|
|
|
int ec_point_get_affine_coordinate_bytes(const EC_GROUP *group, uint8_t *out_x,
|
|
|
|
uint8_t *out_y, size_t *out_len,
|
|
|
|
size_t max_out, const EC_RAW_POINT *p);
|
|
|
|
|
2018-11-06 23:18:56 +00:00
|
|
|
// ec_field_element_to_scalar reduces |r| modulo |group->order|. |r| must
|
|
|
|
// previously have been reduced modulo |group->field|.
|
|
|
|
int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r);
|
|
|
|
|
2018-11-05 23:37:29 +00:00
|
|
|
void ec_GFp_mont_mul(const EC_GROUP *group, EC_RAW_POINT *r,
|
|
|
|
const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *p_scalar);
|
2018-04-25 04:40:01 +01:00
|
|
|
|
2018-02-02 23:24:09 +00:00
|
|
|
// ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
|
2018-04-25 03:06:54 +01:00
|
|
|
// |scalar| to |out|. |out| must have room for |bits| + 1 elements, each of
|
|
|
|
// which will be either zero or odd with an absolute value less than 2^w
|
|
|
|
// satisfying
|
2018-02-02 23:24:09 +00:00
|
|
|
// scalar = \sum_j out[j]*2^j
|
|
|
|
// where at most one of any w+1 consecutive digits is non-zero
|
|
|
|
// with the exception that the most significant digit may be only
|
|
|
|
// w-1 zeros away from that next non-zero digit.
|
2018-04-25 03:06:54 +01:00
|
|
|
void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
|
|
|
|
const EC_SCALAR *scalar, size_t bits, int w);
|
2018-02-02 23:24:09 +00:00
|
|
|
|
2018-11-05 23:37:29 +00:00
|
|
|
void ec_GFp_mont_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
|
|
|
|
const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *p_scalar);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// method functions in simple.c
|
2014-06-20 20:00:00 +01:00
|
|
|
int ec_GFp_simple_group_init(EC_GROUP *);
|
|
|
|
void ec_GFp_simple_group_finish(EC_GROUP *);
|
|
|
|
int ec_GFp_simple_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
|
|
|
|
const BIGNUM *b, BN_CTX *);
|
|
|
|
int ec_GFp_simple_group_get_curve(const EC_GROUP *, BIGNUM *p, BIGNUM *a,
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
generically.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertible to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchmarks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
BIGNUM *b);
|
2018-04-25 03:53:07 +01:00
|
|
|
void ec_GFp_simple_point_init(EC_RAW_POINT *);
|
|
|
|
void ec_GFp_simple_point_copy(EC_RAW_POINT *, const EC_RAW_POINT *);
|
|
|
|
void ec_GFp_simple_point_set_to_infinity(const EC_GROUP *, EC_RAW_POINT *);
|
|
|
|
int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *, EC_RAW_POINT *,
|
|
|
|
const BIGNUM *x,
|
|
|
|
const BIGNUM *y);
|
2018-11-05 23:37:29 +00:00
|
|
|
void ec_GFp_mont_add(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a,
|
|
|
|
const EC_RAW_POINT *b);
|
|
|
|
void ec_GFp_mont_dbl(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a);
|
2018-04-25 03:53:07 +01:00
|
|
|
void ec_GFp_simple_invert(const EC_GROUP *, EC_RAW_POINT *);
|
|
|
|
int ec_GFp_simple_is_at_infinity(const EC_GROUP *, const EC_RAW_POINT *);
|
|
|
|
int ec_GFp_simple_is_on_curve(const EC_GROUP *, const EC_RAW_POINT *);
|
|
|
|
int ec_GFp_simple_cmp(const EC_GROUP *, const EC_RAW_POINT *a,
|
|
|
|
const EC_RAW_POINT *b);
|
2018-04-07 00:43:29 +01:00
|
|
|
void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2018-11-06 23:18:56 +00:00
|
|
|
int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group, EC_SCALAR *r,
|
|
|
|
const EC_SCALAR *a);
|
|
|
|
|
2018-11-09 01:07:42 +00:00
|
|
|
int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p,
|
|
|
|
const EC_SCALAR *r);
|
2018-11-06 23:18:56 +00:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// method functions in montgomery.c
|
2014-06-20 20:00:00 +01:00
|
|
|
int ec_GFp_mont_group_init(EC_GROUP *);
|
|
|
|
int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
|
|
|
|
const BIGNUM *b, BN_CTX *);
|
|
|
|
void ec_GFp_mont_group_finish(EC_GROUP *);
|
Add EC_FELEM for EC_POINTs and related temporaries.
This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used
for EC_POINT's representation in the generic EC_METHOD, as well as
random operations on tuned EC_METHODs that still are implemented
generically.
Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific
to the EC_METHOD, analogous to how the old values were BIGNUMs but may
or may not have been in Montgomery form. This is kind of a nuisance, but
no more than before. (If p224-64.c were easily convertible to Montgomery
form, we could say |EC_FELEM| is always in Montgomery form. If we
exposed the internal add and double implementations in each of the
curves, we could give |EC_POINT| an |EC_METHOD|-specific representation
and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this
for later.)
The generic add and doubling formulas are aligned with the formulas
proved in fiat-crypto. Those only applied to a = -3, so I've proved a
generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case
someone uses a custom curve. The new formulas are verified,
constant-time, and swap a multiply for a square. As expressed in
fiat-crypto they do use more temporaries, but this seems to be fine with
stack-allocated EC_FELEMs. (We can try to help the compiler later,
but benchmarks below suggest this isn't necessary.)
Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the
bounds in the type system and, in particular, that the width is correct,
which will make it easier to select a point in constant-time in the
future. (Indeed the old code did not always have the correct width. Its
point formula involved halving and implemented this in variable time and
variable width.)
Before:
Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec)
Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec)
Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec)
Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec)
Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec)
Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec)
Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec)
After:
Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%]
Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%]
Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%]
Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%]
Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%]
Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%]
Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%]
This more than pays for removing points_make_affine previously and even
speeds up ECDH P-256 slightly. (The point-on-curve check uses the
generic code.)
Next is to push the stack-allocating up to ec_wNAF_mul, followed by a
constant-time single-point multiplication.
Bug: 239
Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c
Reviewed-on: https://boringssl-review.googlesource.com/27668
Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
|
|
|
void ec_GFp_mont_felem_mul(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
|
|
|
|
const EC_FELEM *b);
|
|
|
|
void ec_GFp_mont_felem_sqr(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a);
|
|
|
|
|
|
|
|
int ec_GFp_mont_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out,
|
|
|
|
const BIGNUM *in);
|
|
|
|
int ec_GFp_mont_felem_to_bignum(const EC_GROUP *group, BIGNUM *out,
|
|
|
|
const EC_FELEM *in);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2015-04-14 20:07:44 +01:00
|
|
|
void ec_GFp_nistp_recode_scalar_bits(uint8_t *sign, uint8_t *digit, uint8_t in);
|
|
|
|
|
2017-05-02 22:25:39 +01:00
|
|
|
const EC_METHOD *EC_GFp_nistp224_method(void);
|
|
|
|
const EC_METHOD *EC_GFp_nistp256_method(void);
|
2015-04-14 20:07:44 +01:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// EC_GFp_nistz256_method is a GFp method using Montgomery multiplication, with
|
|
|
|
// x86-64 optimized P256. See http://eprint.iacr.org/2013/816.
|
2017-05-02 22:25:39 +01:00
|
|
|
const EC_METHOD *EC_GFp_nistz256_method(void);
|
2015-11-03 22:02:04 +00:00
|
|
|
|
2018-03-07 01:21:28 +00:00
|
|
|
// An EC_WRAPPED_SCALAR is an |EC_SCALAR| with a parallel |BIGNUM|
|
|
|
|
// representation. It exists to support the |EC_KEY_get0_private_key| API.
|
|
|
|
typedef struct {
|
|
|
|
// bignum is the |BIGNUM| form of the value, kept alongside |scalar|.
BIGNUM bignum;
|
|
|
|
// scalar is the |EC_SCALAR| form of the same value.
EC_SCALAR scalar;
|
|
|
|
} EC_WRAPPED_SCALAR;
|
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
// ec_key_st is the structure tagged as EC_KEY (see the closing comment). It
// bundles a group, a public point, a private scalar and bookkeeping fields.
struct ec_key_st {
|
|
|
|
// group is the |EC_GROUP| for this key (presumably the curve the key is
// defined over; confirm against the EC_KEY implementation).
EC_GROUP *group;
|
|
|
|
|
|
|
|
// pub_key is the key's |EC_POINT| (presumably the public key; confirm).
EC_POINT *pub_key;
|
2018-03-07 01:21:28 +00:00
|
|
|
// priv_key is an |EC_WRAPPED_SCALAR|, i.e. an |EC_SCALAR| with a parallel
// |BIGNUM| representation (presumably the private key; confirm).
EC_WRAPPED_SCALAR *priv_key;
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// fixed_k may contain a specific value of 'k', to be used in ECDSA signing.
|
|
|
|
// This is only for the FIPS power-on tests.
|
2017-06-13 20:45:49 +01:00
|
|
|
BIGNUM *fixed_k;
|
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
// NOTE(review): enc_flag and conv_form presumably control how the key and
// its point are serialized; confirm against the encoding code.
unsigned int enc_flag;
|
|
|
|
point_conversion_form_t conv_form;
|
|
|
|
|
2015-05-15 20:49:30 +01:00
|
|
|
// references is the reference count for this structure.
CRYPTO_refcount_t references;
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
|
// ecdsa_meth is an |ECDSA_METHOD| pointer (presumably an optional custom
// signing implementation; TODO confirm whether NULL is allowed).
ECDSA_METHOD *ecdsa_meth;
|
|
|
|
|
|
|
|
// ex_data is the |CRYPTO_EX_DATA| slot for this key (presumably
// caller-attached extra data; confirm).
CRYPTO_EX_DATA ex_data;
|
|
|
|
} /* EC_KEY */;
|
|
|
|
|
2017-06-21 19:37:22 +01:00
|
|
|
// built_in_curve describes a single built-in curve: its identifiers, its
// serialized parameters and the |EC_METHOD| associated with it.
struct built_in_curve {
|
|
|
|
// nid is the numeric identifier (NID) of the curve.
int nid;
|
|
|
|
// oid points to |oid_len| bytes identifying the curve (presumably the
// encoded object identifier; confirm the exact encoding with the users of
// this field).
const uint8_t *oid;
|
|
|
|
uint8_t oid_len;
|
2017-08-18 19:06:02 +01:00
|
|
|
// comment is a human-readable string describing the curve.
|
2014-07-19 00:26:25 +01:00
|
|
|
const char *comment;
|
2017-08-18 19:06:02 +01:00
|
|
|
// param_len is the number of bytes needed to store a field element.
|
2014-07-19 00:26:25 +01:00
|
|
|
uint8_t param_len;
|
2017-08-18 19:06:02 +01:00
|
|
|
// params points to an array of 6*|param_len| bytes which hold the field
|
|
|
|
// elements of the following (in big-endian order): prime, a, b, generator x,
|
|
|
|
// generator y, order.
|
2017-06-21 19:37:22 +01:00
|
|
|
const uint8_t *params;
|
2016-08-14 21:41:27 +01:00
|
|
|
// method is the |EC_METHOD| for this curve (presumably the implementation
// used when a group is created for it; confirm).
const EC_METHOD *method;
|
2014-07-19 00:26:25 +01:00
|
|
|
};
|
|
|
|
|
2017-05-02 22:25:39 +01:00
|
|
|
#define OPENSSL_NUM_BUILT_IN_CURVES 4
|
|
|
|
|
|
|
|
// built_in_curves wraps a fixed-size array of |built_in_curve| entries. The
// array length is exactly |OPENSSL_NUM_BUILT_IN_CURVES|.
struct built_in_curves {
|
|
|
|
struct built_in_curve curves[OPENSSL_NUM_BUILT_IN_CURVES];
|
|
|
|
};
|
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
// OPENSSL_built_in_curves returns a pointer to static information about
|
|
|
|
// standard curves. The array is terminated with an entry where |nid| is
|
|
|
|
// |NID_undef|.
|
2017-05-02 22:25:39 +01:00
|
|
|
const struct built_in_curves *OPENSSL_built_in_curves(void);
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
#if defined(__cplusplus)
|
2017-08-18 19:06:02 +01:00
|
|
|
} // extern C
|
2014-06-20 20:00:00 +01:00
|
|
|
#endif
|
|
|
|
|
2017-08-18 19:06:02 +01:00
|
|
|
#endif // OPENSSL_HEADER_EC_INTERNAL_H
|