boringssl/crypto/fipsmodule/bn/internal.h

598 lines
27 KiB
C
Raw Normal View History

/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
/* ====================================================================
* Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*
* This product includes cryptographic software written by Eric Young
* (eay@cryptsoft.com). This product includes software written by Tim
* Hudson (tjh@cryptsoft.com).
*
*/
/* ====================================================================
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
*
* Portions of the attached software ("Contribution") are developed by
* SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
*
* The Contribution is licensed pursuant to the Eric Young open source
* license provided above.
*
* The binary polynomial arithmetic software is originally written by
* Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
* Laboratories. */
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H
#include <openssl/base.h>
#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
OPENSSL_MSVC_PRAGMA(warning(push, 3))
#include <intrin.h>
OPENSSL_MSVC_PRAGMA(warning(pop))
#pragma intrinsic(__umulh, _umul128)
#endif
#include "../../internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
// Word-size configuration. Exactly one of |OPENSSL_64_BIT| and
// |OPENSSL_32_BIT| must be defined; otherwise compilation fails below. Each
// branch defines the same set of constants, sized for that platform's word.
#if defined(OPENSSL_64_BIT)
#if defined(BORINGSSL_HAS_UINT128)
// |BN_ULLONG| is a two-word integer type. On 64-bit it is only defined when a
// 128-bit integer type is available: MSVC doesn't support two-word integers on
// 64-bit.
#define BN_ULLONG uint128_t
#if defined(BORINGSSL_CAN_DIVIDE_UINT128)
// |BN_CAN_DIVIDE_ULLONG| is defined when division on |BN_ULLONG| is usable.
#define BN_CAN_DIVIDE_ULLONG
#endif
#endif
// |BN_BITS2| and |BN_BYTES| are the word size in bits and bytes. |BN_BITS4| is
// half of |BN_BITS2|.
#define BN_BITS2 64
#define BN_BYTES 8
#define BN_BITS4 32
// |BN_MASK2| has all |BN_BITS2| bits set. |BN_MASK2l| and |BN_MASK2h| select
// the low and high halves of a word. |BN_MASK2h1| is |BN_MASK2h| plus the top
// bit of the low half.
#define BN_MASK2 (0xffffffffffffffffUL)
#define BN_MASK2l (0xffffffffUL)
#define BN_MASK2h (0xffffffff00000000UL)
#define BN_MASK2h1 (0xffffffff80000000UL)
// The number of words in |BN_MONT_CTX::n0|. See the 32-bit branch for why this
// is not always one.
#define BN_MONT_CTX_N0_LIMBS 1
// |BN_DEC_CONV| is 10^|BN_DEC_NUM|, the largest power of ten that fits in a
// 64-bit word.
#define BN_DEC_CONV (10000000000000000000UL)
#define BN_DEC_NUM 19
// TOBN combines two 32-bit halves, |hi| and |lo|, into a single 64-bit word.
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
#elif defined(OPENSSL_32_BIT)
// A 64-bit integer serves as the two-word type, and it can always be divided.
#define BN_ULLONG uint64_t
#define BN_CAN_DIVIDE_ULLONG
// |BN_BITS2| and |BN_BYTES| are the word size in bits and bytes. |BN_BITS4| is
// half of |BN_BITS2|.
#define BN_BITS2 32
#define BN_BYTES 4
#define BN_BITS4 16
// |BN_MASK2| has all |BN_BITS2| bits set. |BN_MASK2l| and |BN_MASK2h| select
// the low and high halves of a word. |BN_MASK2h1| is |BN_MASK2h| plus the top
// bit of the low half.
#define BN_MASK2 (0xffffffffUL)
#define BN_MASK2l (0xffffUL)
#define BN_MASK2h1 (0xffff8000UL)
#define BN_MASK2h (0xffff0000UL)
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
// needs to be two words long. Only certain 32-bit platforms actually make use
// of n0[1] and a shorter R value would suffice for the others. However,
// currently only the assembly files know which is which.
#define BN_MONT_CTX_N0_LIMBS 2
// |BN_DEC_CONV| is 10^|BN_DEC_NUM|, the largest power of ten that fits in a
// 32-bit word.
#define BN_DEC_CONV (1000000000UL)
#define BN_DEC_NUM 9
// TOBN expands to two comma-separated 32-bit values, low half first, so the
// same |TOBN(hi, lo)| spelling yields one word on 64-bit and two on 32-bit.
// NOTE(review): presumably intended for use inside word-array initializers —
// confirm against callers.
#define TOBN(hi, lo) (lo), (hi)
#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif
Replace |alloca| in |BN_mod_exp_mont_consttime|. |alloca| is dangerous and poorly specified, according to any description of |alloca|. It's also hard for some analysis tools to reason about. The code here assumed |alloca| is a macro, which isn't a valid assumption. Depending on which headers are included and what toolchain is being used, |alloca| may or may not be defined as a macro, and this might change over time if/when toolchains are updated. Or, we might be doing static analysis and/or dynamic analysis with a different configuration w.r.t. the availability of |alloca| than production builds use. Regardless, the |alloca| code path only kicked in when the inputs were 840 bits or smaller. Since the multi-prime RSA support was removed, for interesting RSA key sizes the input will be at least 1024 bits and this code path won't be triggered since powerbufLen will be larger than 3072 bytes in those cases. ECC inversion via Fermat's Little Theorem has its own constant-time exponentiation so there are no cases where smaller inputs need to be fast. The RSAZ code avoids the |OPENSSL_malloc| for 2048-bit RSA keys. Increasingly the RSAZ code won't be used though, since it will be skipped over on Broadwell+ CPUs. Generalize the RSAZ stack allocation to work for non-RSAZ code paths. In order to ensure this doesn't cause too much stack usage on platforms where RSAZ wasn't already being used, only do so on x86-64, which already has this large stack size requirement due to RSAZ. This change will make it easier to refactor |BN_mod_exp_mont_consttime| to do that more safely and in a way that's more compatible with various analysis tools. This is also a step towards eliminating the |uintptr_t|-based alignment hack. Since this change increases the number of times |OPENSSL_free| is skipped, I've added an explicit |OPENSSL_cleanse| to ensure the zeroization is done. This should be done regardless of the other changes here.
Change-Id: I8a161ce2720a26127e85fff7513f394883e50b2e Reviewed-on: https://boringssl-review.googlesource.com/28584 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com>
2018-05-17 04:24:20 +01:00
// |BN_mod_exp_mont_consttime| is based on the assumption that the L1 data
// cache line width of the target processor is at least the following value.
#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH 64
// The number of |BN_ULONG|s needed for the |BN_mod_exp_mont_consttime| stack-
// allocated storage buffer. The expression below is a byte count,
// (320 * 3) + (32 * 9 * 16) = 5568 bytes, divided by the word size. That is
// just the right size for the RSAZ code and is about 1KB larger than what's
// necessary (4480 bytes) for 1024-bit inputs.
#define MOD_EXP_CTIME_STORAGE_LEN \
(((320u * 3u) + (32u * 9u * 16u)) / sizeof(BN_ULONG))
// STATIC_BIGNUM expands to a brace-enclosed initializer built from the word
// array |x|. Its elements are, in order: |x| cast to |BN_ULONG *|, the number
// of |BN_ULONG|s in |x| (twice), zero, and |BN_FLG_STATIC_DATA|.
//
// |x| must be an actual array rather than a pointer, because its length is
// computed with |sizeof|.
//
// NOTE(review): presumably this initializes a |BIGNUM|'s data pointer, width,
// allocated size, sign, and flags, in that order — confirm against the struct
// definition, which is not visible here.
#define STATIC_BIGNUM(x) \
{ \
(BN_ULONG *)(x), sizeof(x) / sizeof(BN_ULONG), \
sizeof(x) / sizeof(BN_ULONG), 0, BN_FLG_STATIC_DATA \
}
// Lw and Hw split a two-word value |t| into single words. They are only
// available when the platform has a two-word type, |BN_ULLONG|.
#if defined(BN_ULLONG)
// Lw returns the low word of |t| by truncating it to |BN_ULONG|.
#define Lw(t) ((BN_ULONG)(t))
// Hw returns the high word of |t|: |t| shifted right by |BN_BITS2| bits, then
// truncated to |BN_ULONG|.
#define Hw(t) ((BN_ULONG)((t) >> BN_BITS2))
#endif
Add initial support for non-minimal BIGNUMs. Thanks to Andres Erbsen for extremely helpful suggestions on how to finally plug this long-standing hole! OpenSSL BIGNUMs are currently minimal-width, which means they cannot be constant-time. We'll need to either excise BIGNUM from RSA and EC or somehow fix BIGNUM. EC_SCALAR and later EC_FELEM work will excise it from EC, but RSA's BIGNUMs are more transparent. Teaching BIGNUM to handle non-minimal word widths is probably simpler. The main constraint is BIGNUM's large "calculator" API surface. One could, in theory, do arbitrary math on RSA components, which means all public functions must tolerate non-minimal inputs. This is also useful for EC; https://boringssl-review.googlesource.com/c/boringssl/+/24445 is silly. As a first step, fix comparison-type functions that were assuming minimal BIGNUMs. I've also added bn_resize_words, but it is testing-only until the rest of the library is fixed. bn->top is now a loose upper bound we carry around. It does not affect numerical results, only performance and secrecy. This is a departure from the original meaning, and compiler help in auditing everything is nice, so the final change in this series will rename bn->top to bn->width. Thus these new functions are named per "width", not "top". Looking further ahead, how are output BIGNUM widths determined? There are three notions of correctness here: 1. Do I compute the right answer for all widths? 2. Do I handle secret data in constant time? 3. Does my memory usage not balloon absurdly? For (1), a BIGNUM function must give the same answer for all input widths. BN_mod_add_quick may assume |a| < |m|, but |a| may still be wider than |m| by way of leading zeros. The simplest approach is to write code in a width-agnostic way and rely on functions to accept all widths. Where functions need to look at bn->d, we'll add a few helper functions to smooth over funny widths. For (2), (1) is a little cumbersome. Consider constant-time modular addition.
A sane type system would guarantee input widths match. But C is weak here, and bifurcating the internals is a lot of work. Thus, at least for now, I do not propose we move RSA's internal computation out of BIGNUM. (EC_SCALAR/EC_FELEM are valuable for EC because we get to stack-allocate, curves were already specialized, and EC only has two types with many operations on those types. None of these apply to RSA. We've got numbers mod n, mod p, mod q, and their corresponding exponents, each of which is used for basically one operation.) Instead, constant-time BIGNUM functions will output non-minimal widths. This is trivial for BN_bin2bn or modular arithmetic. But for BN_mul, constant-time[*] would dictate r->top = a->top + b->top. A calculator repeatedly multiplying by one would then run out of memory. Those we'll split into a private BN_mul_fixed for crypto, leaving BN_mul for calculators. BN_mul is just BN_mul_fixed followed by bn_correct_top. [*] BN_mul is not constant-time for other reasons, but that will be fixed separately. Bug: 232 Change-Id: Ide2258ae8c09a9a41bb71d6777908d1c27917069 Reviewed-on: https://boringssl-review.googlesource.com/25244 Reviewed-by: Adam Langley <agl@google.com>
2018-01-20 20:56:53 +00:00
// bn_minimal_width returns the minimal value of |bn->width| which fits the
// value of |bn|. It takes |bn| as const and only computes the width; use
// |bn_set_minimal_width| to store it.
int bn_minimal_width(const BIGNUM *bn);
// bn_set_minimal_width sets |bn->width| to |bn_minimal_width(bn)|. If |bn| is
// zero, |bn->neg| is set to zero. It has no return value.
void bn_set_minimal_width(BIGNUM *bn);
// bn_wexpand ensures that |bn| has at least |words| words of space without
// altering its value. It returns one on success or zero on allocation
// failure.
int bn_wexpand(BIGNUM *bn, size_t words);
// bn_expand acts the same as |bn_wexpand|, but takes a number of bits rather
// than a number of words. Like |bn_wexpand|, it returns one on success or zero
// on allocation failure.
int bn_expand(BIGNUM *bn, size_t bits);
Add initial support for non-minimal BIGNUMs. Thanks to Andres Erbsen for extremely helpful suggestions on how to finally plug this long-standing hole! OpenSSL BIGNUMs are currently minimal-width, which means they cannot be constant-time. We'll need to either excise BIGNUM from RSA and EC or somehow fix BIGNUM. EC_SCALAR and later EC_FELEM work will excise it from EC, but RSA's BIGNUMs are more transparent. Teaching BIGNUM to handle non-minimal word widths is probably simpler. The main constraint is BIGNUM's large "calculator" API surface. One could, in theory, do arbitrary math on RSA components, which means all public functions must tolerate non-minimal inputs. This is also useful for EC; https://boringssl-review.googlesource.com/c/boringssl/+/24445 is silly. As a first step, fix comparison-type functions that were assuming minimal BIGNUMs. I've also added bn_resize_words, but it is testing-only until the rest of the library is fixed. bn->top is now a loose upper bound we carry around. It does not affect numerical results, only performance and secrecy. This is a departure from the original meaning, and compiler help in auditing everything is nice, so the final change in this series will rename bn->top to bn->width. Thus these new functions are named per "width", not "top". Looking further ahead, how are output BIGNUM widths determined? There are three notions of correctness here: 1. Do I compute the right answer for all widths? 2. Do I handle secret data in constant time? 3. Does my memory usage not balloon absurdly? For (1), a BIGNUM function must give the same answer for all input widths. BN_mod_add_quick may assume |a| < |m|, but |a| may still be wider than |m| by way of leading zeros. The simplest approach is to write code in a width-agnostic way and rely on functions to accept all widths. Where functions need to look at bn->d, we'll add a few helper functions to smooth over funny widths. For (2), (1) is a little cumbersome. Consider constant-time modular addition.
A sane type system would guarantee input widths match. But C is weak here, and bifurcating the internals is a lot of work. Thus, at least for now, I do not propose we move RSA's internal computation out of BIGNUM. (EC_SCALAR/EC_FELEM are valuable for EC because we get to stack-allocate, curves were already specialized, and EC only has two types with many operations on those types. None of these apply to RSA. We've got numbers mod n, mod p, mod q, and their corresponding exponents, each of which is used for basically one operation.) Instead, constant-time BIGNUM functions will output non-minimal widths. This is trivial for BN_bin2bn or modular arithmetic. But for BN_mul, constant-time[*] would dictate r->top = a->top + b->top. A calculator repeatedly multiplying by one would then run out of memory. Those we'll split into a private BN_mul_fixed for crypto, leaving BN_mul for calculators. BN_mul is just BN_mul_fixed followed by bn_correct_top. [*] BN_mul is not constant-time for other reasons, but that will be fixed separately. Bug: 232 Change-Id: Ide2258ae8c09a9a41bb71d6777908d1c27917069 Reviewed-on: https://boringssl-review.googlesource.com/25244 Reviewed-by: Adam Langley <agl@google.com>
2018-01-20 20:56:53 +00:00
// bn_resize_words adjusts |bn->width| to be |words|. It returns one on success
// and zero on allocation error or if |bn|'s value is too large to fit in
// |words| words.
OPENSSL_EXPORT int bn_resize_words(BIGNUM *bn, size_t words);
Add initial support for non-minimal BIGNUMs. Thanks to Andres Erbsen for extremely helpful suggestions on how to finally plug this long-standing hole! OpenSSL BIGNUMs are currently minimal-width, which means they cannot be constant-time. We'll need to either excise BIGNUM from RSA and EC or somehow fix BIGNUM. EC_SCALAR and later EC_FELEM work will excise it from EC, but RSA's BIGNUMs are more transparent. Teaching BIGNUM to handle non-minimal word widths is probably simpler. The main constraint is BIGNUM's large "calculator" API surface. One could, in theory, do arbitrary math on RSA components, which means all public functions must tolerate non-minimal inputs. This is also useful for EC; https://boringssl-review.googlesource.com/c/boringssl/+/24445 is silly. As a first step, fix comparison-type functions that were assuming minimal BIGNUMs. I've also added bn_resize_words, but it is testing-only until the rest of the library is fixed. bn->top is now a loose upper bound we carry around. It does not affect numerical results, only performance and secrecy. This is a departure from the original meaning, and compiler help in auditing everything is nice, so the final change in this series will rename bn->top to bn->width. Thus these new functions are named per "width", not "top". Looking further ahead, how are output BIGNUM widths determined? There are three notions of correctness here: 1. Do I compute the right answer for all widths? 2. Do I handle secret data in constant time? 3. Does my memory usage not balloon absurdly? For (1), a BIGNUM function must give the same answer for all input widths. BN_mod_add_quick may assume |a| < |m|, but |a| may still be wider than |m| by way of leading zeros. The simplest approach is to write code in a width-agnostic way and rely on functions to accept all widths. Where functions need to look at bn->d, we'll add a few helper functions to smooth over funny widths. For (2), (1) is a little cumbersome. Consider constant-time modular addition.
A sane type system would guarantee input widths match. But C is weak here, and bifurcating the internals is a lot of work. Thus, at least for now, I do not propose we move RSA's internal computation out of BIGNUM. (EC_SCALAR/EC_FELEM are valuable for EC because we get to stack-allocate, curves were already specialized, and EC only has two types with many operations on those types. None of these apply to RSA. We've got numbers mod n, mod p, mod q, and their corresponding exponents, each of which is used for basically one operation.) Instead, constant-time BIGNUM functions will output non-minimal widths. This is trivial for BN_bin2bn or modular arithmetic. But for BN_mul, constant-time[*] would dictate r->top = a->top + b->top. A calculator repeatedly multiplying by one would then run out of memory. Those we'll split into a private BN_mul_fixed for crypto, leaving BN_mul for calculators. BN_mul is just BN_mul_fixed followed by bn_correct_top. [*] BN_mul is not constant-time for other reasons, but that will be fixed separately. Bug: 232 Change-Id: Ide2258ae8c09a9a41bb71d6777908d1c27917069 Reviewed-on: https://boringssl-review.googlesource.com/25244 Reviewed-by: Adam Langley <agl@google.com>
2018-01-20 20:56:53 +00:00
// bn_select_words sets |r| to |a| if |mask| is all ones or |b| if |mask| is
// all zeros. Behavior for any other value of |mask| is not specified here.
//
// NOTE(review): presumably |r|, |a|, and |b| are each |num| words long —
// confirm against the implementation.
void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
const BN_ULONG *b, size_t num);
// bn_set_words sets |bn| to the value encoded in the |num| words in |words|,
// least significant word first.
//
// NOTE(review): the return value is not documented here; presumably it is one
// on success and zero on allocation failure, like |bn_wexpand| — confirm
// against the implementation.
int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
// bn_fits_in_words returns one if |bn| may be represented in |num| words, plus
// a sign bit, and zero otherwise. It takes |bn| as const and only reports
// whether the value fits.
int bn_fits_in_words(const BIGNUM *bn, size_t num);
// bn_copy_words copies the value of |bn| to |out| and returns one if the value
// is representable in |num| words. Otherwise, it returns zero.
//
// NOTE(review): |out| presumably must have room for |num| words; whether |out|
// is written on failure is not stated here — confirm before relying on it.
int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn);
// bn_mul_add_words multiplies |ap| by |w|, adds the result to |rp|, and places
// the result in |rp|. |ap| and |rp| must both be |num| words long. It returns
// the carry word of the operation. |ap| and |rp| may be equal but otherwise may
// not alias.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
BN_ULONG w);
// bn_mul_words multiplies |ap| by |w| and places the result in |rp|. |ap| and
// |rp| must both be |num| words long. It returns the carry word of the
// operation. |ap| and |rp| may be equal but otherwise may not alias.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, BN_ULONG w);
// bn_sqr_words sets |rp[2*i]| and |rp[2*i+1]| to |ap[i]|'s square, for each
// |i| from zero up to, but not including, |num|. |ap| is an array of |num|
// words and |rp| an array of |2*num| words. |ap| and |rp| may not alias.
//
// This gives the contribution of the |ap[i]*ap[i]| terms when squaring |ap|.
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num);
// bn_add_words adds |ap| to |bp| and places the result in |rp|, each of which
// is |num| words long. It returns the carry bit, which is one if the operation
// overflowed and zero otherwise. Any pair of |ap|, |bp|, and |rp| may be equal
// to each other but otherwise may not alias.
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t num);
// bn_sub_words subtracts |bp| from |ap| and places the result in |rp|. It
// returns the borrow bit, which is one if the computation underflowed and zero
// otherwise. Any pair of |ap|, |bp|, and |rp| may be equal to each other but
// otherwise may not alias.
//
// NOTE(review): presumably |ap|, |bp|, and |rp| are each |num| words long, as
// documented for |bn_add_words| — confirm against the implementation.
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t num);
// bn_mul_comba4 sets |r| to the product of |a| and |b|. Per the array bounds
// in the signature, |a| and |b| are 4 words each and |r| is 8 words.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]);
// bn_mul_comba8 sets |r| to the product of |a| and |b|. Per the array bounds
// in the signature, |a| and |b| are 8 words each and |r| is 16 words.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]);
// bn_sqr_comba8 sets |r| to |a|^2. |a| is 8 words long and |r|, which receives
// the double-width square, is 16 words long.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]);
// bn_sqr_comba4 sets |r| to |a|^2. Per the array bounds in the signature, |a|
// is 4 words and |r| is 8 words.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]);
// bn_less_than_words returns one if |a| < |b| and zero otherwise, where |a|
// and |b| both are |len| words long. It runs in constant time. It takes both
// inputs as const.
int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len);
// bn_in_range_words returns one if |min_inclusive| <= |a| < |max_exclusive|,
Blind the range check for finding a Rabin-Miller witness. Rabin-Miller requires selecting a random number from 2 to |w|-1. This is done by picking an N-bit number and discarding out-of-range values. This leaks information about |w|, so apply blinding. Rather than discard bad values, adjust them to be in range. Though not uniformly selected, these adjusted values are still usable as Rabin-Miller checks. Rabin-Miller is already probabilistic, so we could reach the desired confidence levels by just suitably increasing the iteration count. However, to align with FIPS 186-4, we use a more pessimal analysis: we do not count the non-uniform values towards the iteration count. As a result, this function is more complex and has more timing risk than necessary. We count both total iterations and uniform ones and iterate until we've reached at least |BN_PRIME_CHECKS_BLINDED| and |iterations|, respectively. If the latter is large enough, it will be the limiting factor with high probability and we won't leak information. Note this blinding does not impact most calls when picking primes because composites are rejected early. Only the two secret primes see extra work. So while this does make the BNTest.PrimeChecking test take about 2x longer to run in debug mode, RSA key generation time is fine. Another, perhaps simpler, option here would have been to run bn_rand_range_words to the full 100 count, select an arbitrary successful try, and declare failure of the entire keygen process (as we do already) if all tries failed. I went with the option in this CL because I happened to come up with it first, and because the failure probability decreases much faster. Additionally, the option in this CL does not affect composite numbers, while the alternate would. This gives a smaller multiplier on our entropy draw. We also continue to use the "wasted" work for stronger assurance on primality. FIPS' numbers are remarkably low, considering the increase has negligible cost.
Thanks to Nathan Benjamin for helping me explore the failure rate as the target count and blinding count change. Now we're down to the rest of RSA keygen, which will require all the operations we've traditionally just avoided in constant-time code! Median of 29 RSA keygens: 0m0.169s -> 0m0.298s (Accuracy beyond 0.1s is questionable. The runs at subsequent test- and rename-only CLs were 0m0.217s, 0m0.245s, 0m0.244s, 0m0.247s.) Bug: 238 Change-Id: Id6406c3020f2585b86946eb17df64ac42f30ebab Reviewed-on: https://boringssl-review.googlesource.com/25890 Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-02-05 04:48:36 +00:00
// bn_in_range_words returns one if |min_inclusive| <= |a| < |max_exclusive|,
// where |a| and |max_exclusive| both are |len| words long. |a| and
// |max_exclusive| are treated as secret. |min_inclusive| is a single word.
int bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive,
const BN_ULONG *max_exclusive, size_t len);
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherited from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now.
Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
// bn_rand_range_words sets |out| to a uniformly distributed random number from
// |min_inclusive| to |max_exclusive|. Both |out| and |max_exclusive| are |len|
// words long. |min_inclusive| is a single word.
//
// This function runs in time independent of the result, but |min_inclusive| and
// |max_exclusive| are public data. (Information about the range is unavoidably
// leaked by how many iterations it took to select a number.)
//
// NOTE(review): the return value and the role of the 32-byte
// |additional_data| are not documented here. Presumably it returns one on
// success and zero on failure, and |additional_data| is mixed into the random
// number generation — confirm against the implementation.
int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive,
const BN_ULONG *max_exclusive, size_t len,
const uint8_t additional_data[32]);
Blind the range check for finding a Rabin-Miller witness. Rabin-Miller requires selecting a random number from 2 to |w|-1. This is done by picking an N-bit number and discarding out-of-range values. This leaks information about |w|, so apply blinding. Rather than discard bad values, adjust them to be in range. Though not uniformly selected, these adjusted values are still usable as Rabin-Miller checks. Rabin-Miller is already probabilistic, so we could reach the desired confidence levels by just suitably increasing the iteration count. However, to align with FIPS 186-4, we use a more pessimal analysis: we do not count the non-uniform values towards the iteration count. As a result, this function is more complex and has more timing risk than necessary. We count both total iterations and uniform ones and iterate until we've reached at least |BN_PRIME_CHECKS_BLINDED| and |iterations|, respectively. If the latter is large enough, it will be the limiting factor with high probability and we won't leak information. Note this blinding does not impact most calls when picking primes because composites are rejected early. Only the two secret primes see extra work. So while this does make the BNTest.PrimeChecking test take about 2x longer to run on debug mode, RSA key generation time is fine. Another, perhaps simpler, option here would have to run bn_rand_range_words to the full 100 count, select an arbitrary successful try, and declare failure of the entire keygen process (as we do already) if all tries failed. I went with the option in this CL because I happened to come up with it first, and because the failure probability decreases much faster. Additionally, the option in this CL does not affect composite numbers, while the alternate would. This gives a smaller multiplier on our entropy draw. We also continue to use the "wasted" work for stronger assurance on primality. FIPS' numbers are remarkably low, considering the increase has negligible cost. 
Thanks to Nathan Benjamin for helping me explore the failure rate as the target count and blinding count change. Now we're down to the rest of RSA keygen, which will require all the operations we've traditionally just avoided in constant-time code! Median of 29 RSA keygens: 0m0.169s -> 0m0.298s (Accuracy beyond 0.1s is questionable. The runs at subsequent test- and rename-only CLs were 0m0.217s, 0m0.245s, 0m0.244s, 0m0.247s.) Bug: 238 Change-Id: Id6406c3020f2585b86946eb17df64ac42f30ebab Reviewed-on: https://boringssl-review.googlesource.com/25890 Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-02-05 04:48:36 +00:00
// bn_rand_secret_range behaves like |BN_rand_range_ex|, but treats
// |max_exclusive| as secret. Because of this constraint, the distribution of
// values returned is more complex.
//
// Rather than repeatedly generating values until one is in range, which would
// leak information, it generates one value. If the value is in range, it sets
// |*out_is_uniform| to one. Otherwise, it sets |*out_is_uniform| to zero,
// fixing up the value to force it in range.
//
// The subset of calls to |bn_rand_secret_range| which set |*out_is_uniform| to
// one are uniformly distributed in the target range. Calls overall are not.
// This function is intended for use in situations where the extra values are
// still usable and where the number of iterations needed to reach the target
// number of uniform outputs may be blinded for negligible probabilities of
// timing leaks.
//
// Although this function treats |max_exclusive| as secret, it treats the number
// of bits in |max_exclusive| as public.
int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive,
const BIGNUM *max_exclusive);
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
uint64_t bn_mont_n0(const BIGNUM *n);
// bn_mod_exp_base_2_consttime calculates r = 2**p (mod n). |p| must be larger
// than log_2(n); i.e. 2**p must be larger than |n|. |n| must be positive and
// odd. |p| and the bit width of |n| are assumed public, but |n| is otherwise
// treated as secret.
int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n,
BN_CTX *ctx);
// BN_UMULT_LOHI(low, high, a, b) computes the full double-width product of |a|
// and |b|, storing the low word in |low| and the high word in |high|. On
// 64-bit MSVC it is built on the |_umul128| intrinsic, which returns the low
// 64 bits of the product and writes the high 64 bits through its third
// argument. Because the address of |high| is taken, |high| must be an lvalue.
#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
#define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
#endif
// Fail the build early if neither a double-width integer type (|BN_ULLONG|)
// nor a widening multiply (|BN_UMULT_LOHI|) has been defined by this point.
#if !defined(BN_ULLONG) && !defined(BN_UMULT_LOHI)
#error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
#endif
// bn_jacobi returns the Jacobi symbol of |a| and |b| (which is -1, 0 or 1), or
// -2 on error.
int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
// bn_is_bit_set_words returns one if bit |bit| is set in |a| and zero
// otherwise.
int bn_is_bit_set_words(const BN_ULONG *a, size_t num, unsigned bit);
// bn_one_to_montgomery sets |r| to one in Montgomery form. It returns one on
// success and zero on error. This function treats the bit width of the modulus
// as public.
int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_less_than_montgomery_R returns one if |bn| is less than the Montgomery R
// value for |mont| and zero otherwise.
int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont);
// bn_mod_u16_consttime returns |bn| mod |d|, ignoring |bn|'s sign bit. It runs
// in time independent of the value of |bn|, but it treats |d| as public.
OPENSSL_EXPORT uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d);
// bn_odd_number_is_obviously_composite returns one if |bn| is divisible by one
// of the first several odd primes and zero otherwise.
int bn_odd_number_is_obviously_composite(const BIGNUM *bn);
// bn_rshift1_words sets |r| to |a| >> 1, where both arrays are |num| words
// wide.
void bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num);
// bn_rshift_words sets |r| to |a| >> |shift|, where both arrays are |num| words
// wide.
void bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift,
size_t num);
// bn_rshift_secret_shift behaves like |BN_rshift| but runs in time independent
// of both |a| and |n|.
OPENSSL_EXPORT int bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a,
unsigned n, BN_CTX *ctx);
// bn_reduce_once sets |r| to |a| mod |m| where 0 <= |a| < 2*|m|. It returns
// zero if |a| < |m| and a mask of all ones if |a| >= |m|. Each array is |num|
// words long, but |a| has an additional word specified by |carry|. |carry| must
// be zero or one, as implied by the bounds on |a|.
//
// |r|, |a|, and |m| may not alias. Use |bn_reduce_once_in_place| if |r| and |a|
// must alias.
BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry,
const BN_ULONG *m, size_t num);
// bn_reduce_once_in_place behaves like |bn_reduce_once| but acts in-place on
// |r|, using |tmp| as scratch space. |r|, |tmp|, and |m| may not alias.
BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m,
BN_ULONG *tmp, size_t num);
// Constant-time non-modular arithmetic.
//
// The following functions implement non-modular arithmetic in constant-time
// and pessimally set |r->width| to the largest possible word size.
//
// Note this means that, e.g., repeatedly multiplying by one will cause widths
// to increase without bound. The corresponding public API functions minimize
// their outputs to avoid regressing calculator consumers.
// bn_uadd_consttime behaves like |BN_uadd|, but it pessimally sets
// |r->width| = |a->width| + |b->width| + 1.
//
// NOTE(review): an unsigned sum always fits in max(|a->width|, |b->width|) + 1
// words, so the width stated above looks larger than necessary. Confirm
// against the implementation which of the two is actually used.
int bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
// bn_usub_consttime behaves like |BN_usub|, but it pessimally sets
// |r->width| = |a->width|.
int bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
// bn_abs_sub_consttime sets |r| to the absolute value of |a| - |b|, treating
// both inputs as secret. It returns one on success and zero on error.
OPENSSL_EXPORT int bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a,
const BIGNUM *b, BN_CTX *ctx);
// bn_mul_consttime behaves like |BN_mul|, but it rejects negative inputs and
// pessimally sets |r->width| to |a->width| + |b->width|, to avoid leaking
// information about |a| and |b|.
int bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
// bn_sqr_consttime behaves like |BN_sqr|, but it pessimally sets |r->width|
// to 2*|a->width|, to avoid leaking information about |a|.
int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
// bn_div_consttime behaves like |BN_div|, but it rejects negative inputs and
// treats both inputs, including their magnitudes, as secret. It is, as a
// result, much slower than |BN_div| and should only be used for rare operations
// where Montgomery reduction is not available.
//
// Note that |quotient->width| will be set pessimally to |numerator->width|.
OPENSSL_EXPORT int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
const BIGNUM *numerator,
const BIGNUM *divisor, BN_CTX *ctx);
// bn_is_relatively_prime checks whether GCD(|x|, |y|) is one. On success, it
// returns one and sets |*out_relatively_prime| to one if the GCD was one and
// zero otherwise. On error, it returns zero.
OPENSSL_EXPORT int bn_is_relatively_prime(int *out_relatively_prime,
const BIGNUM *x, const BIGNUM *y,
BN_CTX *ctx);
// bn_lcm_consttime sets |r| to LCM(|a|, |b|). It returns one on success and
// zero on error. |a| and |b| are both treated as secret.
OPENSSL_EXPORT int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_CTX *ctx);
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
// Constant-time modular arithmetic.
//
// The following functions implement basic constant-time modular arithmetic.
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
// |r|, |a|, and |b| may alias.
void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
const BN_ULONG *m, BN_ULONG *tmp, size_t num);
// bn_mod_add_consttime acts like |BN_mod_add_quick| but takes a |BN_CTX|.
int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
const BIGNUM *m, BN_CTX *ctx);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented generically. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertible to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchmarks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) 
Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
// bn_mod_sub_words sets |r| to |a| - |b| (mod |m|), using |tmp| as scratch
// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
// |r|, |a|, and |b| may alias.
void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
const BN_ULONG *m, BN_ULONG *tmp, size_t num);
// bn_mod_sub_consttime acts like |BN_mod_sub_quick| but takes a |BN_CTX|.
int bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
const BIGNUM *m, BN_CTX *ctx);
// bn_mod_lshift1_consttime acts like |BN_mod_lshift1_quick| but takes a
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
// |BN_CTX|.
int bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
BN_CTX *ctx);
// bn_mod_lshift_consttime acts like |BN_mod_lshift_quick| but takes a |BN_CTX|.
int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
BN_CTX *ctx);
// bn_mod_inverse_consttime sets |r| to |a|^-1, mod |n|. |a| must be non-
// negative and less than |n|. It returns one on success and zero on error. On
// failure, if the failure was caused by |a| having no inverse mod |n| then
// |*out_no_inverse| will be set to one; otherwise it will be set to zero.
//
// This function treats both |a| and |n| as secret, provided they are both non-
// zero and the inverse exists. It should only be used for even moduli where
// none of the less general implementations are applicable.
OPENSSL_EXPORT int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx);
// bn_mod_inverse_prime sets |out| to the modular inverse of |a| modulo |p|,
// computed with Fermat's Little Theorem. It returns one on success and zero on
// error. If |mont_p| is NULL, one will be computed temporarily.
int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx, const BN_MONT_CTX *mont_p);
// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| but uses
// |BN_mod_exp_mont_consttime| instead of |BN_mod_exp_mont| in hopes of
// protecting the exponent.
int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx, const BN_MONT_CTX *mont_p);
Make BN_mod_*_quick constant-time. As the EC code will ultimately want to use these in "words" form by way of EC_FELEM, and because it's much easier, I've implement these as low-level words-based functions that require all inputs have the same width. The BIGNUM versions which RSA and, for now, EC calls are implemented on top of that. Unfortunately, doing such things in constant-time and accounting for undersized inputs requires some scratch space, and these functions don't take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that take a BN_CTX and the old functions now allocate a bit unnecessarily. RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the generic EC code wants add as well. The generic EC code isn't even remotely constant-time, and I hope to ultimately use stack-allocated EC_FELEMs, so I've made the actual implementations here implemented in "words", which is much simpler anyway due to not having to take care of widths. I've also gone ahead and switched the EC code to these functions, largely as a test of their performance (an earlier iteration made the EC code noticeably slower). These operations are otherwise not performance-critical in RSA. The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the static linker already, and the unused BIGNUM+BN_CTX functions will fall off when EC_FELEM happens. Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not really used externally. The one caller I found was wpa_supplicant which bounces on malloc already. They appear to be implementing compressed coordinates by hand? We may be able to convince them to call EC_POINT_set_compressed_coordinates_GFp. Bug: 233, 236 Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b Reviewed-on: https://boringssl-review.googlesource.com/25261 Commit-Queue: David Benjamin <davidben@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: Adam Langley <agl@google.com>
2018-01-24 20:29:00 +00:00
// Low-level operations for small numbers.
//
// The following functions implement algorithms suitable for use with scalars
// and field elements in elliptic curves. They rely on the number being small
// both to stack-allocate various temporaries and because they do not implement
// optimizations useful for the larger values used in RSA.
// BN_SMALL_MAX_WORDS is the largest size input these functions handle. This
// limit allows temporaries to be more easily stack-allocated. This limit is set
// to accommodate P-521.
//
// A 521-bit value needs ceil(521/32) = 17 words or ceil(521/64) = 9 words,
// assuming |BN_ULONG| is 32 bits wide when |OPENSSL_32_BIT| is defined and 64
// bits wide otherwise.
#if defined(OPENSSL_32_BIT)
#define BN_SMALL_MAX_WORDS 17
#else
#define BN_SMALL_MAX_WORDS 9
#endif
// bn_mul_small sets |r| to |a|*|b|, where |a| is |num_a| words long and |b| is
// |num_b| words long. |num_r| must be |num_a| + |num_b|. |r| may not alias with
// |a| or |b|. The function returns no value.
//
// NOTE(review): unlike |bn_sqr_small|, no |BN_SMALL_MAX_WORDS| bound on |num_a|
// or |num_b| is documented here; confirm against the definition whether one
// applies.
void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
const BN_ULONG *b, size_t num_b);
// bn_sqr_small sets |r| to |a|^2, where |a| is |num_a| words long. |num_a| must
// be at most |BN_SMALL_MAX_WORDS| and |num_r| must be |num_a|*2. |r| and |a|
// may not alias. The function returns no value.
void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
// In the following functions, the modulus must be at most |BN_SMALL_MAX_WORDS|
// words long.
// bn_to_montgomery_small sets |r| to |a| translated to the Montgomery domain.
// |r| and |a| are |num| words long, which must be |mont->N.width| (and so, as
// the modulus is limited to |BN_SMALL_MAX_WORDS| words, at most
// |BN_SMALL_MAX_WORDS|). |a| must be fully reduced (presumably modulo
// |mont->N|) and may alias |r|. The function returns no value.
void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_MONT_CTX *mont);
// bn_from_montgomery_small sets |r| to |a| translated out of the Montgomery
// domain. |r| and |a| are |num| words long, which must be |mont->N.width| (and
// so, as the modulus is limited to |BN_SMALL_MAX_WORDS| words, at most
// |BN_SMALL_MAX_WORDS|). |a| must be fully reduced (presumably modulo
// |mont->N|) and may alias |r|. The function returns no value.
void bn_from_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_MONT_CTX *mont);
// bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
// and outputs are in the Montgomery domain. Each of |r|, |a|, and |b| is |num|
// words long, which must be |mont->N.width| (and so, as the modulus is limited
// to |BN_SMALL_MAX_WORDS| words, at most |BN_SMALL_MAX_WORDS|). |a| and |b|
// must be reduced on input. Any two of |r|, |a|, and |b| may alias. The
// function returns no value.
void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
const BN_ULONG *b, size_t num,
const BN_MONT_CTX *mont);
// bn_mod_exp_mont_small sets |r| to |a|^|p| mod |mont->N|. Both inputs and
// outputs are in the Montgomery domain. |r| and |a| are |num| words long, which
// must be |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be
// fully-reduced and may alias with |r|. The exponent |p| is presumably |num_p|
// words long (inferred from the parameter name; confirm against the
// definition).
//
// This function runs in time independent of |a|, but |p| and |mont->N| are
// public values.
//
// The function returns no value.
// NOTE(review): this comment previously described a one/zero return value, but
// the declaration is void. How a violated precondition is handled is not
// visible from this header; confirm against the definition.
//
// Note this function differs from |BN_mod_exp_mont| which uses Montgomery
// reduction but takes input and output outside the Montgomery domain. Combine
// this function with |bn_from_montgomery_small| and |bn_to_montgomery_small|
// if necessary.
void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_ULONG *p, size_t num_p,
const BN_MONT_CTX *mont);
// bn_mod_inverse_prime_mont_small sets |r| to |a|^-1 mod |mont->N|. |mont->N|
// must be a prime. |r| and |a| are |num| words long, which must be
// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced
// and may alias |r|. This function runs in time independent of |a|, but
// |mont->N| is a public value. The function returns no value.
//
// NOTE(review): unlike |bn_mod_exp_mont_small|, this comment does not say
// whether |a| and |r| are in the Montgomery domain; confirm against the
// definition and document it here.
void bn_mod_inverse_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_MONT_CTX *mont);
#if defined(__cplusplus)
} // extern C
#endif
#endif // OPENSSL_HEADER_BN_INTERNAL_H