boringssl/crypto/fipsmodule/ec/p224-64.c

1126 lines
41 KiB
C
Raw Normal View History

/* Copyright (c) 2015, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
// A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
//
// Inspired by Daniel J. Bernstein's public domain nistp224 implementation
// and Adam Langley's public domain 64-bit C implementation of curve25519.
#include <openssl/base.h>
#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <string.h>
#include "internal.h"
#include "../delocate.h"
#include "../../internal.h"
#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL)
// Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
// using 64-bit coefficients called 'limbs', and sometimes (for multiplication
// results) as b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 +
// 2^336*b_6 using 128-bit coefficients called 'widelimbs'. A 4-p224_limb
// representation is an 'p224_felem'; a 7-p224_widelimb representation is a
// 'p224_widefelem'. Even within felems, bits of adjacent limbs overlap, and we
// don't always reduce the representations: we ensure that inputs to each
// p224_felem multiplication satisfy a_i < 2^60, so outputs satisfy b_i <
// 4*2^60*2^60, and fit into a 128-bit word without overflow. The coefficients
// are then again partially reduced to obtain an p224_felem satisfying a_i <
// 2^57. We only reduce to the unique minimal representation at the end of the
// computation.
typedef uint64_t p224_limb;
typedef uint128_t p224_widelimb;
typedef p224_limb p224_felem[4];
typedef p224_widelimb p224_widefelem[7];
// Field element represented as a byte arrary. 28*8 = 224 bits is also the
// group order size for the elliptic curve, and we also use this type for
// scalars for point multiplication.
typedef uint8_t p224_felem_bytearray[28];
// Precomputed multiples of the standard generator
// Points are given in coordinates (X, Y, Z) where Z normally is 1
// (0 for the point at infinity).
// For each field element, slice a_0 is word 0, etc.
//
// The table has 2 * 16 elements, starting with the following:
// index | bits | point
// ------+---------+------------------------------
// 0 | 0 0 0 0 | 0G
// 1 | 0 0 0 1 | 1G
// 2 | 0 0 1 0 | 2^56G
// 3 | 0 0 1 1 | (2^56 + 1)G
// 4 | 0 1 0 0 | 2^112G
// 5 | 0 1 0 1 | (2^112 + 1)G
// 6 | 0 1 1 0 | (2^112 + 2^56)G
// 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
// 8 | 1 0 0 0 | 2^168G
// 9 | 1 0 0 1 | (2^168 + 1)G
// 10 | 1 0 1 0 | (2^168 + 2^56)G
// 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
// 12 | 1 1 0 0 | (2^168 + 2^112)G
// 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
// 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
// 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
// followed by a copy of this with each element multiplied by 2^28.
//
// The reason for this is so that we can clock bits into four different
// locations when doing simple scalar multiplies against the base point,
// and then another four locations using the second 16 elements.
static const p224_felem g_p224_pre_comp[2][16][3] = {
{{{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}},
{{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
{0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
{1, 0, 0, 0}},
{{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
{0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
{1, 0, 0, 0}},
{{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
{0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
{1, 0, 0, 0}},
{{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
{0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
{1, 0, 0, 0}},
{{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
{0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
{1, 0, 0, 0}},
{{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
{0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
{1, 0, 0, 0}},
{{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
{0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
{1, 0, 0, 0}},
{{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
{0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
{1, 0, 0, 0}},
{{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
{0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
{1, 0, 0, 0}},
{{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
{0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
{1, 0, 0, 0}},
{{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
{0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
{1, 0, 0, 0}},
{{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
{0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
{1, 0, 0, 0}},
{{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
{0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
{1, 0, 0, 0}},
{{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
{0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
{1, 0, 0, 0}},
{{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
{0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
{1, 0, 0, 0}}},
{{{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}},
{{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
{0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
{1, 0, 0, 0}},
{{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
{0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
{1, 0, 0, 0}},
{{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
{0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
{1, 0, 0, 0}},
{{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
{0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
{1, 0, 0, 0}},
{{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
{0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
{1, 0, 0, 0}},
{{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
{0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
{1, 0, 0, 0}},
{{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
{0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
{1, 0, 0, 0}},
{{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
{0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
{1, 0, 0, 0}},
{{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
{0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
{1, 0, 0, 0}},
{{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
{0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
{1, 0, 0, 0}},
{{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
{0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
{1, 0, 0, 0}},
{{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
{0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
{1, 0, 0, 0}},
{{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
{0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
{1, 0, 0, 0}},
{{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
{0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
{1, 0, 0, 0}},
{{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
{0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
{1, 0, 0, 0}}}};
static uint64_t p224_load_u64(const uint8_t in[8]) {
uint64_t ret;
OPENSSL_memcpy(&ret, in, sizeof(ret));
return ret;
}
// Helper functions to convert field elements to/from internal representation
static void p224_bin28_to_felem(p224_felem out, const uint8_t in[28]) {
out[0] = p224_load_u64(in) & 0x00ffffffffffffff;
out[1] = p224_load_u64(in + 7) & 0x00ffffffffffffff;
out[2] = p224_load_u64(in + 14) & 0x00ffffffffffffff;
out[3] = p224_load_u64(in + 20) >> 8;
}
static void p224_felem_to_bin28(uint8_t out[28], const p224_felem in) {
for (size_t i = 0; i < 7; ++i) {
out[i] = in[0] >> (8 * i);
out[i + 7] = in[1] >> (8 * i);
out[i + 14] = in[2] >> (8 * i);
out[i + 21] = in[3] >> (8 * i);
}
}
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
static void p224_generic_to_felem(p224_felem out, const EC_FELEM *in) {
p224_bin28_to_felem(out, in->bytes);
}
// Requires 0 <= in < 2*p (always call p224_felem_reduce first)
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
static void p224_felem_to_generic(EC_FELEM *out, const p224_felem in) {
// Reduce to unique minimal representation.
static const int64_t two56 = ((p224_limb)1) << 56;
// 0 <= in < 2*p, p = 2^224 - 2^96 + 1
// if in > p , reduce in = in - 2^224 + 2^96 - 1
int64_t tmp[4], a;
tmp[0] = in[0];
tmp[1] = in[1];
tmp[2] = in[2];
tmp[3] = in[3];
// Case 1: a = 1 iff in >= 2^224
a = (in[3] >> 56);
tmp[0] -= a;
tmp[1] += a << 40;
tmp[3] &= 0x00ffffffffffffff;
// Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1 and
// the lower part is non-zero
a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
(((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
a &= 0x00ffffffffffffff;
// turn a into an all-one mask (if a = 0) or an all-zero mask
a = (a - 1) >> 63;
// subtract 2^224 - 2^96 + 1 if a is all-one
tmp[3] &= a ^ 0xffffffffffffffff;
tmp[2] &= a ^ 0xffffffffffffffff;
tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
tmp[0] -= 1 & a;
// eliminate negative coefficients: if tmp[0] is negative, tmp[1] must
// be non-zero, so we only need one step
a = tmp[0] >> 63;
tmp[0] += two56 & a;
tmp[1] -= 1 & a;
// carry 1 -> 2 -> 3
tmp[2] += tmp[1] >> 56;
tmp[1] &= 0x00ffffffffffffff;
tmp[3] += tmp[2] >> 56;
tmp[2] &= 0x00ffffffffffffff;
// Now 0 <= tmp < p
p224_felem tmp2;
tmp2[0] = tmp[0];
tmp2[1] = tmp[1];
tmp2[2] = tmp[2];
tmp2[3] = tmp[3];
p224_felem_to_bin28(out->bytes, tmp2);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
// 224 is not a multiple of 64, so zero the remaining bytes.
OPENSSL_memset(out->bytes + 28, 0, 32 - 28);
}
// Field operations, using the internal representation of field elements.
// NB! These operations are specific to our point multiplication and cannot be
// expected to be correct in general - e.g., multiplication with a large scalar
// will cause an overflow.
static void p224_felem_assign(p224_felem out, const p224_felem in) {
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
}
// Sum two field elements: out += in
static void p224_felem_sum(p224_felem out, const p224_felem in) {
out[0] += in[0];
out[1] += in[1];
out[2] += in[2];
out[3] += in[3];
}
// Subtract field elements: out -= in
// Assumes in[i] < 2^57
static void p224_felem_diff(p224_felem out, const p224_felem in) {
static const p224_limb two58p2 =
(((p224_limb)1) << 58) + (((p224_limb)1) << 2);
static const p224_limb two58m2 =
(((p224_limb)1) << 58) - (((p224_limb)1) << 2);
static const p224_limb two58m42m2 =
(((p224_limb)1) << 58) - (((p224_limb)1) << 42) - (((p224_limb)1) << 2);
// Add 0 mod 2^224-2^96+1 to ensure out > in
out[0] += two58p2;
out[1] += two58m42m2;
out[2] += two58m2;
out[3] += two58m2;
out[0] -= in[0];
out[1] -= in[1];
out[2] -= in[2];
out[3] -= in[3];
}
// Subtract in unreduced 128-bit mode: out -= in
// Assumes in[i] < 2^119
static void p224_widefelem_diff(p224_widefelem out, const p224_widefelem in) {
static const p224_widelimb two120 = ((p224_widelimb)1) << 120;
static const p224_widelimb two120m64 =
(((p224_widelimb)1) << 120) - (((p224_widelimb)1) << 64);
static const p224_widelimb two120m104m64 = (((p224_widelimb)1) << 120) -
(((p224_widelimb)1) << 104) -
(((p224_widelimb)1) << 64);
// Add 0 mod 2^224-2^96+1 to ensure out > in
out[0] += two120;
out[1] += two120m64;
out[2] += two120m64;
out[3] += two120;
out[4] += two120m104m64;
out[5] += two120m64;
out[6] += two120m64;
out[0] -= in[0];
out[1] -= in[1];
out[2] -= in[2];
out[3] -= in[3];
out[4] -= in[4];
out[5] -= in[5];
out[6] -= in[6];
}
// Subtract in mixed mode: out128 -= in64
// in[i] < 2^63
static void p224_felem_diff_128_64(p224_widefelem out, const p224_felem in) {
static const p224_widelimb two64p8 =
(((p224_widelimb)1) << 64) + (((p224_widelimb)1) << 8);
static const p224_widelimb two64m8 =
(((p224_widelimb)1) << 64) - (((p224_widelimb)1) << 8);
static const p224_widelimb two64m48m8 = (((p224_widelimb)1) << 64) -
(((p224_widelimb)1) << 48) -
(((p224_widelimb)1) << 8);
// Add 0 mod 2^224-2^96+1 to ensure out > in
out[0] += two64p8;
out[1] += two64m48m8;
out[2] += two64m8;
out[3] += two64m8;
out[0] -= in[0];
out[1] -= in[1];
out[2] -= in[2];
out[3] -= in[3];
}
// Multiply a field element by a scalar: out = out * scalar
// The scalars we actually use are small, so results fit without overflow
static void p224_felem_scalar(p224_felem out, const p224_limb scalar) {
out[0] *= scalar;
out[1] *= scalar;
out[2] *= scalar;
out[3] *= scalar;
}
// Multiply an unreduced field element by a scalar: out = out * scalar
// The scalars we actually use are small, so results fit without overflow
static void p224_widefelem_scalar(p224_widefelem out,
const p224_widelimb scalar) {
out[0] *= scalar;
out[1] *= scalar;
out[2] *= scalar;
out[3] *= scalar;
out[4] *= scalar;
out[5] *= scalar;
out[6] *= scalar;
}
// Square a field element: out = in^2
static void p224_felem_square(p224_widefelem out, const p224_felem in) {
p224_limb tmp0, tmp1, tmp2;
tmp0 = 2 * in[0];
tmp1 = 2 * in[1];
tmp2 = 2 * in[2];
out[0] = ((p224_widelimb)in[0]) * in[0];
out[1] = ((p224_widelimb)in[0]) * tmp1;
out[2] = ((p224_widelimb)in[0]) * tmp2 + ((p224_widelimb)in[1]) * in[1];
out[3] = ((p224_widelimb)in[3]) * tmp0 + ((p224_widelimb)in[1]) * tmp2;
out[4] = ((p224_widelimb)in[3]) * tmp1 + ((p224_widelimb)in[2]) * in[2];
out[5] = ((p224_widelimb)in[3]) * tmp2;
out[6] = ((p224_widelimb)in[3]) * in[3];
}
// Multiply two field elements: out = in1 * in2
static void p224_felem_mul(p224_widefelem out, const p224_felem in1,
const p224_felem in2) {
out[0] = ((p224_widelimb)in1[0]) * in2[0];
out[1] = ((p224_widelimb)in1[0]) * in2[1] + ((p224_widelimb)in1[1]) * in2[0];
out[2] = ((p224_widelimb)in1[0]) * in2[2] + ((p224_widelimb)in1[1]) * in2[1] +
((p224_widelimb)in1[2]) * in2[0];
out[3] = ((p224_widelimb)in1[0]) * in2[3] + ((p224_widelimb)in1[1]) * in2[2] +
((p224_widelimb)in1[2]) * in2[1] + ((p224_widelimb)in1[3]) * in2[0];
out[4] = ((p224_widelimb)in1[1]) * in2[3] + ((p224_widelimb)in1[2]) * in2[2] +
((p224_widelimb)in1[3]) * in2[1];
out[5] = ((p224_widelimb)in1[2]) * in2[3] + ((p224_widelimb)in1[3]) * in2[2];
out[6] = ((p224_widelimb)in1[3]) * in2[3];
}
// Reduce seven 128-bit coefficients to four 64-bit coefficients.
// Requires in[i] < 2^126,
// ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
static void p224_felem_reduce(p224_felem out, const p224_widefelem in) {
static const p224_widelimb two127p15 =
(((p224_widelimb)1) << 127) + (((p224_widelimb)1) << 15);
static const p224_widelimb two127m71 =
(((p224_widelimb)1) << 127) - (((p224_widelimb)1) << 71);
static const p224_widelimb two127m71m55 = (((p224_widelimb)1) << 127) -
(((p224_widelimb)1) << 71) -
(((p224_widelimb)1) << 55);
p224_widelimb output[5];
// Add 0 mod 2^224-2^96+1 to ensure all differences are positive
output[0] = in[0] + two127p15;
output[1] = in[1] + two127m71m55;
output[2] = in[2] + two127m71;
output[3] = in[3];
output[4] = in[4];
// Eliminate in[4], in[5], in[6]
output[4] += in[6] >> 16;
output[3] += (in[6] & 0xffff) << 40;
output[2] -= in[6];
output[3] += in[5] >> 16;
output[2] += (in[5] & 0xffff) << 40;
output[1] -= in[5];
output[2] += output[4] >> 16;
output[1] += (output[4] & 0xffff) << 40;
output[0] -= output[4];
// Carry 2 -> 3 -> 4
output[3] += output[2] >> 56;
output[2] &= 0x00ffffffffffffff;
output[4] = output[3] >> 56;
output[3] &= 0x00ffffffffffffff;
// Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72
// Eliminate output[4]
output[2] += output[4] >> 16;
// output[2] < 2^56 + 2^56 = 2^57
output[1] += (output[4] & 0xffff) << 40;
output[0] -= output[4];
// Carry 0 -> 1 -> 2 -> 3
output[1] += output[0] >> 56;
out[0] = output[0] & 0x00ffffffffffffff;
output[2] += output[1] >> 56;
// output[2] < 2^57 + 2^72
out[1] = output[1] & 0x00ffffffffffffff;
output[3] += output[2] >> 56;
// output[3] <= 2^56 + 2^16
out[2] = output[2] & 0x00ffffffffffffff;
// out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
// out[3] <= 2^56 + 2^16 (due to final carry),
// so out < 2*p
out[3] = output[3];
}
// Get negative value: out = -in
// Requires in[i] < 2^63,
// ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16
static void p224_felem_neg(p224_felem out, const p224_felem in) {
p224_widefelem tmp = {0};
p224_felem_diff_128_64(tmp, in);
p224_felem_reduce(out, tmp);
}
// Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field
// elements are reduced to in < 2^225, so we only need to check three cases: 0,
// 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2
static p224_limb p224_felem_is_zero(const p224_felem in) {
p224_limb zero = in[0] | in[1] | in[2] | in[3];
zero = (((int64_t)(zero)-1) >> 63) & 1;
p224_limb two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000) |
(in[2] ^ 0x00ffffffffffffff) |
(in[3] ^ 0x00ffffffffffffff);
two224m96p1 = (((int64_t)(two224m96p1)-1) >> 63) & 1;
p224_limb two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000) |
(in[2] ^ 0x00ffffffffffffff) |
(in[3] ^ 0x01ffffffffffffff);
two225m97p2 = (((int64_t)(two225m97p2)-1) >> 63) & 1;
return (zero | two224m96p1 | two225m97p2);
}
// Invert a field element
// Computation chain copied from djb's code
static void p224_felem_inv(p224_felem out, const p224_felem in) {
p224_felem ftmp, ftmp2, ftmp3, ftmp4;
p224_widefelem tmp;
p224_felem_square(tmp, in);
p224_felem_reduce(ftmp, tmp); // 2
p224_felem_mul(tmp, in, ftmp);
p224_felem_reduce(ftmp, tmp); // 2^2 - 1
p224_felem_square(tmp, ftmp);
p224_felem_reduce(ftmp, tmp); // 2^3 - 2
p224_felem_mul(tmp, in, ftmp);
p224_felem_reduce(ftmp, tmp); // 2^3 - 1
p224_felem_square(tmp, ftmp);
p224_felem_reduce(ftmp2, tmp); // 2^4 - 2
p224_felem_square(tmp, ftmp2);
p224_felem_reduce(ftmp2, tmp); // 2^5 - 4
p224_felem_square(tmp, ftmp2);
p224_felem_reduce(ftmp2, tmp); // 2^6 - 8
p224_felem_mul(tmp, ftmp2, ftmp);
p224_felem_reduce(ftmp, tmp); // 2^6 - 1
p224_felem_square(tmp, ftmp);
p224_felem_reduce(ftmp2, tmp); // 2^7 - 2
for (size_t i = 0; i < 5; ++i) { // 2^12 - 2^6
p224_felem_square(tmp, ftmp2);
p224_felem_reduce(ftmp2, tmp);
}
p224_felem_mul(tmp, ftmp2, ftmp);
p224_felem_reduce(ftmp2, tmp); // 2^12 - 1
p224_felem_square(tmp, ftmp2);
p224_felem_reduce(ftmp3, tmp); // 2^13 - 2
for (size_t i = 0; i < 11; ++i) { // 2^24 - 2^12
p224_felem_square(tmp, ftmp3);
p224_felem_reduce(ftmp3, tmp);
}
p224_felem_mul(tmp, ftmp3, ftmp2);
p224_felem_reduce(ftmp2, tmp); // 2^24 - 1
p224_felem_square(tmp, ftmp2);
p224_felem_reduce(ftmp3, tmp); // 2^25 - 2
for (size_t i = 0; i < 23; ++i) { // 2^48 - 2^24
p224_felem_square(tmp, ftmp3);
p224_felem_reduce(ftmp3, tmp);
}
p224_felem_mul(tmp, ftmp3, ftmp2);
p224_felem_reduce(ftmp3, tmp); // 2^48 - 1
p224_felem_square(tmp, ftmp3);
p224_felem_reduce(ftmp4, tmp); // 2^49 - 2
for (size_t i = 0; i < 47; ++i) { // 2^96 - 2^48
p224_felem_square(tmp, ftmp4);
p224_felem_reduce(ftmp4, tmp);
}
p224_felem_mul(tmp, ftmp3, ftmp4);
p224_felem_reduce(ftmp3, tmp); // 2^96 - 1
p224_felem_square(tmp, ftmp3);
p224_felem_reduce(ftmp4, tmp); // 2^97 - 2
for (size_t i = 0; i < 23; ++i) { // 2^120 - 2^24
p224_felem_square(tmp, ftmp4);
p224_felem_reduce(ftmp4, tmp);
}
p224_felem_mul(tmp, ftmp2, ftmp4);
p224_felem_reduce(ftmp2, tmp); // 2^120 - 1
for (size_t i = 0; i < 6; ++i) { // 2^126 - 2^6
p224_felem_square(tmp, ftmp2);
p224_felem_reduce(ftmp2, tmp);
}
p224_felem_mul(tmp, ftmp2, ftmp);
p224_felem_reduce(ftmp, tmp); // 2^126 - 1
p224_felem_square(tmp, ftmp);
p224_felem_reduce(ftmp, tmp); // 2^127 - 2
p224_felem_mul(tmp, ftmp, in);
p224_felem_reduce(ftmp, tmp); // 2^127 - 1
for (size_t i = 0; i < 97; ++i) { // 2^224 - 2^97
p224_felem_square(tmp, ftmp);
p224_felem_reduce(ftmp, tmp);
}
p224_felem_mul(tmp, ftmp, ftmp3);
p224_felem_reduce(out, tmp); // 2^224 - 2^96 - 1
}
// Copy in constant time:
// if icopy == 1, copy in to out,
// if icopy == 0, copy out to itself.
static void p224_copy_conditional(p224_felem out, const p224_felem in,
p224_limb icopy) {
// icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one
const p224_limb copy = -icopy;
for (size_t i = 0; i < 4; ++i) {
const p224_limb tmp = copy & (in[i] ^ out[i]);
out[i] ^= tmp;
}
}
// ELLIPTIC CURVE POINT OPERATIONS
//
// Points are represented in Jacobian projective coordinates:
// (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
// or to the point at infinity if Z == 0.
// Double an elliptic curve point:
// (X', Y', Z') = 2 * (X, Y, Z), where
// X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
// Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
// Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
// while x_out == y_in is not (maybe this works, but it's not tested).
static void p224_point_double(p224_felem x_out, p224_felem y_out,
p224_felem z_out, const p224_felem x_in,
const p224_felem y_in, const p224_felem z_in) {
p224_widefelem tmp, tmp2;
p224_felem delta, gamma, beta, alpha, ftmp, ftmp2;
p224_felem_assign(ftmp, x_in);
p224_felem_assign(ftmp2, x_in);
// delta = z^2
p224_felem_square(tmp, z_in);
p224_felem_reduce(delta, tmp);
// gamma = y^2
p224_felem_square(tmp, y_in);
p224_felem_reduce(gamma, tmp);
// beta = x*gamma
p224_felem_mul(tmp, x_in, gamma);
p224_felem_reduce(beta, tmp);
// alpha = 3*(x-delta)*(x+delta)
p224_felem_diff(ftmp, delta);
// ftmp[i] < 2^57 + 2^58 + 2 < 2^59
p224_felem_sum(ftmp2, delta);
// ftmp2[i] < 2^57 + 2^57 = 2^58
p224_felem_scalar(ftmp2, 3);
// ftmp2[i] < 3 * 2^58 < 2^60
p224_felem_mul(tmp, ftmp, ftmp2);
// tmp[i] < 2^60 * 2^59 * 4 = 2^121
p224_felem_reduce(alpha, tmp);
// x' = alpha^2 - 8*beta
p224_felem_square(tmp, alpha);
// tmp[i] < 4 * 2^57 * 2^57 = 2^116
p224_felem_assign(ftmp, beta);
p224_felem_scalar(ftmp, 8);
// ftmp[i] < 8 * 2^57 = 2^60
p224_felem_diff_128_64(tmp, ftmp);
// tmp[i] < 2^116 + 2^64 + 8 < 2^117
p224_felem_reduce(x_out, tmp);
// z' = (y + z)^2 - gamma - delta
p224_felem_sum(delta, gamma);
// delta[i] < 2^57 + 2^57 = 2^58
p224_felem_assign(ftmp, y_in);
p224_felem_sum(ftmp, z_in);
// ftmp[i] < 2^57 + 2^57 = 2^58
p224_felem_square(tmp, ftmp);
// tmp[i] < 4 * 2^58 * 2^58 = 2^118
p224_felem_diff_128_64(tmp, delta);
// tmp[i] < 2^118 + 2^64 + 8 < 2^119
p224_felem_reduce(z_out, tmp);
// y' = alpha*(4*beta - x') - 8*gamma^2
p224_felem_scalar(beta, 4);
// beta[i] < 4 * 2^57 = 2^59
p224_felem_diff(beta, x_out);
// beta[i] < 2^59 + 2^58 + 2 < 2^60
p224_felem_mul(tmp, alpha, beta);
// tmp[i] < 4 * 2^57 * 2^60 = 2^119
p224_felem_square(tmp2, gamma);
// tmp2[i] < 4 * 2^57 * 2^57 = 2^116
p224_widefelem_scalar(tmp2, 8);
// tmp2[i] < 8 * 2^116 = 2^119
p224_widefelem_diff(tmp, tmp2);
// tmp[i] < 2^119 + 2^120 < 2^121
p224_felem_reduce(y_out, tmp);
}
// Add two elliptic curve points:
// (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
// X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
// 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
// Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 *
// X_1)^2 - X_3) -
// Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
// Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
//
// This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
// This function is not entirely constant-time: it includes a branch for
// checking whether the two input points are equal, (while not equal to the
// point at infinity). This case never happens during single point
// multiplication, so there is no timing leak for ECDH or ECDSA signing.
static void p224_point_add(p224_felem x3, p224_felem y3, p224_felem z3,
const p224_felem x1, const p224_felem y1,
const p224_felem z1, const int mixed,
const p224_felem x2, const p224_felem y2,
const p224_felem z2) {
p224_felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
p224_widefelem tmp, tmp2;
p224_limb z1_is_zero, z2_is_zero, x_equal, y_equal;
if (!mixed) {
// ftmp2 = z2^2
p224_felem_square(tmp, z2);
p224_felem_reduce(ftmp2, tmp);
// ftmp4 = z2^3
p224_felem_mul(tmp, ftmp2, z2);
p224_felem_reduce(ftmp4, tmp);
// ftmp4 = z2^3*y1
p224_felem_mul(tmp2, ftmp4, y1);
p224_felem_reduce(ftmp4, tmp2);
// ftmp2 = z2^2*x1
p224_felem_mul(tmp2, ftmp2, x1);
p224_felem_reduce(ftmp2, tmp2);
} else {
// We'll assume z2 = 1 (special case z2 = 0 is handled later)
// ftmp4 = z2^3*y1
p224_felem_assign(ftmp4, y1);
// ftmp2 = z2^2*x1
p224_felem_assign(ftmp2, x1);
}
// ftmp = z1^2
p224_felem_square(tmp, z1);
p224_felem_reduce(ftmp, tmp);
// ftmp3 = z1^3
p224_felem_mul(tmp, ftmp, z1);
p224_felem_reduce(ftmp3, tmp);
// tmp = z1^3*y2
p224_felem_mul(tmp, ftmp3, y2);
// tmp[i] < 4 * 2^57 * 2^57 = 2^116
// ftmp3 = z1^3*y2 - z2^3*y1
p224_felem_diff_128_64(tmp, ftmp4);
// tmp[i] < 2^116 + 2^64 + 8 < 2^117
p224_felem_reduce(ftmp3, tmp);
// tmp = z1^2*x2
p224_felem_mul(tmp, ftmp, x2);
// tmp[i] < 4 * 2^57 * 2^57 = 2^116
// ftmp = z1^2*x2 - z2^2*x1
p224_felem_diff_128_64(tmp, ftmp2);
// tmp[i] < 2^116 + 2^64 + 8 < 2^117
p224_felem_reduce(ftmp, tmp);
// the formulae are incorrect if the points are equal
// so we check for this and do doubling if this happens
x_equal = p224_felem_is_zero(ftmp);
y_equal = p224_felem_is_zero(ftmp3);
z1_is_zero = p224_felem_is_zero(z1);
z2_is_zero = p224_felem_is_zero(z2);
// In affine coordinates, (X_1, Y_1) == (X_2, Y_2)
if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
p224_point_double(x3, y3, z3, x1, y1, z1);
return;
}
// ftmp5 = z1*z2
if (!mixed) {
p224_felem_mul(tmp, z1, z2);
p224_felem_reduce(ftmp5, tmp);
} else {
// special case z2 = 0 is handled later
p224_felem_assign(ftmp5, z1);
}
// z_out = (z1^2*x2 - z2^2*x1)*(z1*z2)
p224_felem_mul(tmp, ftmp, ftmp5);
p224_felem_reduce(z_out, tmp);
// ftmp = (z1^2*x2 - z2^2*x1)^2
p224_felem_assign(ftmp5, ftmp);
p224_felem_square(tmp, ftmp);
p224_felem_reduce(ftmp, tmp);
// ftmp5 = (z1^2*x2 - z2^2*x1)^3
p224_felem_mul(tmp, ftmp, ftmp5);
p224_felem_reduce(ftmp5, tmp);
// ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2
p224_felem_mul(tmp, ftmp2, ftmp);
p224_felem_reduce(ftmp2, tmp);
// tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3
p224_felem_mul(tmp, ftmp4, ftmp5);
// tmp[i] < 4 * 2^57 * 2^57 = 2^116
// tmp2 = (z1^3*y2 - z2^3*y1)^2
p224_felem_square(tmp2, ftmp3);
// tmp2[i] < 4 * 2^57 * 2^57 < 2^116
// tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3
p224_felem_diff_128_64(tmp2, ftmp5);
// tmp2[i] < 2^116 + 2^64 + 8 < 2^117
// ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2
p224_felem_assign(ftmp5, ftmp2);
p224_felem_scalar(ftmp5, 2);
// ftmp5[i] < 2 * 2^57 = 2^58
/* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
p224_felem_diff_128_64(tmp2, ftmp5);
// tmp2[i] < 2^117 + 2^64 + 8 < 2^118
p224_felem_reduce(x_out, tmp2);
// ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out
p224_felem_diff(ftmp2, x_out);
// ftmp2[i] < 2^57 + 2^58 + 2 < 2^59
// tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out)
p224_felem_mul(tmp2, ftmp3, ftmp2);
// tmp2[i] < 4 * 2^57 * 2^59 = 2^118
/* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
p224_widefelem_diff(tmp2, tmp);
// tmp2[i] < 2^118 + 2^120 < 2^121
p224_felem_reduce(y_out, tmp2);
// the result (x_out, y_out, z_out) is incorrect if one of the inputs is
// the point at infinity, so we need to check for this separately
// if point 1 is at infinity, copy point 2 to output, and vice versa
p224_copy_conditional(x_out, x2, z1_is_zero);
p224_copy_conditional(x_out, x1, z2_is_zero);
p224_copy_conditional(y_out, y2, z1_is_zero);
p224_copy_conditional(y_out, y1, z2_is_zero);
p224_copy_conditional(z_out, z2, z1_is_zero);
p224_copy_conditional(z_out, z1, z2_is_zero);
p224_felem_assign(x3, x_out);
p224_felem_assign(y3, y_out);
p224_felem_assign(z3, z_out);
}
// p224_select_point selects the |idx|th point from a precomputation table and
// copies it to out.
static void p224_select_point(const uint64_t idx, size_t size,
const p224_felem pre_comp[/*size*/][3],
p224_felem out[3]) {
p224_limb *outlimbs = &out[0][0];
OPENSSL_memset(outlimbs, 0, 3 * sizeof(p224_felem));
for (size_t i = 0; i < size; i++) {
const p224_limb *inlimbs = &pre_comp[i][0][0];
uint64_t mask = i ^ idx;
mask |= mask >> 4;
mask |= mask >> 2;
mask |= mask >> 1;
mask &= 1;
mask--;
for (size_t j = 0; j < 4 * 3; j++) {
outlimbs[j] |= inlimbs[j] & mask;
}
}
}
// p224_get_bit returns the |i|th bit in |in|
static char p224_get_bit(const p224_felem_bytearray in, size_t i) {
if (i >= 224) {
return 0;
}
return (in[i >> 3] >> (i & 7)) & 1;
}
// Interleaved point multiplication using precomputed point multiples:
// The small point multiples 0*P, 1*P, ..., 16*P are in p_pre_comp, the scalars
// in p_scalar, if non-NULL. If g_scalar is non-NULL, we also add this multiple
// of the generator, using certain (large) precomputed multiples in
// g_p224_pre_comp. Output point (X, Y, Z) is stored in x_out, y_out, z_out
static void p224_batch_mul(p224_felem x_out, p224_felem y_out, p224_felem z_out,
const uint8_t *p_scalar, const uint8_t *g_scalar,
const p224_felem p_pre_comp[17][3]) {
p224_felem nq[3], tmp[4];
uint64_t bits;
uint8_t sign, digit;
// set nq to the point at infinity
OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem));
// Loop over both scalars msb-to-lsb, interleaving additions of multiples of
// the generator (two in each of the last 28 rounds) and additions of p (every
// 5th round).
int skip = 1; // save two point operations in the first round
size_t i = p_scalar != NULL ? 220 : 27;
for (;;) {
// double
if (!skip) {
p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
}
// add multiples of the generator
if (g_scalar != NULL && i <= 27) {
// first, look 28 bits upwards
bits = p224_get_bit(g_scalar, i + 196) << 3;
bits |= p224_get_bit(g_scalar, i + 140) << 2;
bits |= p224_get_bit(g_scalar, i + 84) << 1;
bits |= p224_get_bit(g_scalar, i + 28);
// select the point to add, in constant time
p224_select_point(bits, 16, g_p224_pre_comp[1], tmp);
if (!skip) {
p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
tmp[0], tmp[1], tmp[2]);
} else {
OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
skip = 0;
}
// second, look at the current position
bits = p224_get_bit(g_scalar, i + 168) << 3;
bits |= p224_get_bit(g_scalar, i + 112) << 2;
bits |= p224_get_bit(g_scalar, i + 56) << 1;
bits |= p224_get_bit(g_scalar, i);
// select the point to add, in constant time
p224_select_point(bits, 16, g_p224_pre_comp[0], tmp);
p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
tmp[0], tmp[1], tmp[2]);
}
// do other additions every 5 doublings
if (p_scalar != NULL && i % 5 == 0) {
bits = p224_get_bit(p_scalar, i + 4) << 5;
bits |= p224_get_bit(p_scalar, i + 3) << 4;
bits |= p224_get_bit(p_scalar, i + 2) << 3;
bits |= p224_get_bit(p_scalar, i + 1) << 2;
bits |= p224_get_bit(p_scalar, i) << 1;
bits |= p224_get_bit(p_scalar, i - 1);
ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
// select the point to add or subtract
p224_select_point(digit, 17, p_pre_comp, tmp);
p224_felem_neg(tmp[3], tmp[1]); // (X, -Y, Z) is the negative point
p224_copy_conditional(tmp[1], tmp[3], sign);
if (!skip) {
p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
tmp[0], tmp[1], tmp[2]);
} else {
OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
skip = 0;
}
}
if (i == 0) {
break;
}
--i;
}
p224_felem_assign(x_out, nq[0]);
p224_felem_assign(y_out, nq[1]);
p224_felem_assign(z_out, nq[2]);
}
// Takes the Jacobian coordinates (X, Y, Z) of a point and returns
// (X', Y') = (X/Z^2, Y/Z^3)
static int ec_GFp_nistp224_point_get_affine_coordinates(
const EC_GROUP *group, const EC_RAW_POINT *point, EC_FELEM *x,
EC_FELEM *y) {
if (ec_GFp_simple_is_at_infinity(group, point)) {
OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
return 0;
}
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_felem z1, z2;
p224_widefelem tmp;
p224_generic_to_felem(z1, &point->Z);
p224_felem_inv(z2, z1);
p224_felem_square(tmp, z2);
p224_felem_reduce(z1, tmp);
if (x != NULL) {
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_felem x_in, x_out;
p224_generic_to_felem(x_in, &point->X);
p224_felem_mul(tmp, x_in, z1);
p224_felem_reduce(x_out, tmp);
p224_felem_to_generic(x, x_out);
}
if (y != NULL) {
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_felem y_in, y_out;
p224_generic_to_felem(y_in, &point->Y);
p224_felem_mul(tmp, z1, z2);
p224_felem_reduce(z1, tmp);
p224_felem_mul(tmp, y_in, z1);
p224_felem_reduce(y_out, tmp);
p224_felem_to_generic(y, y_out);
}
return 1;
}
static void ec_GFp_nistp224_add(const EC_GROUP *group, EC_RAW_POINT *r,
const EC_RAW_POINT *a, const EC_RAW_POINT *b) {
p224_felem x1, y1, z1, x2, y2, z2;
p224_generic_to_felem(x1, &a->X);
p224_generic_to_felem(y1, &a->Y);
p224_generic_to_felem(z1, &a->Z);
p224_generic_to_felem(x2, &b->X);
p224_generic_to_felem(y2, &b->Y);
p224_generic_to_felem(z2, &b->Z);
p224_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, z2);
// The outputs are already reduced, but still need to be contracted.
p224_felem_to_generic(&r->X, x1);
p224_felem_to_generic(&r->Y, y1);
p224_felem_to_generic(&r->Z, z1);
}
static void ec_GFp_nistp224_dbl(const EC_GROUP *group, EC_RAW_POINT *r,
const EC_RAW_POINT *a) {
p224_felem x, y, z;
p224_generic_to_felem(x, &a->X);
p224_generic_to_felem(y, &a->Y);
p224_generic_to_felem(z, &a->Z);
p224_point_double(x, y, z, x, y, z);
// The outputs are already reduced, but still need to be contracted.
p224_felem_to_generic(&r->X, x);
p224_felem_to_generic(&r->Y, y);
p224_felem_to_generic(&r->Z, z);
}
static void ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
const EC_SCALAR *g_scalar,
const EC_RAW_POINT *p,
const EC_SCALAR *p_scalar) {
p224_felem p_pre_comp[17][3];
p224_felem x_out, y_out, z_out;
if (p != NULL && p_scalar != NULL) {
// We treat NULL scalars as 0, and NULL points as points at infinity, i.e.,
// they contribute nothing to the linear combination.
OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
// precompute multiples
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_generic_to_felem(x_out, &p->X);
p224_generic_to_felem(y_out, &p->Y);
p224_generic_to_felem(z_out, &p->Z);
p224_felem_assign(p_pre_comp[1][0], x_out);
p224_felem_assign(p_pre_comp[1][1], y_out);
p224_felem_assign(p_pre_comp[1][2], z_out);
for (size_t j = 2; j <= 16; ++j) {
if (j & 1) {
p224_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], 0,
p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
p_pre_comp[j - 1][2]);
} else {
p224_point_double(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
p_pre_comp[j / 2][0], p_pre_comp[j / 2][1],
p_pre_comp[j / 2][2]);
}
}
}
Make ECDSA signing 10% faster and plug some timing leaks. None of the asymmetric crypto we inherented from OpenSSL is constant-time because of BIGNUM. BIGNUM chops leading zeros off the front of everything, so we end up leaking information about the first word, in theory. BIGNUM functions additionally tend to take the full range of inputs and then call into BN_nnmod at various points. All our secret values should be acted on in constant-time, but k in ECDSA is a particularly sensitive value. So, ecdsa_sign_setup, in an attempt to mitigate the BIGNUM leaks, would add a couple copies of the order. This does not work at all. k is used to compute two values: k^-1 and kG. The first operation when computing k^-1 is to call BN_nnmod if k is out of range. The entry point to our tuned constant-time curve implementations is to call BN_nnmod if the scalar has too many bits, which this causes. The result is both corrections are immediately undone but cause us to do more variable-time work in the meantime. Replace all these computations around k with the word-based functions added in the various preceding CLs. In doing so, replace the BN_mod_mul calls (which internally call BN_nnmod) with Montgomery reduction. We can avoid taking k^-1 out of Montgomery form, which combines nicely with Brian Smith's trick in 3426d1011946b26ff1bb2fd98a081ba4753c9cc8. Along the way, we avoid some unnecessary mallocs. BIGNUM still affects the private key itself, as well as the EC_POINTs. But this should hopefully be much better now. Also it's 10% faster: Before: Did 15000 ECDSA P-224 signing operations in 1069117us (14030.3 ops/sec) Did 18000 ECDSA P-256 signing operations in 1053908us (17079.3 ops/sec) Did 1078 ECDSA P-384 signing operations in 1087853us (990.9 ops/sec) Did 473 ECDSA P-521 signing operations in 1069835us (442.1 ops/sec) After: Did 16000 ECDSA P-224 signing operations in 1064799us (15026.3 ops/sec) Did 19000 ECDSA P-256 signing operations in 1007839us (18852.2 ops/sec) Did 1078 ECDSA P-384 signing operations in 1079413us (998.7 ops/sec) Did 484 ECDSA P-521 signing operations in 1083616us (446.7 ops/sec) Change-Id: I2a25e90fc99dac13c0616d0ea45e125a4bd8cca1 Reviewed-on: https://boringssl-review.googlesource.com/23075 Reviewed-by: Adam Langley <agl@google.com>
2017-11-13 03:58:00 +00:00
p224_batch_mul(x_out, y_out, z_out,
(p != NULL && p_scalar != NULL) ? p_scalar->bytes : NULL,
g_scalar != NULL ? g_scalar->bytes : NULL,
(const p224_felem(*)[3])p_pre_comp);
// reduce the output to its unique minimal representation
p224_felem_to_generic(&r->X, x_out);
p224_felem_to_generic(&r->Y, y_out);
p224_felem_to_generic(&r->Z, z_out);
}
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
static void ec_GFp_nistp224_felem_mul(const EC_GROUP *group, EC_FELEM *r,
const EC_FELEM *a, const EC_FELEM *b) {
p224_felem felem1, felem2;
p224_widefelem wide;
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_generic_to_felem(felem1, a);
p224_generic_to_felem(felem2, b);
p224_felem_mul(wide, felem1, felem2);
p224_felem_reduce(felem1, wide);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_felem_to_generic(r, felem1);
}
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
static void ec_GFp_nistp224_felem_sqr(const EC_GROUP *group, EC_FELEM *r,
const EC_FELEM *a) {
p224_felem felem;
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_generic_to_felem(felem, a);
p224_widefelem wide;
p224_felem_square(wide, felem);
p224_felem_reduce(felem, wide);
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
p224_felem_to_generic(r, felem);
}
static int ec_GFp_nistp224_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out,
const BIGNUM *in) {
return bn_copy_words(out->words, group->field.width, in);
}
static int ec_GFp_nistp224_felem_to_bignum(const EC_GROUP *group, BIGNUM *out,
const EC_FELEM *in) {
return bn_set_words(out, in->words, group->field.width);
}
DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp224_method) {
out->group_init = ec_GFp_simple_group_init;
out->group_finish = ec_GFp_simple_group_finish;
out->group_set_curve = ec_GFp_simple_group_set_curve;
out->point_get_affine_coordinates =
ec_GFp_nistp224_point_get_affine_coordinates;
out->add = ec_GFp_nistp224_add;
out->dbl = ec_GFp_nistp224_dbl;
out->mul = ec_GFp_nistp224_points_mul;
ec/p256.c: fiat-crypto field arithmetic (64, 32) The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?) 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from varibale-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
out->mul_public = ec_GFp_nistp224_points_mul;
Add EC_FELEM for EC_POINTs and related temporaries. This introduces EC_FELEM, which is analogous to EC_SCALAR. It is used for EC_POINT's representation in the generic EC_METHOD, as well as random operations on tuned EC_METHODs that still are implemented genericly. Unlike EC_SCALAR, EC_FELEM's exact representation is awkwardly specific to the EC_METHOD, analogous to how the old values were BIGNUMs but may or may not have been in Montgomery form. This is kind of a nuisance, but no more than before. (If p224-64.c were easily convertable to Montgomery form, we could say |EC_FELEM| is always in Montgomery form. If we exposed the internal add and double implementations in each of the curves, we could give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is purely a |EC_GFp_mont_method| type. I'll leave this for later.) The generic add and doubling formulas are aligned with the formulas proved in fiat-crypto. Those only applied to a = -3, so I've proved a generic one in https://github.com/mit-plv/fiat-crypto/pull/356, in case someone uses a custom curve. The new formulas are verified, constant-time, and swap a multiply for a square. As expressed in fiat-crypto they do use more temporaries, but this seems to be fine with stack-allocated EC_FELEMs. (We can try to help the compiler later, but benchamrks below suggest this isn't necessary.) Unlike BIGNUM, EC_FELEM can be stack-allocated. It also captures the bounds in the type system and, in particular, that the width is correct, which will make it easier to select a point in constant-time in the future. (Indeed the old code did not always have the correct width. Its point formula involved halving and implemented this in variable time and variable width.) Before: Did 77274 ECDH P-256 operations in 10046087us (7692.0 ops/sec) Did 5959 ECDH P-384 operations in 10031701us (594.0 ops/sec) Did 10815 ECDSA P-384 signing operations in 10087892us (1072.1 ops/sec) Did 8976 ECDSA P-384 verify operations in 10071038us (891.3 ops/sec) Did 2600 ECDH P-521 operations in 10091688us (257.6 ops/sec) Did 4590 ECDSA P-521 signing operations in 10055195us (456.5 ops/sec) Did 3811 ECDSA P-521 verify operations in 10003574us (381.0 ops/sec) After: Did 77736 ECDH P-256 operations in 10029858us (7750.5 ops/sec) [+0.8%] Did 7519 ECDH P-384 operations in 10068076us (746.8 ops/sec) [+25.7%] Did 13335 ECDSA P-384 signing operations in 10029962us (1329.5 ops/sec) [+24.0%] Did 11021 ECDSA P-384 verify operations in 10088600us (1092.4 ops/sec) [+22.6%] Did 2912 ECDH P-521 operations in 10001325us (291.2 ops/sec) [+13.0%] Did 5150 ECDSA P-521 signing operations in 10027462us (513.6 ops/sec) [+12.5%] Did 4264 ECDSA P-521 verify operations in 10069694us (423.4 ops/sec) [+11.1%] This more than pays for removing points_make_affine previously and even speeds up ECDH P-256 slightly. (The point-on-curve check uses the generic code.) Next is to push the stack-allocating up to ec_wNAF_mul, followed by a constant-time single-point multiplication. Bug: 239 Change-Id: I44a2dff7c52522e491d0f8cffff64c4ab5cd353c Reviewed-on: https://boringssl-review.googlesource.com/27668 Reviewed-by: Adam Langley <agl@google.com>
2018-04-23 02:39:34 +01:00
out->felem_mul = ec_GFp_nistp224_felem_mul;
out->felem_sqr = ec_GFp_nistp224_felem_sqr;
out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem;
out->felem_to_bignum = ec_GFp_nistp224_felem_to_bignum;
out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
}
#endif // BORINGSSL_HAS_UINT128 && !SMALL