From b8f14b7d5357f7a3490bb66d3dc9d83671ae5908 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 6 Apr 2018 19:58:46 -0400 Subject: [PATCH] Add dedicated scalar inversion code to p256-x86_64.c. This is adapted from upstream's eb7916960bf50f436593abe3d5f2e0592d291017. This gives a 22% win for ECDSA signing. (Upstream cites 30-40%, but they are unnecessarily using BN_mod_exp_mont_consttime in their generic path. The exponent is public. I expect part of their 30-40% is just offsetting this.) Did 506000 ECDSA P-256 signing operations in 25044595us (20204.0 ops/sec) Did 170506 ECDSA P-256 verify operations in 25033567us (6811.1 ops/sec) Did 618000 ECDSA P-256 signing operations in 25031294us (24689.1 ops/sec) Did 182240 ECDSA P-256 verify operations in 25006918us (7287.6 ops/sec) Most of the performance win appears to be from the assembly operations and not the addition chain. I have a CL to graft the addition chain onto the C implementation, but it did not show measurable improvement in ECDSA verify. ECDSA sign gets 2-4% faster, but we're more concerned about ECDSA verify in the OPENSSL_SMALL builds. Change-Id: Ide166f98b146c025f7f80ed7906336c16818540a Reviewed-on: https://boringssl-review.googlesource.com/27593 Reviewed-by: Adam Langley --- crypto/fipsmodule/ec/p256-x86_64.c | 83 +++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c index 459f2249..d8d3a399 100644 --- a/crypto/fipsmodule/ec/p256-x86_64.c +++ b/crypto/fipsmodule/ec/p256-x86_64.c @@ -430,6 +430,87 @@ static int ecp_nistz256_get_affine(const EC_GROUP *group, const EC_POINT *point, return 1; } +static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out, + const EC_SCALAR *in) { + // Computes |out| = |in|^(order-2), the inverse of |in| modulo the group order, using a fixed addition chain. table[i] stores a power of |in| corresponding to the matching enum value. + enum { + // The following indices specify the power in binary. 
+ i_1 = 0, + i_10, + i_11, + i_101, + i_111, + i_1010, + i_1111, + i_10101, + i_101010, + i_101111, + // The following indices specify 2^N-1, or N ones in a row. + i_x6, + i_x8, + i_x16, + i_x32 + }; + BN_ULONG table[15][P256_LIMBS]; + + // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion + // + // Even though this code path spares 12 squarings, 4.5%, and 13 + // multiplications, 25%, the overall sign operation is not that much faster, + // not more than 2%. Most of the performance of this function comes from the + // scalar operations. + + // Pre-calculate powers. Each table entry is built from earlier entries by squaring (doubling the exponent) and multiplying (adding exponents). + OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG)); + + ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); + + ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); + + ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); + + ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); + + ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); + + ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); + + ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); + ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); + + ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); + + ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); + + ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); + + ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); + ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); + + ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); + ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); + + ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); + ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); + + // Compute |in| raised to the order-2. Start with x32 squared 64 times and multiplied by x32; each kChain entry {p, i} then squares the accumulator p times and multiplies it by table[i]. 
+ ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64); + ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]); + static const struct { + uint8_t p, i; + } kChain[27] = {{32, i_x32}, {6, i_101111}, {5, i_111}, {4, i_11}, + {5, i_1111}, {5, i_10101}, {4, i_101}, {3, i_101}, + {3, i_101}, {5, i_111}, {9, i_101111}, {6, i_1111}, + {2, i_1}, {5, i_1}, {6, i_1111}, {5, i_111}, + {4, i_111}, {5, i_111}, {5, i_101}, {3, i_11}, + {10, i_101111}, {2, i_11}, {5, i_11}, {5, i_11}, + {3, i_1}, {7, i_10101}, {6, i_1111}}; + for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kChain); i++) { + ecp_nistz256_ord_sqr_mont(out->words, out->words, kChain[i].p); + ecp_nistz256_ord_mul_mont(out->words, out->words, table[kChain[i].i]); + } +} + DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) { out->group_init = ec_GFp_mont_group_init; out->group_finish = ec_GFp_mont_group_finish; @@ -441,7 +522,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) { out->field_sqr = ec_GFp_mont_field_sqr; out->field_encode = ec_GFp_mont_field_encode; out->field_decode = ec_GFp_mont_field_decode; - out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery; + out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord; }; #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \