From b8f14b7d5357f7a3490bb66d3dc9d83671ae5908 Mon Sep 17 00:00:00 2001
From: David Benjamin <davidben@google.com>
Date: Fri, 6 Apr 2018 19:58:46 -0400
Subject: [PATCH] Add dedicated scalar inversion code to p256-x86_64.c.

This is adapted from upstream's
eb7916960bf50f436593abe3d5f2e0592d291017.

This gives a 22% win for ECDSA signing. (Upstream cites 30-40%, but they
are unnecessarily using BN_mod_exp_mont_consttime in their generic path.
The exponent is public. I expect part of their 30-40% is just offsetting
this.)

Did 506000 ECDSA P-256 signing operations in 25044595us (20204.0 ops/sec)
Did 170506 ECDSA P-256 verify operations in 25033567us (6811.1 ops/sec)

Did 618000 ECDSA P-256 signing operations in 25031294us (24689.1 ops/sec)
Did 182240 ECDSA P-256 verify operations in 25006918us (7287.6 ops/sec)

Most of the performance win appears to be from the assembly operations
and not the addition chain. I have a CL to graft the addition chain onto
the C implementation, but it did not show measurable improvement in
ECDSA verify. ECDSA sign gets 2-4% faster, but we're more concerned
about ECDSA verify in the OPENSSL_SMALL builds.

Change-Id: Ide166f98b146c025f7f80ed7906336c16818540a
Reviewed-on: https://boringssl-review.googlesource.com/27593
Reviewed-by: Adam Langley <alangley@gmail.com>
---
 crypto/fipsmodule/ec/p256-x86_64.c | 83 +++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/crypto/fipsmodule/ec/p256-x86_64.c b/crypto/fipsmodule/ec/p256-x86_64.c
index 459f2249..d8d3a399 100644
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -430,6 +430,87 @@ static int ecp_nistz256_get_affine(const EC_GROUP *group, const EC_POINT *point,
   return 1;
 }
 
+static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
+                                     const EC_SCALAR *in) {
+  // table[i] stores a power of |in| corresponding to the matching enum value.
+  enum {
+    // The following indices specify the power in binary.
+    i_1 = 0,
+    i_10,
+    i_11,
+    i_101,
+    i_111,
+    i_1010,
+    i_1111,
+    i_10101,
+    i_101010,
+    i_101111,
+    // The following indices specify 2^N-1, or N ones in a row.
+    i_x6,
+    i_x8,
+    i_x16,
+    i_x32
+  };
+  BN_ULONG table[15][P256_LIMBS];
+
+  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+  //
+  // Even though this code path spares 12 squarings, 4.5%, and 13
+  // multiplications, 25%, the overall sign operation is not that much faster,
+  // not more that 2%. Most of the performance of this function comes from the
+  // scalar operations.
+
+  // Pre-calculate powers.
+  OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG));
+
+  ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
+
+  ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
+
+  ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1);
+  ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
+
+  ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2);
+  ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8);
+  ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16);
+  ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
+
+  // Compute |in| raised to the order-2.
+  ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64);
+  ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]);
+  static const struct {
+    uint8_t p, i;
+  } kChain[27] = {{32, i_x32},    {6, i_101111}, {5, i_111},    {4, i_11},
+                  {5, i_1111},    {5, i_10101},  {4, i_101},    {3, i_101},
+                  {3, i_101},     {5, i_111},    {9, i_101111}, {6, i_1111},
+                  {2, i_1},       {5, i_1},      {6, i_1111},   {5, i_111},
+                  {4, i_111},     {5, i_111},    {5, i_101},    {3, i_11},
+                  {10, i_101111}, {2, i_11},     {5, i_11},     {5, i_11},
+                  {3, i_1},       {7, i_10101},  {6, i_1111}};
+  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kChain); i++) {
+    ecp_nistz256_ord_sqr_mont(out->words, out->words, kChain[i].p);
+    ecp_nistz256_ord_mul_mont(out->words, out->words, table[kChain[i].i]);
+  }
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
@@ -441,7 +522,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
   out->field_decode = ec_GFp_mont_field_decode;
-  out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
 };
 
 #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \