ec/p256.c: fiat-crypto field arithmetic (64, 32)

The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?). 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from variable-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921ca): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 
0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
7 years ago · 46304abf7d
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@@ -67,10 +67,10 @@
 #include "ec/ec_montgomery.c"
 #include "ec/oct.c"
 #include "ec/p224-64.c"
 #include "ec/p256-64.c"
 #include "../../third_party/fiat/p256.c"
 #include "ec/p256-x86_64.c"
 #include "ec/simple.c"
 #include "ec/util-64.c"
 #include "ec/util.c"
 #include "ec/wnaf.c"
 #include "hmac/hmac.c"
 #include "md4/md4.c"
--- a/crypto/fipsmodule/ec/ec.c
+++ b/crypto/fipsmodule/ec/ec.c
@@ -215,13 +215,6 @@ static const uint8_t kP521Params[6 * 66] = {
    0xB7, 0x1E, 0x91, 0x38, 0x64, 0x09,
 };

 // MSan appears to have a bug that causes code to be miscompiled in opt mode.
 // While that is being looked at, don't run the uint128_t code under MSan.
 #if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS) && \
    !defined(MEMORY_SANITIZER)
 #define BORINGSSL_USE_INT128_CODE
 #endif

 DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  // 1.3.132.0.35
  static const uint8_t kOIDP521[] = {0x2b, 0x81, 0x04, 0x00, 0x23};
@@ -253,15 +246,18 @@ DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  out->curves[2].param_len = 32;
  out->curves[2].params = kP256Params;
  out->curves[2].method =
 #if defined(BORINGSSL_USE_INT128_CODE)
 // MSan appears to have a bug that causes code to be miscompiled in opt mode.
 // While that is being looked at, don't run the uint128_t code under MSan.
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
    !defined(OPENSSL_SMALL)
    !defined(OPENSSL_SMALL) && !defined(MEMORY_SANITIZER)
      EC_GFp_nistz256_method();
 #else
 #if defined(OPENSSL_32_BIT) || \
    (defined(OPENSSL_64_BIT) && !defined(MEMORY_SANITIZER))
      EC_GFp_nistp256_method();
 #endif
 #else
      EC_GFp_mont_method();
 #endif
 #endif

  // 1.3.132.0.33
@@ -273,7 +269,8 @@ DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) {
  out->curves[3].param_len = 28;
  out->curves[3].params = kP224Params;
  out->curves[3].method =
 #if defined(BORINGSSL_USE_INT128_CODE) && !defined(OPENSSL_SMALL)
 #if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS) && \
    !defined(MEMORY_SANITIZER) && !defined(OPENSSL_SMALL)
      EC_GFp_nistp224_method();
 #else
      EC_GFp_mont_method();
@@ -883,6 +880,24 @@ err:
  return ret;
 }

 int ec_point_mul_scalar_public(const EC_GROUP *group, EC_POINT *r,
                               const EC_SCALAR *g_scalar, const EC_POINT *p,
                               const EC_SCALAR *p_scalar, BN_CTX *ctx) {
  // Argument shape: at least one scalar must be supplied, and |p| and
  // |p_scalar| must be either both present or both absent.
  const int have_g_scalar = g_scalar != NULL;
  const int have_p_scalar = p_scalar != NULL;
  const int have_p = p != NULL;
  if ((!have_g_scalar && !have_p_scalar) || have_p != have_p_scalar) {
    OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER);
    return 0;
  }

  // |r| and, when given, |p| must compare equal to |group|. The check on |p|
  // is only performed when the check on |r| has already succeeded.
  int incompatible = EC_GROUP_cmp(group, r->group, NULL) != 0;
  if (!incompatible && have_p) {
    incompatible = EC_GROUP_cmp(group, p->group, NULL) != 0;
  }
  if (incompatible) {
    OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
    return 0;
  }

  // Dispatch to the method-specific multiplication for public inputs.
  return group->meth->mul_public(group, r, g_scalar, p, p_scalar, ctx);
 }

 int ec_point_mul_scalar(const EC_GROUP *group, EC_POINT *r,
                        const EC_SCALAR *g_scalar, const EC_POINT *p,
                        const EC_SCALAR *p_scalar, BN_CTX *ctx) {
--- a/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/crypto/fipsmodule/ec/ec_montgomery.c
@@ -270,6 +270,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_mont_method) {
  out->group_set_curve = ec_GFp_mont_group_set_curve;
  out->point_get_affine_coordinates = ec_GFp_mont_point_get_affine_coordinates;
  out->mul = ec_wNAF_mul /* XXX: Not constant time. */;
  out->mul_public = ec_wNAF_mul;
  out->field_mul = ec_GFp_mont_field_mul;
  out->field_sqr = ec_GFp_mont_field_sqr;
  out->field_encode = ec_GFp_mont_field_encode;
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -115,6 +115,12 @@ struct ec_method_st {
  // non-null.
  int (*mul)(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
             const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
  // mul_public performs the same computation as mul. It further assumes that
  // the inputs are public so there is no concern about leaking their values
  // through timing.
  int (*mul_public)(const EC_GROUP *group, EC_POINT *r,
                    const EC_SCALAR *g_scalar, const EC_POINT *p,
                    const EC_SCALAR *p_scalar, BN_CTX *ctx);

  // 'field_mul' and 'field_sqr' can be used by 'add' and 'dbl' so that the
  // same implementations of point operations can be used with different
@@ -195,6 +201,13 @@ int ec_point_mul_scalar(const EC_GROUP *group, EC_POINT *r,
                        const EC_SCALAR *g_scalar, const EC_POINT *p,
                        const EC_SCALAR *p_scalar, BN_CTX *ctx);

 // ec_point_mul_scalar_public performs the same computation as
 // ec_point_mul_scalar.  It further assumes that the inputs are public so
 // there is no concern about leaking their values through timing.
 //
 // At least one of |g_scalar| and |p_scalar| must be non-NULL, and |p| and
 // |p_scalar| must be either both NULL or both non-NULL. |r| and, if given,
 // |p| must belong to a group that compares equal to |group|. If any of these
 // checks fail, it pushes an error and returns zero; otherwise it returns the
 // result of the group method's |mul_public| hook.
 int ec_point_mul_scalar_public(const EC_GROUP *group, EC_POINT *r,
                               const EC_SCALAR *g_scalar, const EC_POINT *p,
                               const EC_SCALAR *p_scalar, BN_CTX *ctx);

 int ec_wNAF_mul(const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
                const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);

--- a/crypto/fipsmodule/ec/p224-64.c
+++ b/crypto/fipsmodule/ec/p224-64.c
@@ -1122,6 +1122,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp224_method) {
  out->point_get_affine_coordinates =
      ec_GFp_nistp224_point_get_affine_coordinates;
  out->mul = ec_GFp_nistp224_points_mul;
  out->mul_public = ec_GFp_nistp224_points_mul;
  out->field_mul = ec_GFp_simple_field_mul;
  out->field_sqr = ec_GFp_simple_field_sqr;
  out->field_encode = NULL;
--- a/crypto/fipsmodule/ec/p256-64.c
+++ b/crypto/fipsmodule/ec/p256-64.c
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-x86_64.c
@@ -446,6 +446,7 @@ DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
  out->group_set_curve = ec_GFp_mont_group_set_curve;
  out->point_get_affine_coordinates = ecp_nistz256_get_affine;
  out->mul = ecp_nistz256_points_mul;
  out->mul_public = ecp_nistz256_points_mul;
  out->field_mul = ec_GFp_mont_field_mul;
  out->field_sqr = ec_GFp_mont_field_sqr;
  out->field_encode = ec_GFp_mont_field_encode;
--- a/crypto/fipsmodule/ec/util-64.c
+++ b/crypto/fipsmodule/ec/util-64.c
@@ -14,9 +14,6 @@

 #include <openssl/base.h>


 #if defined(OPENSSL_64_BIT) && !defined(OPENSSL_WINDOWS)

 #include <openssl/ec.h>

 #include "internal.h"
@@ -105,5 +102,3 @@ void ec_GFp_nistp_recode_scalar_bits(uint8_t *sign, uint8_t *digit,
  *sign = s & 1;
  *digit = d;
 }

 #endif  // 64_BIT && !WINDOWS
--- a/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -275,7 +275,7 @@ int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
    OPENSSL_PUT_ERROR(ECDSA, ERR_R_MALLOC_FAILURE);
    goto err;
  }
  if (!ec_point_mul_scalar(group, point, &u1, pub_key, &u2, ctx)) {
  if (!ec_point_mul_scalar_public(group, point, &u1, pub_key, &u2, ctx)) {
    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
    goto err;
  }
--- a/third_party/fiat/p256.c
+++ b/third_party/fiat/p256.c