boringssl/crypto/fipsmodule/ec/p256-x86_64.h
Nir Drucker 3d450d2844 Speed up ECDSA verify on x86-64.
This commit improves the performance of ECDSA signature verification
(over NIST P-256 curve) for x86 platforms. The speedup is by a factor of 1.15x.
It does so by:
  1) Leveraging the fact that the verification does not need
     to run in constant time. To this end, we implemented:
    a) the function ecp_nistz256_points_mul_public in a similar way to
       the current ecp_nistz256_points_mul function by removing its constant
       time features.
    b) the Binary Extended Euclidean Algorithm (BEEU) in x86 assembly to
       replace the current modular inverse function used for the inversion.
  2) The last step in the ECDSA_verify function compares the (x) affine
     coordinate with the signature (r) value. Converting x from the Jacobian's
     representation to the affine coordinate requires to perform one inversions
     (x_affine = x * z^(-2)). We save this inversion and speed up the computations
     by instead bringing r to x (r_jacobian = r*z^2) which is faster.

The measured results are:
Before (on a Kaby Lake desktop with gcc-5):
Did 26000 ECDSA P-224 signing operations in 1002372us (25938.5 ops/sec)
Did 11000 ECDSA P-224 verify operations in 1043821us (10538.2 ops/sec)
Did 55000 ECDSA P-256 signing operations in 1017560us (54050.9 ops/sec)
Did 17000 ECDSA P-256 verify operations in 1051280us (16170.8 ops/sec)

After (on a Kaby Lake desktop with gcc-5):
Did 27000 ECDSA P-224 signing operations in 1011287us (26698.7 ops/sec)
Did 11640 ECDSA P-224 verify operations in 1076698us (10810.8 ops/sec)
Did 55000 ECDSA P-256 signing operations in 1016880us (54087.0 ops/sec)
Did 20000 ECDSA P-256 verify operations in 1038736us (19254.2 ops/sec)

Before (on a Skylake server platform with gcc-5):
Did 25000 ECDSA P-224 signing operations in 1021651us (24470.2 ops/sec)
Did 10373 ECDSA P-224 verify operations in 1046563us (9911.5 ops/sec)
Did 50000 ECDSA P-256 signing operations in 1002774us (49861.7 ops/sec)
Did 15000 ECDSA P-256 verify operations in 1006471us (14903.6 ops/sec)

After (on a Skylake server platform with gcc-5):
Did 25000 ECDSA P-224 signing operations in 1020958us (24486.8 ops/sec)
Did 10373 ECDSA P-224 verify operations in 1046359us (9913.4 ops/sec)
Did 50000 ECDSA P-256 signing operations in 1003996us (49801.0 ops/sec)
Did 18000 ECDSA P-256 verify operations in 1021604us (17619.4 ops/sec)

Developers and authors:
***************************************************************************
Nir Drucker (1,2), Shay Gueron (1,2)
(1) Amazon Web Services Inc.
(2) University of Haifa, Israel
***************************************************************************

Change-Id: Idd42a7bc40626bce974ea000b61fdb5bad33851c
Reviewed-on: https://boringssl-review.googlesource.com/c/31304
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
2018-11-05 23:48:07 +00:00

152 lines
5.6 KiB
C

/*
* Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
*
* Reference:
* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
* 256 Bit Primes"
*/
#ifndef OPENSSL_HEADER_EC_P256_X86_64_H
#define OPENSSL_HEADER_EC_P256_X86_64_H
#include <openssl/base.h>
#include <openssl/bn.h>
#if defined(__cplusplus)
extern "C" {
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
!defined(OPENSSL_SMALL)
// P-256 field operations.
//
// An element mod P in P-256 is represented as a little-endian array of
// |P256_LIMBS| |BN_ULONG|s, spanning the full range of values.
//
// The following functions take fully-reduced inputs mod P and give
// fully-reduced outputs. They may be used in-place.
#define P256_LIMBS (256 / BN_BITS2)
// ecp_nistz256_neg sets |res| to -|a| mod P.
void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
// ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P.
void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
// ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P.
void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]);
// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain
// by multiplying with 1.
static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG in[P256_LIMBS]) {
static const BN_ULONG ONE[P256_LIMBS] = { 1 };
ecp_nistz256_mul_mont(res, in, ONE);
}
// ecp_nistz256_to_mont sets |res| to |in|, converted to Montgomery domain
// by multiplying with RR = 2^512 mod P precomputed for NIST P256 curve.
static inline void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG in[P256_LIMBS]) {
static const BN_ULONG RR[P256_LIMBS] = {
TOBN(0x00000000, 0x00000003), TOBN(0xfffffffb, 0xffffffff),
TOBN(0xffffffff, 0xfffffffe), TOBN(0x00000004, 0xfffffffd)};
ecp_nistz256_mul_mont(res, in, RR);
}
// P-256 scalar operations.
//
// The following functions compute modulo N, where N is the order of P-256. They
// take fully-reduced inputs and give fully-reduced outputs.
// ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs
// are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N.
void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
// ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and
// outputs are in Montgomery form. That is, |res| is
// (|a| * 2^-256)^(2*|rep|) * 2^256 mod N.
void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS], int rep);
// beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm.
// Assumption: 0 < a < p < 2^(256) and p is odd.
int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG p[P256_LIMBS]);
// P-256 point operations.
//
// The following functions may be used in-place. All coordinates are in the
// Montgomery domain.
// A P256_POINT represents a P-256 point in Jacobian coordinates.
typedef struct {
BN_ULONG X[P256_LIMBS];
BN_ULONG Y[P256_LIMBS];
BN_ULONG Z[P256_LIMBS];
} P256_POINT;
// A P256_POINT_AFFINE represents a P-256 point in affine coordinates. Infinity
// is encoded as (0, 0).
typedef struct {
BN_ULONG X[P256_LIMBS];
BN_ULONG Y[P256_LIMBS];
} P256_POINT_AFFINE;
// ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16
// and all zeros (the point at infinity) if |index| is 0. This is done in
// constant time.
void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16],
int index);
// ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64
// and all zeros (the point at infinity) if |index| is 0. This is done in
// constant time.
void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
const P256_POINT_AFFINE in_t[64], int index);
// ecp_nistz256_point_double sets |r| to |a| doubled.
void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
// ecp_nistz256_point_add adds |a| to |b| and places the result in |r|.
void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
const P256_POINT *b);
// ecp_nistz256_point_add_affine adds |a| to |b| and places the result in
// |r|. |a| and |b| must not represent the same point unless they are both
// infinity.
void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b);
#endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
!defined(OPENSSL_SMALL) */
#if defined(__cplusplus)
} // extern C++
#endif
#endif // OPENSSL_HEADER_EC_P256_X86_64_H