boringssl/crypto/fipsmodule/bcm.c

149 lines
3.9 KiB
C
Raw Normal View History

/* Copyright (c) 2017, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#if !defined(_GNU_SOURCE)
#define _GNU_SOURCE // needed for syscall() on Linux.
#endif
#include <openssl/crypto.h>
#include <stdlib.h>
#include <openssl/digest.h>
#include <openssl/hmac.h>
#include <openssl/sha.h>
#include "../internal.h"
#include "aes/aes.c"
#include "aes/key_wrap.c"
#include "aes/mode_wrappers.c"
#include "bn/add.c"
#include "bn/asm/x86_64-gcc.c"
#include "bn/bn.c"
#include "bn/bytes.c"
#include "bn/cmp.c"
#include "bn/ctx.c"
#include "bn/div.c"
#include "bn/exponentiation.c"
#include "bn/gcd.c"
#include "bn/generic.c"
#include "bn/jacobi.c"
#include "bn/montgomery.c"
#include "bn/montgomery_inv.c"
#include "bn/mul.c"
#include "bn/prime.c"
#include "bn/random.c"
#include "bn/rsaz_exp.c"
#include "bn/shift.c"
#include "bn/sqrt.c"
#include "cipher/aead.c"
#include "cipher/cipher.c"
#include "cipher/e_aes.c"
#include "cipher/e_des.c"
#include "des/des.c"
#include "digest/digest.c"
#include "digest/digests.c"
#include "ecdsa/ecdsa.c"
#include "ec/ec.c"
#include "ec/ec_key.c"
#include "ec/ec_montgomery.c"
#include "ec/oct.c"
#include "ec/p224-64.c"
ec/p256.c: fiat-crypto field arithmetic (64, 32) The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?) 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from varibale-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
#include "../../third_party/fiat/p256.c"
#include "ec/p256-x86_64.c"
#include "ec/simple.c"
ec/p256.c: fiat-crypto field arithmetic (64, 32) The fiat-crypto-generated code uses the Montgomery form implementation strategy, for both 32-bit and 64-bit code. 64-bit throughput seems slower, but the difference is smaller than noise between repetitions (-2%?) 32-bit throughput has decreased significantly for ECDH (-40%). I am attributing this to the change from varibale-time scalar multiplication to constant-time scalar multiplication. Due to the same bottleneck, ECDSA verification still uses the old code (otherwise there would have been a 60% throughput decrease). On the other hand, ECDSA signing throughput has increased slightly (+10%), perhaps due to the use of a precomputed table of multiples of the base point. 64-bit benchmarks (Google Cloud Haswell): with this change: Did 9126 ECDH P-256 operations in 1009572us (9039.5 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039832us (22119.0 ops/sec) Did 8820 ECDSA P-256 verify operations in 1024242us (8611.2 ops/sec) master (40e8c921cab5cce2bc10722ecf4ebe0e380cf6c8): Did 9340 ECDH P-256 operations in 1017975us (9175.1 ops/sec) Did 23000 ECDSA P-256 signing operations in 1039820us (22119.2 ops/sec) Did 8688 ECDSA P-256 verify operations in 1021108us (8508.4 ops/sec) benchmarks on ARMv7 (LG Nexus 4): with this change: Did 150 ECDH P-256 operations in 1029726us (145.7 ops/sec) Did 506 ECDSA P-256 signing operations in 1065192us (475.0 ops/sec) Did 363 ECDSA P-256 verify operations in 1033298us (351.3 ops/sec) master (2fce1beda0f7e74e2d687860f807cf0b8d8056a4): Did 245 ECDH P-256 operations in 1017518us (240.8 ops/sec) Did 473 ECDSA P-256 signing operations in 1086281us (435.4 ops/sec) Did 360 ECDSA P-256 verify operations in 1003846us (358.6 ops/sec) 64-bit tables converted as follows: import re, sys, math p = 2**256 - 2**224 + 2**192 + 2**96 - 1 R = 2**256 def convert(t): x0, s1, x1, s2, x2, s3, x3 = t.groups() v = int(x0, 0) + 2**64 * (int(x1, 0) + 2**64*(int(x2,0) + 2**64*(int(x3, 0)) )) w = v*R%p y0 = hex(w%(2**64)) y1 = hex((w>>64)%(2**64)) y2 = hex((w>>(2*64))%(2**64)) y3 = hex((w>>(3*64))%(2**64)) ww = int(y0, 0) + 2**64 * (int(y1, 0) + 2**64*(int(y2,0) + 2**64*(int(y3, 0)) )) if ww != v*R%p: print(x0,x1,x2,x3) print(hex(v)) print(y0,y1,y2,y3) print(hex(w)) print(hex(ww)) assert 0 return '{'+y0+s1+y1+s2+y2+s3+y3+'}' fe_re = re.compile('{'+r'(\s*,\s*)'.join(r'(\d+|0x[abcdefABCDEF0123456789]+)' for i in range(4)) + '}') print (re.sub(fe_re, convert, sys.stdin.read()).rstrip('\n')) 32-bit tables converted from 64-bit tables Change-Id: I52d6e5504fcb6ca2e8b0ee13727f4500c80c1799 Reviewed-on: https://boringssl-review.googlesource.com/23244 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2017-11-08 20:32:38 +00:00
#include "ec/util.c"
#include "ec/wnaf.c"
#include "hmac/hmac.c"
#include "md4/md4.c"
#include "md5/md5.c"
#include "modes/cbc.c"
#include "modes/ccm.c"
#include "modes/cfb.c"
#include "modes/ctr.c"
#include "modes/gcm.c"
#include "modes/ofb.c"
#include "modes/polyval.c"
#include "rand/ctrdrbg.c"
#include "rand/rand.c"
#include "rand/urandom.c"
#include "rsa/blinding.c"
#include "rsa/padding.c"
#include "rsa/rsa.c"
#include "rsa/rsa_impl.c"
#include "self_check/self_check.c"
#include "sha/sha1-altivec.c"
#include "sha/sha1.c"
#include "sha/sha256.c"
#include "sha/sha512.c"
#include "tls/kdf.c"
#if defined(BORINGSSL_FIPS)
#if !defined(OPENSSL_ASAN)
// These symbols are filled in by delocate.go. They point to the start and end
// of the module, and the location of the integrity hash, respectively.
extern const uint8_t BORINGSSL_bcm_text_start[];
extern const uint8_t BORINGSSL_bcm_text_end[];
extern const uint8_t BORINGSSL_bcm_text_hash[];
#endif
static void __attribute__((constructor))
BORINGSSL_bcm_power_on_self_test(void) {
CRYPTO_library_init();
#if !defined(OPENSSL_ASAN)
// Integrity tests cannot run under ASAN because it involves reading the full
// .text section, which triggers the global-buffer overflow detection.
const uint8_t *const start = BORINGSSL_bcm_text_start;
const uint8_t *const end = BORINGSSL_bcm_text_end;
static const uint8_t kHMACKey[64] = {0};
uint8_t result[SHA512_DIGEST_LENGTH];
unsigned result_len;
if (!HMAC(EVP_sha512(), kHMACKey, sizeof(kHMACKey), start, end - start,
result, &result_len) ||
result_len != sizeof(result)) {
goto err;
}
const uint8_t *expected = BORINGSSL_bcm_text_hash;
if (!check_test(expected, result, sizeof(result), "FIPS integrity test")) {
goto err;
}
#endif
if (!BORINGSSL_self_test()) {
goto err;
}
return;
err:
BORINGSSL_FIPS_abort();
}
void BORINGSSL_FIPS_abort(void) {
for (;;) {
abort();
exit(1);
}
}
#endif // BORINGSSL_FIPS