boringssl/crypto/fipsmodule/cipher/e_aes.c

1226 lines
36 KiB
C
Raw Normal View History

/* ====================================================================
* Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ==================================================================== */
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
#include <assert.h>
#include <string.h>
#include <openssl/aead.h>
#include <openssl/aes.h>
#include <openssl/cipher.h>
#include <openssl/cpu.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/nid.h>
#include <openssl/rand.h>
#include "internal.h"
#include "../../internal.h"
#include "../aes/internal.h"
#include "../modes/internal.h"
#include "../delocate.h"
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include <openssl/arm_arch.h>
#endif
OPENSSL_MSVC_PRAGMA(warning(push))
OPENSSL_MSVC_PRAGMA(warning(disable: 4702)) // Unreachable code.
Fix build when using Visual Studio 2015 Update 1. Many of the compatibility issues are described at https://msdn.microsoft.com/en-us/library/mt612856.aspx. The macros that suppressed warnings on a per-function basis no longer work in Update 1, so replace them with #pragmas. Update 1 warns when |size_t| arguments to |printf| are casted, so stop doing that casting. Unfortunately, this requires an ugly hack to continue working in MSVC 2013 as MSVC 2013 doesn't support "%zu". Finally, Update 1 has new warnings, some of which need to be suppressed. --- Updated by davidben to give up on suppressing warnings in crypto/x509 and crypto/x509v3 as those directories aren't changed much from upstream. In each of these cases, upstream opted just blindly initialize the variable, so do the same. Also switch C4265 to level 4, per Microsoft's recommendation and work around a bug in limits.h that happens to get fixed by Google include order style. (limits.h is sensitive to whether corecrt.h, pulled in by stddef.h and some other headers, is included before it. The reason it affected just one file is we often put the file's header first, which means base.h is pulling in stddef.h. Relying on this is ugly, but it's no worse than what everything else is doing and this doesn't seem worth making something as tame as limits.h so messy to use.) Change-Id: I02d1f935356899f424d3525d03eca401bfa3e6cd Reviewed-on: https://boringssl-review.googlesource.com/7480 Reviewed-by: David Benjamin <davidben@google.com>
2016-01-18 08:21:42 +00:00
typedef struct {
union {
double align;
AES_KEY ks;
} ks;
block128_f block;
union {
cbc128_f cbc;
ctr128_f ctr;
} stream;
} EVP_AES_KEY;
typedef struct {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
GCM128_CONTEXT gcm;
union {
double align;
AES_KEY ks;
} ks; // AES key schedule to use
int key_set; // Set if key initialised
int iv_set; // Set if an iv is set
uint8_t *iv; // Temporary IV store
int ivlen; // IV length
int taglen;
int iv_gen; // It is OK to generate IVs
ctr128_f ctr;
} EVP_AES_GCM_CTX;
static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
Fix build when using Visual Studio 2015 Update 1. Many of the compatibility issues are described at https://msdn.microsoft.com/en-us/library/mt612856.aspx. The macros that suppressed warnings on a per-function basis no longer work in Update 1, so replace them with #pragmas. Update 1 warns when |size_t| arguments to |printf| are casted, so stop doing that casting. Unfortunately, this requires an ugly hack to continue working in MSVC 2013 as MSVC 2013 doesn't support "%zu". Finally, Update 1 has new warnings, some of which need to be suppressed. --- Updated by davidben to give up on suppressing warnings in crypto/x509 and crypto/x509v3 as those directories aren't changed much from upstream. In each of these cases, upstream opted just blindly initialize the variable, so do the same. Also switch C4265 to level 4, per Microsoft's recommendation and work around a bug in limits.h that happens to get fixed by Google include order style. (limits.h is sensitive to whether corecrt.h, pulled in by stddef.h and some other headers, is included before it. The reason it affected just one file is we often put the file's header first, which means base.h is pulling in stddef.h. Relying on this is ugly, but it's no worse than what everything else is doing and this doesn't seem worth making something as tame as limits.h so messy to use.) Change-Id: I02d1f935356899f424d3525d03eca401bfa3e6cd Reviewed-on: https://boringssl-review.googlesource.com/7480 Reviewed-by: David Benjamin <davidben@google.com>
2016-01-18 08:21:42 +00:00
const uint8_t *iv, int enc) {
int ret, mode;
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK;
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) {
if (hwaes_capable()) {
Add PPC64LE assembly for AES-GCM. This change adds AES and GHASH assembly from upstream, with the aim of speeding up AES-GCM. The PPC64LE assembly matches the interface of the ARMv8 assembly so I've changed the prefix of both sets of asm functions to be the same ("aes_hw_"). Otherwise, the new assmebly files and Perlasm match exactly those from upstream's c536b6be1a (from their master branch). Before: Did 1879000 AES-128-GCM (16 bytes) seal operations in 1000428us (1878196.1 ops/sec): 30.1 MB/s Did 61000 AES-128-GCM (1350 bytes) seal operations in 1006660us (60596.4 ops/sec): 81.8 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1072649us (10255.0 ops/sec): 84.0 MB/s Did 1665000 AES-256-GCM (16 bytes) seal operations in 1000591us (1664016.6 ops/sec): 26.6 MB/s Did 52000 AES-256-GCM (1350 bytes) seal operations in 1006971us (51640.0 ops/sec): 69.7 MB/s Did 8840 AES-256-GCM (8192 bytes) seal operations in 1013294us (8724.0 ops/sec): 71.5 MB/s After: Did 4994000 AES-128-GCM (16 bytes) seal operations in 1000017us (4993915.1 ops/sec): 79.9 MB/s Did 1389000 AES-128-GCM (1350 bytes) seal operations in 1000073us (1388898.6 ops/sec): 1875.0 MB/s Did 319000 AES-128-GCM (8192 bytes) seal operations in 1000101us (318967.8 ops/sec): 2613.0 MB/s Did 4668000 AES-256-GCM (16 bytes) seal operations in 1000149us (4667304.6 ops/sec): 74.7 MB/s Did 1202000 AES-256-GCM (1350 bytes) seal operations in 1000646us (1201224.0 ops/sec): 1621.7 MB/s Did 269000 AES-256-GCM (8192 bytes) seal operations in 1002804us (268247.8 ops/sec): 2197.5 MB/s Change-Id: Id848562bd4e1aa79a4683012501dfa5e6c08cfcc Reviewed-on: https://boringssl-review.googlesource.com/11262 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2016-09-23 20:47:24 +01:00
ret = aes_hw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_hw_decrypt;
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = aes_hw_cbc_encrypt;
}
} else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = AES_decrypt;
dat->stream.cbc = bsaes_cbc_encrypt;
} else if (vpaes_capable()) {
ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = vpaes_decrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? vpaes_cbc_encrypt : NULL;
} else {
ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = AES_decrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? AES_cbc_encrypt : NULL;
}
} else if (hwaes_capable()) {
Add PPC64LE assembly for AES-GCM. This change adds AES and GHASH assembly from upstream, with the aim of speeding up AES-GCM. The PPC64LE assembly matches the interface of the ARMv8 assembly so I've changed the prefix of both sets of asm functions to be the same ("aes_hw_"). Otherwise, the new assmebly files and Perlasm match exactly those from upstream's c536b6be1a (from their master branch). Before: Did 1879000 AES-128-GCM (16 bytes) seal operations in 1000428us (1878196.1 ops/sec): 30.1 MB/s Did 61000 AES-128-GCM (1350 bytes) seal operations in 1006660us (60596.4 ops/sec): 81.8 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1072649us (10255.0 ops/sec): 84.0 MB/s Did 1665000 AES-256-GCM (16 bytes) seal operations in 1000591us (1664016.6 ops/sec): 26.6 MB/s Did 52000 AES-256-GCM (1350 bytes) seal operations in 1006971us (51640.0 ops/sec): 69.7 MB/s Did 8840 AES-256-GCM (8192 bytes) seal operations in 1013294us (8724.0 ops/sec): 71.5 MB/s After: Did 4994000 AES-128-GCM (16 bytes) seal operations in 1000017us (4993915.1 ops/sec): 79.9 MB/s Did 1389000 AES-128-GCM (1350 bytes) seal operations in 1000073us (1388898.6 ops/sec): 1875.0 MB/s Did 319000 AES-128-GCM (8192 bytes) seal operations in 1000101us (318967.8 ops/sec): 2613.0 MB/s Did 4668000 AES-256-GCM (16 bytes) seal operations in 1000149us (4667304.6 ops/sec): 74.7 MB/s Did 1202000 AES-256-GCM (1350 bytes) seal operations in 1000646us (1201224.0 ops/sec): 1621.7 MB/s Did 269000 AES-256-GCM (8192 bytes) seal operations in 1002804us (268247.8 ops/sec): 2197.5 MB/s Change-Id: Id848562bd4e1aa79a4683012501dfa5e6c08cfcc Reviewed-on: https://boringssl-review.googlesource.com/11262 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2016-09-23 20:47:24 +01:00
ret = aes_hw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_hw_encrypt;
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = aes_hw_cbc_encrypt;
} else if (mode == EVP_CIPH_CTR_MODE) {
dat->stream.ctr = aes_hw_ctr32_encrypt_blocks;
}
} else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) {
ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = AES_encrypt;
dat->stream.ctr = bsaes_ctr32_encrypt_blocks;
} else if (vpaes_capable()) {
ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = vpaes_encrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? vpaes_cbc_encrypt : NULL;
} else {
ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = AES_encrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? AES_cbc_encrypt : NULL;
}
if (ret < 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_AES_KEY_SETUP_FAILED);
return 0;
}
return 1;
}
static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
if (dat->stream.cbc) {
(*dat->stream.cbc)(in, out, len, &dat->ks.ks, ctx->iv, ctx->encrypt);
} else if (ctx->encrypt) {
CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block);
} else {
CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block);
}
return 1;
}
static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
size_t bl = ctx->cipher->block_size;
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
if (len < bl) {
return 1;
}
len -= bl;
for (size_t i = 0; i <= len; i += bl) {
(*dat->block)(in + i, out + i, &dat->ks.ks);
}
return 1;
}
static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
if (dat->stream.ctr) {
CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf,
&ctx->num, dat->stream.ctr);
} else {
CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf,
&ctx->num, dat->block);
}
return 1;
}
static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, &ctx->num,
dat->block);
return 1;
}
ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
block128_f *out_block, const uint8_t *key,
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
size_t key_bytes, int large_inputs) {
if (hwaes_capable()) {
aes_hw_set_encrypt_key(key, key_bytes * 8, aes_key);
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_hw_encrypt, 1);
}
if (out_block) {
*out_block = aes_hw_encrypt;
}
return aes_hw_ctr32_encrypt_blocks;
}
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
const int bsaes_ok = bsaes_capable();
const int vpaes_ok = vpaes_capable();
if (bsaes_ok && (large_inputs || !vpaes_ok)) {
AES_set_encrypt_key(key, key_bytes * 8, aes_key);
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, AES_encrypt, 0);
}
if (out_block) {
*out_block = AES_encrypt;
}
return bsaes_ctr32_encrypt_blocks;
}
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
if (vpaes_ok) {
vpaes_set_encrypt_key(key, key_bytes * 8, aes_key);
if (out_block) {
*out_block = vpaes_encrypt;
}
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
}
return NULL;
}
AES_set_encrypt_key(key, key_bytes * 8, aes_key);
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, AES_encrypt, 0);
}
if (out_block) {
*out_block = AES_encrypt;
}
return NULL;
}
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
#if defined(OPENSSL_32_BIT)
#define EVP_AES_GCM_CTX_PADDING (4+8)
#else
#define EVP_AES_GCM_CTX_PADDING 8
#endif
static EVP_AES_GCM_CTX *aes_gcm_from_cipher_ctx(EVP_CIPHER_CTX *ctx) {
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(
alignof(EVP_AES_GCM_CTX) <= 16,
"EVP_AES_GCM_CTX needs more alignment than this function provides");
#endif
// |malloc| guarantees up to 4-byte alignment on 32-bit and 8-byte alignment
// on 64-bit systems, so we need to adjust to reach 16-byte alignment.
assert(ctx->cipher->ctx_size ==
sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING);
char *ptr = ctx->cipher_data;
#if defined(OPENSSL_32_BIT)
assert((uintptr_t)ptr % 4 == 0);
ptr += (uintptr_t)ptr & 4;
#endif
assert((uintptr_t)ptr % 8 == 0);
ptr += (uintptr_t)ptr & 8;
return (EVP_AES_GCM_CTX *)ptr;
}
static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
const uint8_t *iv, int enc) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
if (!iv && !key) {
return 1;
}
if (key) {
OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm));
gctx->ctr = aes_ctr_set_key(&gctx->ks.ks, &gctx->gcm.gcm_key, NULL, key,
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
ctx->key_len, 1 /* large inputs */);
// If we have an iv can set it directly, otherwise use saved IV.
if (iv == NULL && gctx->iv_set) {
iv = gctx->iv;
}
if (iv) {
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, iv, gctx->ivlen);
gctx->iv_set = 1;
}
gctx->key_set = 1;
} else {
// If key set use IV, otherwise copy
if (gctx->key_set) {
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, iv, gctx->ivlen);
} else {
OPENSSL_memcpy(gctx->iv, iv, gctx->ivlen);
}
gctx->iv_set = 1;
gctx->iv_gen = 0;
}
return 1;
}
static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
if (gctx->iv != c->iv) {
OPENSSL_free(gctx->iv);
}
}
// increment counter (64-bit int) by 1
static void ctr64_inc(uint8_t *counter) {
int n = 8;
uint8_t c;
do {
--n;
c = counter[n];
++c;
counter[n] = c;
if (c) {
return;
}
} while (n);
}
static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
switch (type) {
case EVP_CTRL_INIT:
gctx->key_set = 0;
gctx->iv_set = 0;
gctx->ivlen = c->cipher->iv_len;
gctx->iv = c->iv;
gctx->taglen = -1;
gctx->iv_gen = 0;
return 1;
case EVP_CTRL_AEAD_SET_IVLEN:
if (arg <= 0) {
return 0;
}
// Allocate memory for IV if needed
if (arg > EVP_MAX_IV_LENGTH && arg > gctx->ivlen) {
if (gctx->iv != c->iv) {
OPENSSL_free(gctx->iv);
}
gctx->iv = OPENSSL_malloc(arg);
if (!gctx->iv) {
return 0;
}
}
gctx->ivlen = arg;
return 1;
case EVP_CTRL_AEAD_SET_TAG:
if (arg <= 0 || arg > 16 || c->encrypt) {
return 0;
}
OPENSSL_memcpy(c->buf, ptr, arg);
gctx->taglen = arg;
return 1;
case EVP_CTRL_AEAD_GET_TAG:
if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0) {
return 0;
}
OPENSSL_memcpy(ptr, c->buf, arg);
return 1;
case EVP_CTRL_AEAD_SET_IV_FIXED:
// Special case: -1 length restores whole IV
if (arg == -1) {
OPENSSL_memcpy(gctx->iv, ptr, gctx->ivlen);
gctx->iv_gen = 1;
return 1;
}
// Fixed field must be at least 4 bytes and invocation field
// at least 8.
if (arg < 4 || (gctx->ivlen - arg) < 8) {
return 0;
}
if (arg) {
OPENSSL_memcpy(gctx->iv, ptr, arg);
}
if (c->encrypt && !RAND_bytes(gctx->iv + arg, gctx->ivlen - arg)) {
return 0;
}
gctx->iv_gen = 1;
return 1;
case EVP_CTRL_GCM_IV_GEN:
if (gctx->iv_gen == 0 || gctx->key_set == 0) {
return 0;
}
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, gctx->iv, gctx->ivlen);
if (arg <= 0 || arg > gctx->ivlen) {
arg = gctx->ivlen;
}
OPENSSL_memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
// Invocation field will be at least 8 bytes in size and
// so no need to check wrap around or increment more than
// last 8 bytes.
ctr64_inc(gctx->iv + gctx->ivlen - 8);
gctx->iv_set = 1;
return 1;
case EVP_CTRL_GCM_SET_IV_INV:
if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt) {
return 0;
}
OPENSSL_memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, gctx->iv, gctx->ivlen);
gctx->iv_set = 1;
return 1;
case EVP_CTRL_COPY: {
EVP_CIPHER_CTX *out = ptr;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx_out = aes_gcm_from_cipher_ctx(out);
if (gctx->iv == c->iv) {
gctx_out->iv = out->iv;
} else {
gctx_out->iv = OPENSSL_malloc(gctx->ivlen);
if (!gctx_out->iv) {
return 0;
}
OPENSSL_memcpy(gctx_out->iv, gctx->iv, gctx->ivlen);
}
return 1;
}
default:
return -1;
}
}
static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
// If not set up, return error
if (!gctx->key_set) {
return -1;
}
if (!gctx->iv_set) {
return -1;
}
if (in) {
if (out == NULL) {
if (!CRYPTO_gcm128_aad(&gctx->gcm, in, len)) {
return -1;
}
} else if (ctx->encrypt) {
if (gctx->ctr) {
if (!CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len,
gctx->ctr)) {
return -1;
}
} else {
if (!CRYPTO_gcm128_encrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) {
return -1;
}
}
} else {
if (gctx->ctr) {
if (!CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len,
gctx->ctr)) {
return -1;
}
} else {
if (!CRYPTO_gcm128_decrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) {
return -1;
}
}
}
return len;
} else {
if (!ctx->encrypt) {
if (gctx->taglen < 0 ||
!CRYPTO_gcm128_finish(&gctx->gcm, ctx->buf, gctx->taglen)) {
return -1;
}
gctx->iv_set = 0;
return 0;
}
CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
gctx->taglen = 16;
// Don't reuse the IV
gctx->iv_set = 0;
return 0;
}
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_cbc_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_cbc;
out->block_size = 16;
out->key_len = 16;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CBC_MODE;
out->init = aes_init_key;
out->cipher = aes_cbc_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ctr_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ctr;
out->block_size = 1;
out->key_len = 16;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CTR_MODE;
out->init = aes_init_key;
out->cipher = aes_ctr_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ecb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ecb;
out->block_size = 16;
out->key_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ofb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ofb128;
out->block_size = 1;
out->key_len = 16;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_OFB_MODE;
out->init = aes_init_key;
out->cipher = aes_ofb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_gcm_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_gcm;
out->block_size = 1;
out->key_len = 16;
out->iv_len = 12;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
out->init = aes_gcm_init_key;
out->cipher = aes_gcm_cipher;
out->cleanup = aes_gcm_cleanup;
out->ctrl = aes_gcm_ctrl;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_cbc_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_cbc;
out->block_size = 16;
out->key_len = 24;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CBC_MODE;
out->init = aes_init_key;
out->cipher = aes_cbc_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ctr_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ctr;
out->block_size = 1;
out->key_len = 24;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CTR_MODE;
out->init = aes_init_key;
out->cipher = aes_ctr_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ecb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ecb;
out->block_size = 16;
out->key_len = 24;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ofb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ofb128;
out->block_size = 1;
out->key_len = 24;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_OFB_MODE;
out->init = aes_init_key;
out->cipher = aes_ofb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_gcm_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_gcm;
out->block_size = 1;
out->key_len = 24;
out->iv_len = 12;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
out->init = aes_gcm_init_key;
out->cipher = aes_gcm_cipher;
out->cleanup = aes_gcm_cleanup;
out->ctrl = aes_gcm_ctrl;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_cbc_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_cbc;
out->block_size = 16;
out->key_len = 32;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CBC_MODE;
out->init = aes_init_key;
out->cipher = aes_cbc_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ctr_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ctr;
out->block_size = 1;
out->key_len = 32;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CTR_MODE;
out->init = aes_init_key;
out->cipher = aes_ctr_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ecb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ecb;
out->block_size = 16;
out->key_len = 32;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ofb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ofb128;
out->block_size = 1;
out->key_len = 32;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_OFB_MODE;
out->init = aes_init_key;
out->cipher = aes_ofb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_gcm_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_gcm;
out->block_size = 1;
out->key_len = 32;
out->iv_len = 12;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
out->init = aes_gcm_init_key;
out->cipher = aes_gcm_cipher;
out->cleanup = aes_gcm_cleanup;
out->ctrl = aes_gcm_ctrl;
}
#if defined(HWAES_ECB)
static int aes_hw_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out,
const uint8_t *in, size_t len) {
size_t bl = ctx->cipher->block_size;
if (len < bl) {
return 1;
}
aes_hw_ecb_encrypt(in, out, len, ctx->cipher_data, ctx->encrypt);
return 1;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_128_ecb) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ecb;
out->block_size = 16;
out->key_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_hw_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_192_ecb) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ecb;
out->block_size = 16;
out->key_len = 24;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_hw_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_256_ecb) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ecb;
out->block_size = 16;
out->key_len = 32;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_hw_ecb_cipher;
}
#define EVP_ECB_CIPHER_FUNCTION(keybits) \
const EVP_CIPHER *EVP_aes_##keybits##_ecb(void) { \
if (hwaes_capable()) { \
return aes_hw_##keybits##_ecb(); \
} \
return aes_##keybits##_ecb_generic(); \
}
#else
#define EVP_ECB_CIPHER_FUNCTION(keybits) \
const EVP_CIPHER *EVP_aes_##keybits##_ecb(void) { \
return aes_##keybits##_ecb_generic(); \
}
#endif // HWAES_ECB
#define EVP_CIPHER_FUNCTION(keybits, mode) \
const EVP_CIPHER *EVP_aes_##keybits##_##mode(void) { \
return aes_##keybits##_##mode##_generic(); \
}
EVP_CIPHER_FUNCTION(128, cbc)
EVP_CIPHER_FUNCTION(128, ctr)
EVP_CIPHER_FUNCTION(128, ofb)
EVP_CIPHER_FUNCTION(128, gcm)
EVP_CIPHER_FUNCTION(192, cbc)
EVP_CIPHER_FUNCTION(192, ctr)
EVP_CIPHER_FUNCTION(192, ofb)
EVP_CIPHER_FUNCTION(192, gcm)
EVP_CIPHER_FUNCTION(256, cbc)
EVP_CIPHER_FUNCTION(256, ctr)
EVP_CIPHER_FUNCTION(256, ofb)
EVP_CIPHER_FUNCTION(256, gcm)
EVP_ECB_CIPHER_FUNCTION(128)
EVP_ECB_CIPHER_FUNCTION(192)
EVP_ECB_CIPHER_FUNCTION(256)
#define EVP_AEAD_AES_GCM_TAG_LEN 16
struct aead_aes_gcm_ctx {
union {
double align;
AES_KEY ks;
} ks;
GCM128_KEY gcm_key;
ctr128_f ctr;
};
static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx,
size_t *out_tag_len, const uint8_t *key,
size_t key_len, size_t tag_len) {
const size_t key_bits = key_len * 8;
if (key_bits != 128 && key_bits != 256) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH);
return 0; // EVP_AEAD_CTX_init should catch this.
}
if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) {
tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
}
if (tag_len > EVP_AEAD_AES_GCM_TAG_LEN) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE);
return 0;
}
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
gcm_ctx->ctr = aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key,
key_len, 1 /* large inputs */);
*out_tag_len = tag_len;
return 1;
}
OPENSSL_STATIC_ASSERT(sizeof(((EVP_AEAD_CTX *)NULL)->state) >=
sizeof(struct aead_aes_gcm_ctx),
"AEAD state is too small");
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(alignof(union evp_aead_ctx_st_state) >=
alignof(struct aead_aes_gcm_ctx),
"AEAD state has insufficient alignment");
#endif
static int aead_aes_gcm_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t requested_tag_len) {
struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
size_t actual_tag_len;
if (!aead_aes_gcm_init_impl(gcm_ctx, &actual_tag_len, key, key_len,
requested_tag_len)) {
return 0;
}
ctx->tag_len = actual_tag_len;
return 1;
}
static void aead_aes_gcm_cleanup(EVP_AEAD_CTX *ctx) {}
static int aead_aes_gcm_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out,
uint8_t *out_tag, size_t *out_tag_len,
size_t max_out_tag_len,
const uint8_t *nonce, size_t nonce_len,
const uint8_t *in, size_t in_len,
const uint8_t *extra_in,
size_t extra_in_len,
const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
if (extra_in_len + ctx->tag_len < ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
return 0;
}
if (max_out_tag_len < extra_in_len + ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL);
return 0;
}
if (nonce_len == 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
return 0;
}
const AES_KEY *key = &gcm_ctx->ks.ks;
GCM128_CONTEXT gcm;
OPENSSL_memset(&gcm, 0, sizeof(gcm));
OPENSSL_memcpy(&gcm.gcm_key, &gcm_ctx->gcm_key, sizeof(gcm.gcm_key));
CRYPTO_gcm128_setiv(&gcm, key, nonce, nonce_len);
if (ad_len > 0 && !CRYPTO_gcm128_aad(&gcm, ad, ad_len)) {
return 0;
}
if (gcm_ctx->ctr) {
if (!CRYPTO_gcm128_encrypt_ctr32(&gcm, key, in, out, in_len,
gcm_ctx->ctr)) {
return 0;
}
} else {
if (!CRYPTO_gcm128_encrypt(&gcm, key, in, out, in_len)) {
return 0;
}
}
if (extra_in_len) {
if (gcm_ctx->ctr) {
if (!CRYPTO_gcm128_encrypt_ctr32(&gcm, key, extra_in, out_tag,
extra_in_len, gcm_ctx->ctr)) {
return 0;
}
} else {
if (!CRYPTO_gcm128_encrypt(&gcm, key, extra_in, out_tag, extra_in_len)) {
return 0;
}
}
}
CRYPTO_gcm128_tag(&gcm, out_tag + extra_in_len, ctx->tag_len);
*out_tag_len = ctx->tag_len + extra_in_len;
return 1;
}
static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
const uint8_t *nonce, size_t nonce_len,
const uint8_t *in, size_t in_len,
const uint8_t *in_tag, size_t in_tag_len,
const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
uint8_t tag[EVP_AEAD_AES_GCM_TAG_LEN];
if (nonce_len == 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
return 0;
}
if (in_tag_len != ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
}
const AES_KEY *key = &gcm_ctx->ks.ks;
GCM128_CONTEXT gcm;
OPENSSL_memset(&gcm, 0, sizeof(gcm));
OPENSSL_memcpy(&gcm.gcm_key, &gcm_ctx->gcm_key, sizeof(gcm.gcm_key));
CRYPTO_gcm128_setiv(&gcm, key, nonce, nonce_len);
if (!CRYPTO_gcm128_aad(&gcm, ad, ad_len)) {
return 0;
}
if (gcm_ctx->ctr) {
if (!CRYPTO_gcm128_decrypt_ctr32(&gcm, key, in, out, in_len,
gcm_ctx->ctr)) {
return 0;
}
} else {
if (!CRYPTO_gcm128_decrypt(&gcm, key, in, out, in_len)) {
return 0;
}
}
CRYPTO_gcm128_tag(&gcm, tag, ctx->tag_len);
if (CRYPTO_memcmp(tag, in_tag, ctx->tag_len) != 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
}
return 1;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 16;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 32;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
struct aead_aes_gcm_tls12_ctx {
struct aead_aes_gcm_ctx gcm_ctx;
uint64_t min_next_nonce;
};
OPENSSL_STATIC_ASSERT(sizeof(((EVP_AEAD_CTX *)NULL)->state) >=
sizeof(struct aead_aes_gcm_tls12_ctx),
"AEAD state is too small");
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(alignof(union evp_aead_ctx_st_state) >=
alignof(struct aead_aes_gcm_tls12_ctx),
"AEAD state has insufficient alignment");
#endif
static int aead_aes_gcm_tls12_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t requested_tag_len) {
struct aead_aes_gcm_tls12_ctx *gcm_ctx =
(struct aead_aes_gcm_tls12_ctx *) &ctx->state;
gcm_ctx->min_next_nonce = 0;
size_t actual_tag_len;
if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len,
requested_tag_len)) {
return 0;
}
ctx->tag_len = actual_tag_len;
return 1;
}
static int aead_aes_gcm_tls12_seal_scatter(
const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag,
size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce,
size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in,
size_t extra_in_len, const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_tls12_ctx *gcm_ctx =
(struct aead_aes_gcm_tls12_ctx *) &ctx->state;
if (nonce_len != 12) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
return 0;
}
// The given nonces must be strictly monotonically increasing.
uint64_t given_counter;
OPENSSL_memcpy(&given_counter, nonce + nonce_len - sizeof(given_counter),
sizeof(given_counter));
given_counter = CRYPTO_bswap8(given_counter);
if (given_counter == UINT64_MAX ||
given_counter < gcm_ctx->min_next_nonce) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE);
return 0;
}
gcm_ctx->min_next_nonce = given_counter + 1;
return aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len,
max_out_tag_len, nonce, nonce_len, in,
in_len, extra_in, extra_in_len, ad, ad_len);
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls12) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 16;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls12_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls12_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls12) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 32;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls12_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls12_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
struct aead_aes_gcm_tls13_ctx {
struct aead_aes_gcm_ctx gcm_ctx;
uint64_t min_next_nonce;
uint64_t mask;
uint8_t first;
};
OPENSSL_STATIC_ASSERT(sizeof(((EVP_AEAD_CTX *)NULL)->state) >=
sizeof(struct aead_aes_gcm_tls13_ctx),
"AEAD state is too small");
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(alignof(union evp_aead_ctx_st_state) >=
alignof(struct aead_aes_gcm_tls13_ctx),
"AEAD state has insufficient alignment");
#endif
static int aead_aes_gcm_tls13_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t requested_tag_len) {
struct aead_aes_gcm_tls13_ctx *gcm_ctx =
(struct aead_aes_gcm_tls13_ctx *) &ctx->state;
gcm_ctx->min_next_nonce = 0;
gcm_ctx->first = 1;
size_t actual_tag_len;
if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len,
requested_tag_len)) {
return 0;
}
ctx->tag_len = actual_tag_len;
return 1;
}
static int aead_aes_gcm_tls13_seal_scatter(
const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag,
size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce,
size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in,
size_t extra_in_len, const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_tls13_ctx *gcm_ctx =
(struct aead_aes_gcm_tls13_ctx *) &ctx->state;
if (nonce_len != 12) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
return 0;
}
// The given nonces must be strictly monotonically increasing. See
// https://tools.ietf.org/html/rfc8446#section-5.3 for details of the TLS 1.3
// nonce construction.
uint64_t given_counter;
OPENSSL_memcpy(&given_counter, nonce + nonce_len - sizeof(given_counter),
sizeof(given_counter));
given_counter = CRYPTO_bswap8(given_counter);
if (gcm_ctx->first) {
// In the first call the sequence number will be zero and therefore the
// given nonce will be 0 ^ mask = mask.
gcm_ctx->mask = given_counter;
gcm_ctx->first = 0;
}
given_counter ^= gcm_ctx->mask;
if (given_counter == UINT64_MAX ||
given_counter < gcm_ctx->min_next_nonce) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE);
return 0;
}
gcm_ctx->min_next_nonce = given_counter + 1;
return aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len,
max_out_tag_len, nonce, nonce_len, in,
in_len, extra_in, extra_in_len, ad, ad_len);
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls13) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 16;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls13_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls13_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls13) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 32;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls13_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls13_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
int EVP_has_aes_hardware(void) {
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
return hwaes_capable() && crypto_gcm_clmul_enabled();
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
return hwaes_capable() && CRYPTO_is_ARMv8_PMULL_capable();
#else
return 0;
#endif
}
OPENSSL_MSVC_PRAGMA(warning(pop))