boringssl/crypto/fipsmodule/cipher/e_aes.c

1244 lines
36 KiB
C
Raw Normal View History

/* ====================================================================
* Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
*
* 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For written permission, please contact
* openssl-core@openssl.org.
*
* 5. Products derived from this software may not be called "OpenSSL"
* nor may "OpenSSL" appear in their names without prior written
* permission of the OpenSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the OpenSSL Project
* for use in the OpenSSL Toolkit (http://www.openssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ==================================================================== */
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
#include <assert.h>
#include <string.h>
#include <openssl/aead.h>
#include <openssl/aes.h>
#include <openssl/cipher.h>
#include <openssl/cpu.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/nid.h>
#include <openssl/rand.h>
#include "internal.h"
#include "../../internal.h"
#include "../aes/internal.h"
#include "../modes/internal.h"
#include "../delocate.h"
OPENSSL_MSVC_PRAGMA(warning(push))
OPENSSL_MSVC_PRAGMA(warning(disable: 4702)) // Unreachable code.
Fix build when using Visual Studio 2015 Update 1. Many of the compatibility issues are described at https://msdn.microsoft.com/en-us/library/mt612856.aspx. The macros that suppressed warnings on a per-function basis no longer work in Update 1, so replace them with #pragmas. Update 1 warns when |size_t| arguments to |printf| are casted, so stop doing that casting. Unfortunately, this requires an ugly hack to continue working in MSVC 2013 as MSVC 2013 doesn't support "%zu". Finally, Update 1 has new warnings, some of which need to be suppressed. --- Updated by davidben to give up on suppressing warnings in crypto/x509 and crypto/x509v3 as those directories aren't changed much from upstream. In each of these cases, upstream opted just blindly initialize the variable, so do the same. Also switch C4265 to level 4, per Microsoft's recommendation and work around a bug in limits.h that happens to get fixed by Google include order style. (limits.h is sensitive to whether corecrt.h, pulled in by stddef.h and some other headers, is included before it. The reason it affected just one file is we often put the file's header first, which means base.h is pulling in stddef.h. Relying on this is ugly, but it's no worse than what everything else is doing and this doesn't seem worth making something as tame as limits.h so messy to use.) Change-Id: I02d1f935356899f424d3525d03eca401bfa3e6cd Reviewed-on: https://boringssl-review.googlesource.com/7480 Reviewed-by: David Benjamin <davidben@google.com>
2016-01-18 08:21:42 +00:00
typedef struct {
union {
double align;
AES_KEY ks;
} ks;
block128_f block;
union {
cbc128_f cbc;
ctr128_f ctr;
} stream;
} EVP_AES_KEY;
typedef struct {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
GCM128_CONTEXT gcm;
union {
double align;
AES_KEY ks;
} ks; // AES key schedule to use
int key_set; // Set if key initialised
int iv_set; // Set if an iv is set
uint8_t *iv; // Temporary IV store
int ivlen; // IV length
int taglen;
int iv_gen; // It is OK to generate IVs
ctr128_f ctr;
} EVP_AES_GCM_CTX;
static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
Fix build when using Visual Studio 2015 Update 1. Many of the compatibility issues are described at https://msdn.microsoft.com/en-us/library/mt612856.aspx. The macros that suppressed warnings on a per-function basis no longer work in Update 1, so replace them with #pragmas. Update 1 warns when |size_t| arguments to |printf| are casted, so stop doing that casting. Unfortunately, this requires an ugly hack to continue working in MSVC 2013 as MSVC 2013 doesn't support "%zu". Finally, Update 1 has new warnings, some of which need to be suppressed. --- Updated by davidben to give up on suppressing warnings in crypto/x509 and crypto/x509v3 as those directories aren't changed much from upstream. In each of these cases, upstream opted just blindly initialize the variable, so do the same. Also switch C4265 to level 4, per Microsoft's recommendation and work around a bug in limits.h that happens to get fixed by Google include order style. (limits.h is sensitive to whether corecrt.h, pulled in by stddef.h and some other headers, is included before it. The reason it affected just one file is we often put the file's header first, which means base.h is pulling in stddef.h. Relying on this is ugly, but it's no worse than what everything else is doing and this doesn't seem worth making something as tame as limits.h so messy to use.) Change-Id: I02d1f935356899f424d3525d03eca401bfa3e6cd Reviewed-on: https://boringssl-review.googlesource.com/7480 Reviewed-by: David Benjamin <davidben@google.com>
2016-01-18 08:21:42 +00:00
const uint8_t *iv, int enc) {
int ret, mode;
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK;
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) {
if (hwaes_capable()) {
Add PPC64LE assembly for AES-GCM. This change adds AES and GHASH assembly from upstream, with the aim of speeding up AES-GCM. The PPC64LE assembly matches the interface of the ARMv8 assembly so I've changed the prefix of both sets of asm functions to be the same ("aes_hw_"). Otherwise, the new assmebly files and Perlasm match exactly those from upstream's c536b6be1a (from their master branch). Before: Did 1879000 AES-128-GCM (16 bytes) seal operations in 1000428us (1878196.1 ops/sec): 30.1 MB/s Did 61000 AES-128-GCM (1350 bytes) seal operations in 1006660us (60596.4 ops/sec): 81.8 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1072649us (10255.0 ops/sec): 84.0 MB/s Did 1665000 AES-256-GCM (16 bytes) seal operations in 1000591us (1664016.6 ops/sec): 26.6 MB/s Did 52000 AES-256-GCM (1350 bytes) seal operations in 1006971us (51640.0 ops/sec): 69.7 MB/s Did 8840 AES-256-GCM (8192 bytes) seal operations in 1013294us (8724.0 ops/sec): 71.5 MB/s After: Did 4994000 AES-128-GCM (16 bytes) seal operations in 1000017us (4993915.1 ops/sec): 79.9 MB/s Did 1389000 AES-128-GCM (1350 bytes) seal operations in 1000073us (1388898.6 ops/sec): 1875.0 MB/s Did 319000 AES-128-GCM (8192 bytes) seal operations in 1000101us (318967.8 ops/sec): 2613.0 MB/s Did 4668000 AES-256-GCM (16 bytes) seal operations in 1000149us (4667304.6 ops/sec): 74.7 MB/s Did 1202000 AES-256-GCM (1350 bytes) seal operations in 1000646us (1201224.0 ops/sec): 1621.7 MB/s Did 269000 AES-256-GCM (8192 bytes) seal operations in 1002804us (268247.8 ops/sec): 2197.5 MB/s Change-Id: Id848562bd4e1aa79a4683012501dfa5e6c08cfcc Reviewed-on: https://boringssl-review.googlesource.com/11262 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2016-09-23 20:47:24 +01:00
ret = aes_hw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_hw_decrypt;
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = aes_hw_cbc_encrypt;
}
} else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_nohw_decrypt;
dat->stream.cbc = bsaes_cbc_encrypt;
} else if (vpaes_capable()) {
ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = vpaes_decrypt;
dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? vpaes_cbc_encrypt : NULL;
} else {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_nohw_decrypt;
dat->stream.cbc = NULL;
#if defined(AES_NOHW_CBC)
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = aes_nohw_cbc_encrypt;
}
#endif
}
} else if (hwaes_capable()) {
Add PPC64LE assembly for AES-GCM. This change adds AES and GHASH assembly from upstream, with the aim of speeding up AES-GCM. The PPC64LE assembly matches the interface of the ARMv8 assembly so I've changed the prefix of both sets of asm functions to be the same ("aes_hw_"). Otherwise, the new assmebly files and Perlasm match exactly those from upstream's c536b6be1a (from their master branch). Before: Did 1879000 AES-128-GCM (16 bytes) seal operations in 1000428us (1878196.1 ops/sec): 30.1 MB/s Did 61000 AES-128-GCM (1350 bytes) seal operations in 1006660us (60596.4 ops/sec): 81.8 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1072649us (10255.0 ops/sec): 84.0 MB/s Did 1665000 AES-256-GCM (16 bytes) seal operations in 1000591us (1664016.6 ops/sec): 26.6 MB/s Did 52000 AES-256-GCM (1350 bytes) seal operations in 1006971us (51640.0 ops/sec): 69.7 MB/s Did 8840 AES-256-GCM (8192 bytes) seal operations in 1013294us (8724.0 ops/sec): 71.5 MB/s After: Did 4994000 AES-128-GCM (16 bytes) seal operations in 1000017us (4993915.1 ops/sec): 79.9 MB/s Did 1389000 AES-128-GCM (1350 bytes) seal operations in 1000073us (1388898.6 ops/sec): 1875.0 MB/s Did 319000 AES-128-GCM (8192 bytes) seal operations in 1000101us (318967.8 ops/sec): 2613.0 MB/s Did 4668000 AES-256-GCM (16 bytes) seal operations in 1000149us (4667304.6 ops/sec): 74.7 MB/s Did 1202000 AES-256-GCM (1350 bytes) seal operations in 1000646us (1201224.0 ops/sec): 1621.7 MB/s Did 269000 AES-256-GCM (8192 bytes) seal operations in 1002804us (268247.8 ops/sec): 2197.5 MB/s Change-Id: Id848562bd4e1aa79a4683012501dfa5e6c08cfcc Reviewed-on: https://boringssl-review.googlesource.com/11262 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
2016-09-23 20:47:24 +01:00
ret = aes_hw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_hw_encrypt;
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = aes_hw_cbc_encrypt;
} else if (mode == EVP_CIPH_CTR_MODE) {
dat->stream.ctr = aes_hw_ctr32_encrypt_blocks;
}
} else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_nohw_encrypt;
dat->stream.ctr = bsaes_ctr32_encrypt_blocks;
} else if (vpaes_capable()) {
ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = vpaes_encrypt;
Enable vpaes for aarch64, with CTR optimizations. This patches vpaes-armv8.pl to add vpaes_ctr32_encrypt_blocks. CTR mode is by far the most important mode these days. It should have access to _vpaes_encrypt_2x, which gives a considerable speed boost. Also exclude vpaes_ecb_* as they're not even used. For iOS, this change is completely a no-op. iOS ARMv8 always has crypto extensions, and we already statically drop all other AES implementations. Android ARMv8 is *not* required to have crypto extensions, but every ARMv8 device I've seen has them. For those, it is a no-op performance-wise and a win on size. vpaes appears to be about 5.6KiB smaller than the tables. ARMv8 always makes SIMD (NEON) available, so we can statically drop aes_nohw. In theory, however, crypto-less Android ARMv8 is possible. Today such chips get a variable-time AES. This CL fixes this, but the performance story is complex. The Raspberry Pi 3 is not Android but has a Cortex-A53 chip without crypto extensions. (But the official images are 32-bit, so even this is slightly artificial...) There, vpaes is a performance win. Raspberry Pi 3, Model B+, Cortex-A53 Before: Did 265000 AES-128-GCM (16 bytes) seal operations in 1003312us (264125.2 ops/sec): 4.2 MB/s Did 44000 AES-128-GCM (256 bytes) seal operations in 1002141us (43906.0 ops/sec): 11.2 MB/s Did 9394 AES-128-GCM (1350 bytes) seal operations in 1032104us (9101.8 ops/sec): 12.3 MB/s Did 1562 AES-128-GCM (8192 bytes) seal operations in 1008982us (1548.1 ops/sec): 12.7 MB/s After: Did 277000 AES-128-GCM (16 bytes) seal operations in 1001884us (276479.1 ops/sec): 4.4 MB/s Did 52000 AES-128-GCM (256 bytes) seal operations in 1001480us (51923.2 ops/sec): 13.3 MB/s Did 11000 AES-128-GCM (1350 bytes) seal operations in 1007979us (10912.9 ops/sec): 14.7 MB/s Did 2013 AES-128-GCM (8192 bytes) seal operations in 1085545us (1854.4 ops/sec): 15.2 MB/s The Pixel 3 has a Cortex-A75 with crypto extensions, so it would never run this code. However, artificially ignoring them gives another data point (ARM documentation[*] suggests the extensions are still optional on a Cortex-A75.) Sadly, vpaes no longer wins on perf over aes_nohw. But, it is constant-time: Pixel 3, AES/PMULL extensions ignored, Cortex-A75: Before: Did 2102000 AES-128-GCM (16 bytes) seal operations in 1000378us (2101205.7 ops/sec): 33.6 MB/s Did 358000 AES-128-GCM (256 bytes) seal operations in 1002658us (357051.0 ops/sec): 91.4 MB/s Did 75000 AES-128-GCM (1350 bytes) seal operations in 1012830us (74049.9 ops/sec): 100.0 MB/s Did 13000 AES-128-GCM (8192 bytes) seal operations in 1036524us (12541.9 ops/sec): 102.7 MB/s After: Did 1453000 AES-128-GCM (16 bytes) seal operations in 1000213us (1452690.6 ops/sec): 23.2 MB/s Did 285000 AES-128-GCM (256 bytes) seal operations in 1002227us (284366.7 ops/sec): 72.8 MB/s Did 60000 AES-128-GCM (1350 bytes) seal operations in 1016106us (59049.0 ops/sec): 79.7 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1094184us (10053.2 ops/sec): 82.4 MB/s Note the numbers above run with PMULL off, so the slow GHASH is dampening the regression. If we test aes_nohw and vpaes paired with PMULL on, the 20% perf hit becomes a 31% hit. The PMULL-less variant is more likely to represent a real chip. This is consistent with upstream's note in the comment, though it is unclear if 20% is the right order of magnitude: "these results are worse than scalar compiler-generated code, but it's constant-time and therefore preferred". [*] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100458_0301_00_en/lau1442495529696.html Bug: 246 Change-Id: If1dc87f5131fce742052498295476fbae4628dbf Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35026 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-25 21:47:51 +00:00
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = vpaes_cbc_encrypt;
}
#if defined(VPAES_CTR32)
if (mode == EVP_CIPH_CTR_MODE) {
dat->stream.ctr = vpaes_ctr32_encrypt_blocks;
}
#endif
} else {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = aes_nohw_encrypt;
dat->stream.cbc = NULL;
#if defined(AES_NOHW_CBC)
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = aes_nohw_cbc_encrypt;
}
#endif
}
if (ret < 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_AES_KEY_SETUP_FAILED);
return 0;
}
return 1;
}
static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
if (dat->stream.cbc) {
(*dat->stream.cbc)(in, out, len, &dat->ks.ks, ctx->iv, ctx->encrypt);
} else if (ctx->encrypt) {
CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block);
} else {
CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block);
}
return 1;
}
static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
size_t bl = ctx->cipher->block_size;
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
if (len < bl) {
return 1;
}
len -= bl;
for (size_t i = 0; i <= len; i += bl) {
(*dat->block)(in + i, out + i, &dat->ks.ks);
}
return 1;
}
static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
if (dat->stream.ctr) {
CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf,
&ctx->num, dat->stream.ctr);
} else {
CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf,
&ctx->num, dat->block);
}
return 1;
}
static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, &ctx->num,
dat->block);
return 1;
}
ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
block128_f *out_block, const uint8_t *key,
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
size_t key_bytes, int large_inputs) {
if (hwaes_capable()) {
aes_hw_set_encrypt_key(key, key_bytes * 8, aes_key);
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_hw_encrypt, 1);
}
if (out_block) {
*out_block = aes_hw_encrypt;
}
return aes_hw_ctr32_encrypt_blocks;
}
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
const int bsaes_ok = bsaes_capable();
const int vpaes_ok = vpaes_capable();
if (bsaes_ok && (large_inputs || !vpaes_ok)) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key);
if (gcm_key != NULL) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0);
}
if (out_block) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
*out_block = aes_nohw_encrypt;
}
return bsaes_ctr32_encrypt_blocks;
}
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
if (vpaes_ok) {
vpaes_set_encrypt_key(key, key_bytes * 8, aes_key);
if (out_block) {
*out_block = vpaes_encrypt;
}
if (gcm_key != NULL) {
CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
}
Enable vpaes for aarch64, with CTR optimizations. This patches vpaes-armv8.pl to add vpaes_ctr32_encrypt_blocks. CTR mode is by far the most important mode these days. It should have access to _vpaes_encrypt_2x, which gives a considerable speed boost. Also exclude vpaes_ecb_* as they're not even used. For iOS, this change is completely a no-op. iOS ARMv8 always has crypto extensions, and we already statically drop all other AES implementations. Android ARMv8 is *not* required to have crypto extensions, but every ARMv8 device I've seen has them. For those, it is a no-op performance-wise and a win on size. vpaes appears to be about 5.6KiB smaller than the tables. ARMv8 always makes SIMD (NEON) available, so we can statically drop aes_nohw. In theory, however, crypto-less Android ARMv8 is possible. Today such chips get a variable-time AES. This CL fixes this, but the performance story is complex. The Raspberry Pi 3 is not Android but has a Cortex-A53 chip without crypto extensions. (But the official images are 32-bit, so even this is slightly artificial...) There, vpaes is a performance win. Raspberry Pi 3, Model B+, Cortex-A53 Before: Did 265000 AES-128-GCM (16 bytes) seal operations in 1003312us (264125.2 ops/sec): 4.2 MB/s Did 44000 AES-128-GCM (256 bytes) seal operations in 1002141us (43906.0 ops/sec): 11.2 MB/s Did 9394 AES-128-GCM (1350 bytes) seal operations in 1032104us (9101.8 ops/sec): 12.3 MB/s Did 1562 AES-128-GCM (8192 bytes) seal operations in 1008982us (1548.1 ops/sec): 12.7 MB/s After: Did 277000 AES-128-GCM (16 bytes) seal operations in 1001884us (276479.1 ops/sec): 4.4 MB/s Did 52000 AES-128-GCM (256 bytes) seal operations in 1001480us (51923.2 ops/sec): 13.3 MB/s Did 11000 AES-128-GCM (1350 bytes) seal operations in 1007979us (10912.9 ops/sec): 14.7 MB/s Did 2013 AES-128-GCM (8192 bytes) seal operations in 1085545us (1854.4 ops/sec): 15.2 MB/s The Pixel 3 has a Cortex-A75 with crypto extensions, so it would never run this code. However, artificially ignoring them gives another data point (ARM documentation[*] suggests the extensions are still optional on a Cortex-A75.) Sadly, vpaes no longer wins on perf over aes_nohw. But, it is constant-time: Pixel 3, AES/PMULL extensions ignored, Cortex-A75: Before: Did 2102000 AES-128-GCM (16 bytes) seal operations in 1000378us (2101205.7 ops/sec): 33.6 MB/s Did 358000 AES-128-GCM (256 bytes) seal operations in 1002658us (357051.0 ops/sec): 91.4 MB/s Did 75000 AES-128-GCM (1350 bytes) seal operations in 1012830us (74049.9 ops/sec): 100.0 MB/s Did 13000 AES-128-GCM (8192 bytes) seal operations in 1036524us (12541.9 ops/sec): 102.7 MB/s After: Did 1453000 AES-128-GCM (16 bytes) seal operations in 1000213us (1452690.6 ops/sec): 23.2 MB/s Did 285000 AES-128-GCM (256 bytes) seal operations in 1002227us (284366.7 ops/sec): 72.8 MB/s Did 60000 AES-128-GCM (1350 bytes) seal operations in 1016106us (59049.0 ops/sec): 79.7 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1094184us (10053.2 ops/sec): 82.4 MB/s Note the numbers above run with PMULL off, so the slow GHASH is dampening the regression. If we test aes_nohw and vpaes paired with PMULL on, the 20% perf hit becomes a 31% hit. The PMULL-less variant is more likely to represent a real chip. This is consistent with upstream's note in the comment, though it is unclear if 20% is the right order of magnitude: "these results are worse than scalar compiler-generated code, but it's constant-time and therefore preferred". [*] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100458_0301_00_en/lau1442495529696.html Bug: 246 Change-Id: If1dc87f5131fce742052498295476fbae4628dbf Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35026 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-25 21:47:51 +00:00
#if defined(VPAES_CTR32)
return vpaes_ctr32_encrypt_blocks;
#else
return NULL;
Enable vpaes for aarch64, with CTR optimizations. This patches vpaes-armv8.pl to add vpaes_ctr32_encrypt_blocks. CTR mode is by far the most important mode these days. It should have access to _vpaes_encrypt_2x, which gives a considerable speed boost. Also exclude vpaes_ecb_* as they're not even used. For iOS, this change is completely a no-op. iOS ARMv8 always has crypto extensions, and we already statically drop all other AES implementations. Android ARMv8 is *not* required to have crypto extensions, but every ARMv8 device I've seen has them. For those, it is a no-op performance-wise and a win on size. vpaes appears to be about 5.6KiB smaller than the tables. ARMv8 always makes SIMD (NEON) available, so we can statically drop aes_nohw. In theory, however, crypto-less Android ARMv8 is possible. Today such chips get a variable-time AES. This CL fixes this, but the performance story is complex. The Raspberry Pi 3 is not Android but has a Cortex-A53 chip without crypto extensions. (But the official images are 32-bit, so even this is slightly artificial...) There, vpaes is a performance win. Raspberry Pi 3, Model B+, Cortex-A53 Before: Did 265000 AES-128-GCM (16 bytes) seal operations in 1003312us (264125.2 ops/sec): 4.2 MB/s Did 44000 AES-128-GCM (256 bytes) seal operations in 1002141us (43906.0 ops/sec): 11.2 MB/s Did 9394 AES-128-GCM (1350 bytes) seal operations in 1032104us (9101.8 ops/sec): 12.3 MB/s Did 1562 AES-128-GCM (8192 bytes) seal operations in 1008982us (1548.1 ops/sec): 12.7 MB/s After: Did 277000 AES-128-GCM (16 bytes) seal operations in 1001884us (276479.1 ops/sec): 4.4 MB/s Did 52000 AES-128-GCM (256 bytes) seal operations in 1001480us (51923.2 ops/sec): 13.3 MB/s Did 11000 AES-128-GCM (1350 bytes) seal operations in 1007979us (10912.9 ops/sec): 14.7 MB/s Did 2013 AES-128-GCM (8192 bytes) seal operations in 1085545us (1854.4 ops/sec): 15.2 MB/s The Pixel 3 has a Cortex-A75 with crypto extensions, so it would never run this code. However, artificially ignoring them gives another data point (ARM documentation[*] suggests the extensions are still optional on a Cortex-A75.) Sadly, vpaes no longer wins on perf over aes_nohw. But, it is constant-time: Pixel 3, AES/PMULL extensions ignored, Cortex-A75: Before: Did 2102000 AES-128-GCM (16 bytes) seal operations in 1000378us (2101205.7 ops/sec): 33.6 MB/s Did 358000 AES-128-GCM (256 bytes) seal operations in 1002658us (357051.0 ops/sec): 91.4 MB/s Did 75000 AES-128-GCM (1350 bytes) seal operations in 1012830us (74049.9 ops/sec): 100.0 MB/s Did 13000 AES-128-GCM (8192 bytes) seal operations in 1036524us (12541.9 ops/sec): 102.7 MB/s After: Did 1453000 AES-128-GCM (16 bytes) seal operations in 1000213us (1452690.6 ops/sec): 23.2 MB/s Did 285000 AES-128-GCM (256 bytes) seal operations in 1002227us (284366.7 ops/sec): 72.8 MB/s Did 60000 AES-128-GCM (1350 bytes) seal operations in 1016106us (59049.0 ops/sec): 79.7 MB/s Did 11000 AES-128-GCM (8192 bytes) seal operations in 1094184us (10053.2 ops/sec): 82.4 MB/s Note the numbers above run with PMULL off, so the slow GHASH is dampening the regression. If we test aes_nohw and vpaes paired with PMULL on, the 20% perf hit becomes a 31% hit. The PMULL-less variant is more likely to represent a real chip. This is consistent with upstream's note in the comment, though it is unclear if 20% is the right order of magnitude: "these results are worse than scalar compiler-generated code, but it's constant-time and therefore preferred". [*] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100458_0301_00_en/lau1442495529696.html Bug: 246 Change-Id: If1dc87f5131fce742052498295476fbae4628dbf Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35026 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-25 21:47:51 +00:00
#endif
}
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key);
if (gcm_key != NULL) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0);
}
if (out_block) {
Avoid double-dispatch with AES_* vs aes_nohw_*. In particular, consistently pair bsaes with aes_nohw. Ideally the aes_nohw_* calls in bsaes-*.pl would be patched out and bsaes grows its own constant-time key setup (https://crbug.com/boringssl/256), but I'll sort that out separately. In the meantime, avoid going through AES_* which now dispatch. This avoids several nuisances: 1. If we were to add, say, a vpaes-armv7.pl the ABI tests would break. Fundamentally, we cannot assume that an AES_KEY has one and only one representation and must keep everything matching up. 2. AES_* functions should enable vpaes. This makes AES_* faster and constant-time for vector-capable CPUs (https://crbug.com/boringssl/263), relevant for QUIC packet number encryption, allowing us to add vpaes-armv8.pl (https://crbug.com/boringssl/246) without carrying a (likely) mostly unused AES implementation. 3. It's silly to double-dispatch when the EVP layer has already dispatched. 4. We should avoid asm calling into C. Otherwise, we need to test asm for ABI compliance as both caller and callee. Currently we only test it for callee compliance. When asm calls into asm, it *should* comply with the ABI as caller too, but mistakes don't matter as long as the called function triggers it. If the function is asm, this is fixed. If it is C, we must care about arbitrary C compiler output. Bug: 263 Change-Id: Ic85af5c765fd57cbffeaf301c3872bad6c5bbf78 Reviewed-on: https://boringssl-review.googlesource.com/c/34874 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-02-10 04:05:43 +00:00
*out_block = aes_nohw_encrypt;
}
return NULL;
}
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
#if defined(OPENSSL_32_BIT)
#define EVP_AES_GCM_CTX_PADDING (4+8)
#else
#define EVP_AES_GCM_CTX_PADDING 8
#endif
static EVP_AES_GCM_CTX *aes_gcm_from_cipher_ctx(EVP_CIPHER_CTX *ctx) {
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(
alignof(EVP_AES_GCM_CTX) <= 16,
"EVP_AES_GCM_CTX needs more alignment than this function provides");
#endif
// |malloc| guarantees up to 4-byte alignment on 32-bit and 8-byte alignment
// on 64-bit systems, so we need to adjust to reach 16-byte alignment.
assert(ctx->cipher->ctx_size ==
sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING);
char *ptr = ctx->cipher_data;
#if defined(OPENSSL_32_BIT)
assert((uintptr_t)ptr % 4 == 0);
ptr += (uintptr_t)ptr & 4;
#endif
assert((uintptr_t)ptr % 8 == 0);
ptr += (uintptr_t)ptr & 8;
return (EVP_AES_GCM_CTX *)ptr;
}
static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
const uint8_t *iv, int enc) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
if (!iv && !key) {
return 1;
}
if (key) {
OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm));
gctx->ctr = aes_ctr_set_key(&gctx->ks.ks, &gctx->gcm.gcm_key, NULL, key,
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
ctx->key_len, 1 /* large inputs */);
// If we have an iv can set it directly, otherwise use saved IV.
if (iv == NULL && gctx->iv_set) {
iv = gctx->iv;
}
if (iv) {
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, iv, gctx->ivlen);
gctx->iv_set = 1;
}
gctx->key_set = 1;
} else {
// If key set use IV, otherwise copy
if (gctx->key_set) {
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, iv, gctx->ivlen);
} else {
OPENSSL_memcpy(gctx->iv, iv, gctx->ivlen);
}
gctx->iv_set = 1;
gctx->iv_gen = 0;
}
return 1;
}
static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
if (gctx->iv != c->iv) {
OPENSSL_free(gctx->iv);
}
}
// increment counter (64-bit int) by 1
static void ctr64_inc(uint8_t *counter) {
int n = 8;
uint8_t c;
do {
--n;
c = counter[n];
++c;
counter[n] = c;
if (c) {
return;
}
} while (n);
}
static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
switch (type) {
case EVP_CTRL_INIT:
gctx->key_set = 0;
gctx->iv_set = 0;
gctx->ivlen = c->cipher->iv_len;
gctx->iv = c->iv;
gctx->taglen = -1;
gctx->iv_gen = 0;
return 1;
case EVP_CTRL_AEAD_SET_IVLEN:
if (arg <= 0) {
return 0;
}
// Allocate memory for IV if needed
if (arg > EVP_MAX_IV_LENGTH && arg > gctx->ivlen) {
if (gctx->iv != c->iv) {
OPENSSL_free(gctx->iv);
}
gctx->iv = OPENSSL_malloc(arg);
if (!gctx->iv) {
return 0;
}
}
gctx->ivlen = arg;
return 1;
case EVP_CTRL_AEAD_SET_TAG:
if (arg <= 0 || arg > 16 || c->encrypt) {
return 0;
}
OPENSSL_memcpy(c->buf, ptr, arg);
gctx->taglen = arg;
return 1;
case EVP_CTRL_AEAD_GET_TAG:
if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0) {
return 0;
}
OPENSSL_memcpy(ptr, c->buf, arg);
return 1;
case EVP_CTRL_AEAD_SET_IV_FIXED:
// Special case: -1 length restores whole IV
if (arg == -1) {
OPENSSL_memcpy(gctx->iv, ptr, gctx->ivlen);
gctx->iv_gen = 1;
return 1;
}
// Fixed field must be at least 4 bytes and invocation field
// at least 8.
if (arg < 4 || (gctx->ivlen - arg) < 8) {
return 0;
}
if (arg) {
OPENSSL_memcpy(gctx->iv, ptr, arg);
}
if (c->encrypt && !RAND_bytes(gctx->iv + arg, gctx->ivlen - arg)) {
return 0;
}
gctx->iv_gen = 1;
return 1;
case EVP_CTRL_GCM_IV_GEN:
if (gctx->iv_gen == 0 || gctx->key_set == 0) {
return 0;
}
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, gctx->iv, gctx->ivlen);
if (arg <= 0 || arg > gctx->ivlen) {
arg = gctx->ivlen;
}
OPENSSL_memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
// Invocation field will be at least 8 bytes in size and
// so no need to check wrap around or increment more than
// last 8 bytes.
ctr64_inc(gctx->iv + gctx->ivlen - 8);
gctx->iv_set = 1;
return 1;
case EVP_CTRL_GCM_SET_IV_INV:
if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt) {
return 0;
}
OPENSSL_memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, gctx->iv, gctx->ivlen);
gctx->iv_set = 1;
return 1;
case EVP_CTRL_COPY: {
EVP_CIPHER_CTX *out = ptr;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx_out = aes_gcm_from_cipher_ctx(out);
if (gctx->iv == c->iv) {
gctx_out->iv = out->iv;
} else {
gctx_out->iv = OPENSSL_malloc(gctx->ivlen);
if (!gctx_out->iv) {
return 0;
}
OPENSSL_memcpy(gctx_out->iv, gctx->iv, gctx->ivlen);
}
return 1;
}
default:
return -1;
}
}
static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
// If not set up, return error
if (!gctx->key_set) {
return -1;
}
if (!gctx->iv_set) {
return -1;
}
if (in) {
if (out == NULL) {
if (!CRYPTO_gcm128_aad(&gctx->gcm, in, len)) {
return -1;
}
} else if (ctx->encrypt) {
if (gctx->ctr) {
if (!CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len,
gctx->ctr)) {
return -1;
}
} else {
if (!CRYPTO_gcm128_encrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) {
return -1;
}
}
} else {
if (gctx->ctr) {
if (!CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len,
gctx->ctr)) {
return -1;
}
} else {
if (!CRYPTO_gcm128_decrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) {
return -1;
}
}
}
return len;
} else {
if (!ctx->encrypt) {
if (gctx->taglen < 0 ||
!CRYPTO_gcm128_finish(&gctx->gcm, ctx->buf, gctx->taglen)) {
return -1;
}
gctx->iv_set = 0;
return 0;
}
CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
gctx->taglen = 16;
// Don't reuse the IV
gctx->iv_set = 0;
return 0;
}
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_cbc_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_cbc;
out->block_size = 16;
out->key_len = 16;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CBC_MODE;
out->init = aes_init_key;
out->cipher = aes_cbc_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ctr_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ctr;
out->block_size = 1;
out->key_len = 16;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CTR_MODE;
out->init = aes_init_key;
out->cipher = aes_ctr_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ecb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ecb;
out->block_size = 16;
out->key_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ofb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ofb128;
out->block_size = 1;
out->key_len = 16;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_OFB_MODE;
out->init = aes_init_key;
out->cipher = aes_ofb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_gcm_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_gcm;
out->block_size = 1;
out->key_len = 16;
out->iv_len = 12;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
out->init = aes_gcm_init_key;
out->cipher = aes_gcm_cipher;
out->cleanup = aes_gcm_cleanup;
out->ctrl = aes_gcm_ctrl;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_cbc_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_cbc;
out->block_size = 16;
out->key_len = 24;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CBC_MODE;
out->init = aes_init_key;
out->cipher = aes_cbc_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ctr_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ctr;
out->block_size = 1;
out->key_len = 24;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CTR_MODE;
out->init = aes_init_key;
out->cipher = aes_ctr_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ecb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ecb;
out->block_size = 16;
out->key_len = 24;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ofb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ofb128;
out->block_size = 1;
out->key_len = 24;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_OFB_MODE;
out->init = aes_init_key;
out->cipher = aes_ofb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_gcm_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_gcm;
out->block_size = 1;
out->key_len = 24;
out->iv_len = 12;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
out->init = aes_gcm_init_key;
out->cipher = aes_gcm_cipher;
out->cleanup = aes_gcm_cleanup;
out->ctrl = aes_gcm_ctrl;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_cbc_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_cbc;
out->block_size = 16;
out->key_len = 32;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CBC_MODE;
out->init = aes_init_key;
out->cipher = aes_cbc_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ctr_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ctr;
out->block_size = 1;
out->key_len = 32;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_CTR_MODE;
out->init = aes_init_key;
out->cipher = aes_ctr_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ecb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ecb;
out->block_size = 16;
out->key_len = 32;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ofb_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ofb128;
out->block_size = 1;
out->key_len = 32;
out->iv_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_OFB_MODE;
out->init = aes_init_key;
out->cipher = aes_ofb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_gcm_generic) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_gcm;
out->block_size = 1;
out->key_len = 32;
out->iv_len = 12;
Add a constant-time pshufb-based GHASH implementation. We currently require clmul instructions for constant-time GHASH on x86_64. Otherwise, it falls back to a variable-time 4-bit table implementation. However, a significant proportion of clients lack these instructions. Inspired by vpaes, we can use pshufb and a slightly different order of incorporating the bits to make a constant-time GHASH. This requires SSSE3, which is very common. Benchmarking old machines we had on hand, it appears to be a no-op on Sandy Bridge and a small slowdown for Penryn. Sandy Bridge (Intel Pentium CPU 987 @ 1.50GHz): (Note: these numbers are before 16-byte-aligning the table. That was an improvement on Penryn, so it's possible Sandy Bridge is now better.) Before: Did 4244750 AES-128-GCM (16 bytes) seal operations in 4015000us (1057222.9 ops/sec): 16.9 MB/s Did 442000 AES-128-GCM (1350 bytes) seal operations in 4016000us (110059.8 ops/sec): 148.6 MB/s Did 84000 AES-128-GCM (8192 bytes) seal operations in 4015000us (20921.5 ops/sec): 171.4 MB/s Did 3349250 AES-256-GCM (16 bytes) seal operations in 4016000us (833976.6 ops/sec): 13.3 MB/s Did 343500 AES-256-GCM (1350 bytes) seal operations in 4016000us (85532.9 ops/sec): 115.5 MB/s Did 65250 AES-256-GCM (8192 bytes) seal operations in 4015000us (16251.6 ops/sec): 133.1 MB/s After: Did 4229250 AES-128-GCM (16 bytes) seal operations in 4016000us (1053100.1 ops/sec): 16.8 MB/s [-0.4%] Did 442250 AES-128-GCM (1350 bytes) seal operations in 4016000us (110122.0 ops/sec): 148.7 MB/s [+0.1%] Did 83500 AES-128-GCM (8192 bytes) seal operations in 4015000us (20797.0 ops/sec): 170.4 MB/s [-0.6%] Did 3286500 AES-256-GCM (16 bytes) seal operations in 4016000us (818351.6 ops/sec): 13.1 MB/s [-1.9%] Did 342750 AES-256-GCM (1350 bytes) seal operations in 4015000us (85367.4 ops/sec): 115.2 MB/s [-0.2%] Did 65250 AES-256-GCM (8192 bytes) seal operations in 4016000us (16247.5 ops/sec): 133.1 MB/s [-0.0%] Penryn (Intel Core 2 Duo CPU P8600 @ 2.40GHz): Before: Did 1179000 AES-128-GCM (16 bytes) seal operations in 1000139us (1178836.1 ops/sec): 18.9 MB/s Did 97000 AES-128-GCM (1350 bytes) seal operations in 1006347us (96388.2 ops/sec): 130.1 MB/s Did 18000 AES-128-GCM (8192 bytes) seal operations in 1028943us (17493.7 ops/sec): 143.3 MB/s Did 977000 AES-256-GCM (16 bytes) seal operations in 1000197us (976807.6 ops/sec): 15.6 MB/s Did 82000 AES-256-GCM (1350 bytes) seal operations in 1012434us (80992.9 ops/sec): 109.3 MB/s Did 15000 AES-256-GCM (8192 bytes) seal operations in 1006528us (14902.7 ops/sec): 122.1 MB/s After: Did 1306000 AES-128-GCM (16 bytes) seal operations in 1000153us (1305800.2 ops/sec): 20.9 MB/s [+10.8%] Did 94000 AES-128-GCM (1350 bytes) seal operations in 1009852us (93082.9 ops/sec): 125.7 MB/s [-3.4%] Did 17000 AES-128-GCM (8192 bytes) seal operations in 1012096us (16796.8 ops/sec): 137.6 MB/s [-4.0%] Did 1070000 AES-256-GCM (16 bytes) seal operations in 1000929us (1069006.9 ops/sec): 17.1 MB/s [+9.4%] Did 79000 AES-256-GCM (1350 bytes) seal operations in 1002209us (78825.9 ops/sec): 106.4 MB/s [-2.7%] Did 15000 AES-256-GCM (8192 bytes) seal operations in 1061489us (14131.1 ops/sec): 115.8 MB/s [-5.2%] Change-Id: I1c3760a77af7bee4aee3745d1c648d9e34594afb Reviewed-on: https://boringssl-review.googlesource.com/c/34267 Commit-Queue: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com>
2019-01-09 03:35:56 +00:00
out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
out->init = aes_gcm_init_key;
out->cipher = aes_gcm_cipher;
out->cleanup = aes_gcm_cleanup;
out->ctrl = aes_gcm_ctrl;
}
#if defined(HWAES_ECB)
static int aes_hw_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out,
const uint8_t *in, size_t len) {
size_t bl = ctx->cipher->block_size;
if (len < bl) {
return 1;
}
aes_hw_ecb_encrypt(in, out, len, ctx->cipher_data, ctx->encrypt);
return 1;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_128_ecb) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_128_ecb;
out->block_size = 16;
out->key_len = 16;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_hw_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_192_ecb) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_192_ecb;
out->block_size = 16;
out->key_len = 24;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_hw_ecb_cipher;
}
DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_256_ecb) {
memset(out, 0, sizeof(EVP_CIPHER));
out->nid = NID_aes_256_ecb;
out->block_size = 16;
out->key_len = 32;
out->ctx_size = sizeof(EVP_AES_KEY);
out->flags = EVP_CIPH_ECB_MODE;
out->init = aes_init_key;
out->cipher = aes_hw_ecb_cipher;
}
#define EVP_ECB_CIPHER_FUNCTION(keybits) \
const EVP_CIPHER *EVP_aes_##keybits##_ecb(void) { \
if (hwaes_capable()) { \
return aes_hw_##keybits##_ecb(); \
} \
return aes_##keybits##_ecb_generic(); \
}
#else
#define EVP_ECB_CIPHER_FUNCTION(keybits) \
const EVP_CIPHER *EVP_aes_##keybits##_ecb(void) { \
return aes_##keybits##_ecb_generic(); \
}
#endif // HWAES_ECB
#define EVP_CIPHER_FUNCTION(keybits, mode) \
const EVP_CIPHER *EVP_aes_##keybits##_##mode(void) { \
return aes_##keybits##_##mode##_generic(); \
}
EVP_CIPHER_FUNCTION(128, cbc)
EVP_CIPHER_FUNCTION(128, ctr)
EVP_CIPHER_FUNCTION(128, ofb)
EVP_CIPHER_FUNCTION(128, gcm)
EVP_CIPHER_FUNCTION(192, cbc)
EVP_CIPHER_FUNCTION(192, ctr)
EVP_CIPHER_FUNCTION(192, ofb)
EVP_CIPHER_FUNCTION(192, gcm)
EVP_CIPHER_FUNCTION(256, cbc)
EVP_CIPHER_FUNCTION(256, ctr)
EVP_CIPHER_FUNCTION(256, ofb)
EVP_CIPHER_FUNCTION(256, gcm)
EVP_ECB_CIPHER_FUNCTION(128)
EVP_ECB_CIPHER_FUNCTION(192)
EVP_ECB_CIPHER_FUNCTION(256)
#define EVP_AEAD_AES_GCM_TAG_LEN 16
struct aead_aes_gcm_ctx {
union {
double align;
AES_KEY ks;
} ks;
GCM128_KEY gcm_key;
ctr128_f ctr;
};
static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx,
size_t *out_tag_len, const uint8_t *key,
size_t key_len, size_t tag_len) {
const size_t key_bits = key_len * 8;
if (key_bits != 128 && key_bits != 256) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH);
return 0; // EVP_AEAD_CTX_init should catch this.
}
if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) {
tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
}
if (tag_len > EVP_AEAD_AES_GCM_TAG_LEN) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE);
return 0;
}
Don't use bsaes over vpaes for CTR-DRBG. RAND_bytes rarely uses large enough inputs for bsaes to be worth it. https://boringssl-review.googlesource.com/c/boringssl/+/33589 includes some rough benchmarks of various bits here. Some observations: - 8 blocks of bsaes costs roughly 6.5 blocks of vpaes. Note the comparison isn't quite accurate because I'm measuring bsaes_ctr32_encrypt_blocks against vpaes_encrypt and vpaes in CTR mode today must make do with a C loop. Even assuming a cutoff of 6 rather than 7 blocks, it's rare to ask for 96 bytes of entropy at a time. - CTR-DRBG performs some stray block operations (ctr_drbg_update), which bsaes is bad at without extra work to fold them into the CTR loop (not really worth it). - CTR-DRBG calculates a couple new key schedules every RAND_bytes call. We don't currently have a constant-time bsaes key schedule. Unfortunately, even plain vpaes loses to the current aes_nohw used by bsaes, but it's not constant-time. Also taking CTR-DRBG out of the bsaes equation - Machines without AES hardware (clients) are not going to be RNG-bound. It's mostly servers pushing way too many CBC IVs that care. This means bsaes's current side channel tradeoffs make even less sense here. I'm not sure yet what we should do for the rest of the bsaes mess, but it seems clear that we want to stick with vpaes for the RNG. Bug: 256 Change-Id: Iec8f13af232794afd007cb1065913e8117eeee24 Reviewed-on: https://boringssl-review.googlesource.com/c/34744 Reviewed-by: Adam Langley <agl@google.com>
2019-01-12 15:29:10 +00:00
gcm_ctx->ctr = aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key,
key_len, 1 /* large inputs */);
*out_tag_len = tag_len;
return 1;
}
OPENSSL_STATIC_ASSERT(sizeof(((EVP_AEAD_CTX *)NULL)->state) >=
sizeof(struct aead_aes_gcm_ctx),
"AEAD state is too small");
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(alignof(union evp_aead_ctx_st_state) >=
alignof(struct aead_aes_gcm_ctx),
"AEAD state has insufficient alignment");
#endif
static int aead_aes_gcm_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t requested_tag_len) {
struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
size_t actual_tag_len;
if (!aead_aes_gcm_init_impl(gcm_ctx, &actual_tag_len, key, key_len,
requested_tag_len)) {
return 0;
}
ctx->tag_len = actual_tag_len;
return 1;
}
static void aead_aes_gcm_cleanup(EVP_AEAD_CTX *ctx) {}
static int aead_aes_gcm_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out,
uint8_t *out_tag, size_t *out_tag_len,
size_t max_out_tag_len,
const uint8_t *nonce, size_t nonce_len,
const uint8_t *in, size_t in_len,
const uint8_t *extra_in,
size_t extra_in_len,
const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
if (extra_in_len + ctx->tag_len < ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE);
return 0;
}
if (max_out_tag_len < extra_in_len + ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL);
return 0;
}
if (nonce_len == 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
return 0;
}
const AES_KEY *key = &gcm_ctx->ks.ks;
GCM128_CONTEXT gcm;
OPENSSL_memset(&gcm, 0, sizeof(gcm));
OPENSSL_memcpy(&gcm.gcm_key, &gcm_ctx->gcm_key, sizeof(gcm.gcm_key));
CRYPTO_gcm128_setiv(&gcm, key, nonce, nonce_len);
if (ad_len > 0 && !CRYPTO_gcm128_aad(&gcm, ad, ad_len)) {
return 0;
}
if (gcm_ctx->ctr) {
if (!CRYPTO_gcm128_encrypt_ctr32(&gcm, key, in, out, in_len,
gcm_ctx->ctr)) {
return 0;
}
} else {
if (!CRYPTO_gcm128_encrypt(&gcm, key, in, out, in_len)) {
return 0;
}
}
if (extra_in_len) {
if (gcm_ctx->ctr) {
if (!CRYPTO_gcm128_encrypt_ctr32(&gcm, key, extra_in, out_tag,
extra_in_len, gcm_ctx->ctr)) {
return 0;
}
} else {
if (!CRYPTO_gcm128_encrypt(&gcm, key, extra_in, out_tag, extra_in_len)) {
return 0;
}
}
}
CRYPTO_gcm128_tag(&gcm, out_tag + extra_in_len, ctx->tag_len);
*out_tag_len = ctx->tag_len + extra_in_len;
return 1;
}
static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out,
const uint8_t *nonce, size_t nonce_len,
const uint8_t *in, size_t in_len,
const uint8_t *in_tag, size_t in_tag_len,
const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state;
uint8_t tag[EVP_AEAD_AES_GCM_TAG_LEN];
if (nonce_len == 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE);
return 0;
}
if (in_tag_len != ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
}
const AES_KEY *key = &gcm_ctx->ks.ks;
GCM128_CONTEXT gcm;
OPENSSL_memset(&gcm, 0, sizeof(gcm));
OPENSSL_memcpy(&gcm.gcm_key, &gcm_ctx->gcm_key, sizeof(gcm.gcm_key));
CRYPTO_gcm128_setiv(&gcm, key, nonce, nonce_len);
if (!CRYPTO_gcm128_aad(&gcm, ad, ad_len)) {
return 0;
}
if (gcm_ctx->ctr) {
if (!CRYPTO_gcm128_decrypt_ctr32(&gcm, key, in, out, in_len,
gcm_ctx->ctr)) {
return 0;
}
} else {
if (!CRYPTO_gcm128_decrypt(&gcm, key, in, out, in_len)) {
return 0;
}
}
CRYPTO_gcm128_tag(&gcm, tag, ctx->tag_len);
if (CRYPTO_memcmp(tag, in_tag, ctx->tag_len) != 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
}
return 1;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 16;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 32;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
struct aead_aes_gcm_tls12_ctx {
struct aead_aes_gcm_ctx gcm_ctx;
uint64_t min_next_nonce;
};
OPENSSL_STATIC_ASSERT(sizeof(((EVP_AEAD_CTX *)NULL)->state) >=
sizeof(struct aead_aes_gcm_tls12_ctx),
"AEAD state is too small");
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(alignof(union evp_aead_ctx_st_state) >=
alignof(struct aead_aes_gcm_tls12_ctx),
"AEAD state has insufficient alignment");
#endif
static int aead_aes_gcm_tls12_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t requested_tag_len) {
struct aead_aes_gcm_tls12_ctx *gcm_ctx =
(struct aead_aes_gcm_tls12_ctx *) &ctx->state;
gcm_ctx->min_next_nonce = 0;
size_t actual_tag_len;
if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len,
requested_tag_len)) {
return 0;
}
ctx->tag_len = actual_tag_len;
return 1;
}
static int aead_aes_gcm_tls12_seal_scatter(
const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag,
size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce,
size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in,
size_t extra_in_len, const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_tls12_ctx *gcm_ctx =
(struct aead_aes_gcm_tls12_ctx *) &ctx->state;
if (nonce_len != 12) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
return 0;
}
// The given nonces must be strictly monotonically increasing.
uint64_t given_counter;
OPENSSL_memcpy(&given_counter, nonce + nonce_len - sizeof(given_counter),
sizeof(given_counter));
given_counter = CRYPTO_bswap8(given_counter);
if (given_counter == UINT64_MAX ||
given_counter < gcm_ctx->min_next_nonce) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE);
return 0;
}
gcm_ctx->min_next_nonce = given_counter + 1;
return aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len,
max_out_tag_len, nonce, nonce_len, in,
in_len, extra_in, extra_in_len, ad, ad_len);
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls12) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 16;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls12_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls12_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls12) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 32;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls12_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls12_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
struct aead_aes_gcm_tls13_ctx {
struct aead_aes_gcm_ctx gcm_ctx;
uint64_t min_next_nonce;
uint64_t mask;
uint8_t first;
};
OPENSSL_STATIC_ASSERT(sizeof(((EVP_AEAD_CTX *)NULL)->state) >=
sizeof(struct aead_aes_gcm_tls13_ctx),
"AEAD state is too small");
#if defined(__GNUC__) || defined(__clang__)
OPENSSL_STATIC_ASSERT(alignof(union evp_aead_ctx_st_state) >=
alignof(struct aead_aes_gcm_tls13_ctx),
"AEAD state has insufficient alignment");
#endif
static int aead_aes_gcm_tls13_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t requested_tag_len) {
struct aead_aes_gcm_tls13_ctx *gcm_ctx =
(struct aead_aes_gcm_tls13_ctx *) &ctx->state;
gcm_ctx->min_next_nonce = 0;
gcm_ctx->first = 1;
size_t actual_tag_len;
if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len,
requested_tag_len)) {
return 0;
}
ctx->tag_len = actual_tag_len;
return 1;
}
static int aead_aes_gcm_tls13_seal_scatter(
const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag,
size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce,
size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in,
size_t extra_in_len, const uint8_t *ad, size_t ad_len) {
struct aead_aes_gcm_tls13_ctx *gcm_ctx =
(struct aead_aes_gcm_tls13_ctx *) &ctx->state;
if (nonce_len != 12) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
return 0;
}
// The given nonces must be strictly monotonically increasing. See
// https://tools.ietf.org/html/rfc8446#section-5.3 for details of the TLS 1.3
// nonce construction.
uint64_t given_counter;
OPENSSL_memcpy(&given_counter, nonce + nonce_len - sizeof(given_counter),
sizeof(given_counter));
given_counter = CRYPTO_bswap8(given_counter);
if (gcm_ctx->first) {
// In the first call the sequence number will be zero and therefore the
// given nonce will be 0 ^ mask = mask.
gcm_ctx->mask = given_counter;
gcm_ctx->first = 0;
}
given_counter ^= gcm_ctx->mask;
if (given_counter == UINT64_MAX ||
given_counter < gcm_ctx->min_next_nonce) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE);
return 0;
}
gcm_ctx->min_next_nonce = given_counter + 1;
return aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len,
max_out_tag_len, nonce, nonce_len, in,
in_len, extra_in, extra_in_len, ad, ad_len);
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls13) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 16;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls13_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls13_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls13) {
memset(out, 0, sizeof(EVP_AEAD));
out->key_len = 32;
out->nonce_len = 12;
out->overhead = EVP_AEAD_AES_GCM_TAG_LEN;
out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN;
out->seal_scatter_supports_extra_in = 1;
out->init = aead_aes_gcm_tls13_init;
out->cleanup = aead_aes_gcm_cleanup;
out->seal_scatter = aead_aes_gcm_tls13_seal_scatter;
out->open_gather = aead_aes_gcm_open_gather;
}
int EVP_has_aes_hardware(void) {
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
return hwaes_capable() && crypto_gcm_clmul_enabled();
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
return hwaes_capable() && CRYPTO_is_ARMv8_PMULL_capable();
#else
return 0;
#endif
}
OPENSSL_MSVC_PRAGMA(warning(pop))