/* Copyright (c) 2017, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#ifndef OPENSSL_HEADER_AES_INTERNAL_H
#define OPENSSL_HEADER_AES_INTERNAL_H

#include <stdlib.h>

#include <openssl/cpu.h>

#if defined(__cplusplus)
extern "C" {
#endif


#if !defined(OPENSSL_NO_ASM)

#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define HWAES
#define HWAES_ECB

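// OPENSSL_ia32cap_get()[1] holds the ECX feature word from CPUID leaf 1;
// bit 57 - 32 = 25 is the AES-NI feature flag.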
OPENSSL_INLINE int hwaes_capable(void) {
  return (OPENSSL_ia32cap_get()[1] & (1 << (57 - 32))) != 0;
}

#define VPAES
#if defined(OPENSSL_X86_64)
#define VPAES_CTR32
#endif
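// Bit 41 - 32 = 9 of the same ECX feature word is the SSSE3 flag, which the
// vpaes implementation requires.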
OPENSSL_INLINE int vpaes_capable(void) {
  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
}

#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#define HWAES

OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_ARMv8_AES_capable(); }

#if defined(OPENSSL_ARM)
#define BSAES
OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
#endif

#if defined(OPENSSL_AARCH64)
#define VPAES
#define VPAES_CTR32
OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); }
#endif

#elif defined(OPENSSL_PPC64LE)
#define HWAES

OPENSSL_INLINE int hwaes_capable(void) {
  return CRYPTO_is_PPC64LE_vcrypto_capable();
}
#endif

#endif  // !NO_ASM


#if defined(HWAES)

int aes_hw_set_encrypt_key(const uint8_t *user_key, const int bits,
                           AES_KEY *key);
int aes_hw_set_decrypt_key(const uint8_t *user_key, const int bits,
                           AES_KEY *key);
void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                        const AES_KEY *key, uint8_t *ivec, const int enc);
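// As the "ctr32" names suggest, this function and the bsaes/vpaes variants
// below treat the final four bytes of |ivec| as a big-endian block counter,
// and |len| counts 16-byte blocks rather than bytes.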
void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
                                 const AES_KEY *key, const uint8_t ivec[16]);

#else

// If HWAES isn't defined then we provide dummy functions for each of the hwaes
// functions. They should never be called; callers are expected to check
// |hwaes_capable| (which here always returns zero) first.
OPENSSL_INLINE int hwaes_capable(void) { return 0; }

OPENSSL_INLINE int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits,
                                          AES_KEY *key) {
  abort();
}

OPENSSL_INLINE int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits,
                                          AES_KEY *key) {
  abort();
}

OPENSSL_INLINE void aes_hw_encrypt(const uint8_t *in, uint8_t *out,
                                   const AES_KEY *key) {
  abort();
}

OPENSSL_INLINE void aes_hw_decrypt(const uint8_t *in, uint8_t *out,
                                   const AES_KEY *key) {
  abort();
}

OPENSSL_INLINE void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out,
                                       size_t length, const AES_KEY *key,
                                       uint8_t *ivec, int enc) {
  abort();
}

OPENSSL_INLINE void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                                size_t len, const AES_KEY *key,
                                                const uint8_t ivec[16]) {
  abort();
}

#endif  // !HWAES


#if defined(HWAES_ECB)
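// |aes_hw_ecb_encrypt| processes |length| bytes, which should be a multiple of
// the 16-byte block size; nonzero |enc| selects encryption, zero decryption.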
void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                        const AES_KEY *key, const int enc);
#endif  // HWAES_ECB


#if defined(BSAES)
// On platforms where BSAES gets defined (just above), these functions are
// provided by asm. Note |bsaes_cbc_encrypt| requires |enc| to be zero, i.e.
// the assembly only implements CBC decryption.
void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                       const AES_KEY *key, uint8_t ivec[16], int enc);
void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
                                const AES_KEY *key, const uint8_t ivec[16]);
#else
OPENSSL_INLINE int bsaes_capable(void) { return 0; }

// On other platforms, bsaes_capable() will always return false and so the
// following will never be called.
OPENSSL_INLINE void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out,
                                      size_t length, const AES_KEY *key,
                                      uint8_t ivec[16], int enc) {
  abort();
}

OPENSSL_INLINE void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                               size_t len, const AES_KEY *key,
                                               const uint8_t ivec[16]) {
  abort();
}
#endif  // !BSAES


#if defined(VPAES)
// On platforms where VPAES gets defined (just above), these functions are
// provided by asm.
int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, AES_KEY *key);
int vpaes_set_decrypt_key(const uint8_t *userKey, int bits, AES_KEY *key);

void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void vpaes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);

void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                       const AES_KEY *key, uint8_t *ivec, int enc);
#if defined(VPAES_CTR32)
void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
                                const AES_KEY *key, const uint8_t ivec[16]);
#endif
#else
OPENSSL_INLINE int vpaes_capable(void) { return 0; }

// On other platforms, vpaes_capable() will always return false and so the
// following will never be called.
OPENSSL_INLINE int vpaes_set_encrypt_key(const uint8_t *userKey, int bits,
                                         AES_KEY *key) {
  abort();
}
OPENSSL_INLINE int vpaes_set_decrypt_key(const uint8_t *userKey, int bits,
                                         AES_KEY *key) {
  abort();
}
OPENSSL_INLINE void vpaes_encrypt(const uint8_t *in, uint8_t *out,
                                  const AES_KEY *key) {
  abort();
}
OPENSSL_INLINE void vpaes_decrypt(const uint8_t *in, uint8_t *out,
                                  const AES_KEY *key) {
  abort();
}
OPENSSL_INLINE void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out,
                                      size_t length, const AES_KEY *key,
                                      uint8_t *ivec, int enc) {
  abort();
}
#endif  // !VPAES


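// aes_nohw is the portable, table-based fallback implementation; unlike the
// implementations above, it does not run in constant time.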
void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey);
int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey);

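// An illustrative sketch (not part of this header's API): callers pick an AES
// implementation at runtime by testing the |*_capable| functions above and
// falling back to the portable aes_nohw functions, roughly as follows. The
// wrapper name |set_encrypt_key| is hypothetical.
//
//   int set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
//     if (hwaes_capable()) {
//       return aes_hw_set_encrypt_key(user_key, bits, key);
//     }
//     if (vpaes_capable()) {
//       return vpaes_set_encrypt_key(user_key, bits, key);
//     }
//     return aes_nohw_set_encrypt_key(user_key, bits, key);
//   }
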
#if !defined(OPENSSL_NO_ASM) && \
    (defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
#define AES_NOHW_CBC
void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t *ivec, const int enc);
#endif


#if defined(__cplusplus)
}  // extern C
#endif

#endif  // OPENSSL_HEADER_AES_INTERNAL_H