This patches vpaes-armv8.pl to add vpaes_ctr32_encrypt_blocks. CTR mode
is by far the most important mode these days. It should have access to
_vpaes_encrypt_2x, which gives a considerable speed boost. Also exclude
vpaes_ecb_* as they're not even used.

For iOS, this change is completely a no-op. iOS ARMv8 always has crypto
extensions, and we already statically drop all other AES
implementations.

Android ARMv8 is *not* required to have crypto extensions, but every
ARMv8 device I've seen has them. For those, it is a no-op
performance-wise and a win on size. vpaes appears to be about 5.6KiB
smaller than the tables. ARMv8 always makes SIMD (NEON) available, so
we can statically drop aes_nohw.

In theory, however, crypto-less Android ARMv8 is possible. Today such
chips get a variable-time AES. This CL fixes that, but the performance
story is complex.

The Raspberry Pi 3 is not Android but has a Cortex-A53 chip without
crypto extensions. (But the official images are 32-bit, so even this is
slightly artificial...) There, vpaes is a performance win.

Raspberry Pi 3, Model B+, Cortex-A53
Before:
Did 265000 AES-128-GCM (16 bytes) seal operations in 1003312us (264125.2 ops/sec): 4.2 MB/s
Did 44000 AES-128-GCM (256 bytes) seal operations in 1002141us (43906.0 ops/sec): 11.2 MB/s
Did 9394 AES-128-GCM (1350 bytes) seal operations in 1032104us (9101.8 ops/sec): 12.3 MB/s
Did 1562 AES-128-GCM (8192 bytes) seal operations in 1008982us (1548.1 ops/sec): 12.7 MB/s
After:
Did 277000 AES-128-GCM (16 bytes) seal operations in 1001884us (276479.1 ops/sec): 4.4 MB/s
Did 52000 AES-128-GCM (256 bytes) seal operations in 1001480us (51923.2 ops/sec): 13.3 MB/s
Did 11000 AES-128-GCM (1350 bytes) seal operations in 1007979us (10912.9 ops/sec): 14.7 MB/s
Did 2013 AES-128-GCM (8192 bytes) seal operations in 1085545us (1854.4 ops/sec): 15.2 MB/s

The Pixel 3 has a Cortex-A75 with crypto extensions, so it would never
run this code. However, artificially ignoring them gives another data
point. (ARM documentation[*] suggests the extensions are still optional
on a Cortex-A75.) Sadly, vpaes no longer wins on perf over aes_nohw.
But it is constant-time:

Pixel 3, AES/PMULL extensions ignored, Cortex-A75
Before:
Did 2102000 AES-128-GCM (16 bytes) seal operations in 1000378us (2101205.7 ops/sec): 33.6 MB/s
Did 358000 AES-128-GCM (256 bytes) seal operations in 1002658us (357051.0 ops/sec): 91.4 MB/s
Did 75000 AES-128-GCM (1350 bytes) seal operations in 1012830us (74049.9 ops/sec): 100.0 MB/s
Did 13000 AES-128-GCM (8192 bytes) seal operations in 1036524us (12541.9 ops/sec): 102.7 MB/s
After:
Did 1453000 AES-128-GCM (16 bytes) seal operations in 1000213us (1452690.6 ops/sec): 23.2 MB/s
Did 285000 AES-128-GCM (256 bytes) seal operations in 1002227us (284366.7 ops/sec): 72.8 MB/s
Did 60000 AES-128-GCM (1350 bytes) seal operations in 1016106us (59049.0 ops/sec): 79.7 MB/s
Did 11000 AES-128-GCM (8192 bytes) seal operations in 1094184us (10053.2 ops/sec): 82.4 MB/s

Note the numbers above were taken with PMULL off, so the slow GHASH
dampens the regression. If we test aes_nohw and vpaes paired with PMULL
on, the 20% perf hit becomes a 31% hit. The PMULL-less variant is more
likely to represent a real chip. This is consistent with upstream's
note in the comment, though it is unclear whether 20% is the right
order of magnitude: "these results are worse than scalar
compiler-generated code, but it's constant-time and therefore
preferred".
[*] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100458_0301_00_en/lau1442495529696.html

Bug: 246
Change-Id: If1dc87f5131fce742052498295476fbae4628dbf
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35026
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
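For reference before the diff: the new entry point measures its length
in 16-byte blocks, not bytes, and increments only the low 32 bits of
the counter block, as a big-endian integer, with no carry into the
upper 96 bits. A minimal C model of those semantics (hypothetical code,
not part of this CL; block128_f stands in for any one-block AES
encryption such as vpaes_encrypt):

#include <stdint.h>
#include <string.h>

// Any one-block cipher over a schedule set up elsewhere.
typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                           const void *key);

// Model of what vpaes_ctr32_encrypt_blocks computes: |blocks| counts
// whole 16-byte blocks, and only ivec[12..15] act as a counter.
static void ctr32_encrypt_blocks_model(const uint8_t *in, uint8_t *out,
                                       size_t blocks, const void *key,
                                       const uint8_t ivec[16],
                                       block128_f block) {
  uint8_t counter[16], keystream[16];
  memcpy(counter, ivec, sizeof(counter));
  // The low 32 bits of the counter block, read as big-endian.
  uint32_t ctr = ((uint32_t)counter[12] << 24) |
                 ((uint32_t)counter[13] << 16) |
                 ((uint32_t)counter[14] << 8) | (uint32_t)counter[15];
  while (blocks > 0) {
    block(counter, keystream, key);
    for (int i = 0; i < 16; i++) {
      out[i] = in[i] ^ keystream[i];
    }
    // Increment only the low 32 bits; a wrap at 2^32 never carries
    // into the rest of the IV, matching GCM's counter layout.
    ctr++;
    counter[12] = (uint8_t)(ctr >> 24);
    counter[13] = (uint8_t)(ctr >> 16);
    counter[14] = (uint8_t)(ctr >> 8);
    counter[15] = (uint8_t)ctr;
    in += 16;
    out += 16;
    blocks--;
  }
}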
@@ -69,6 +69,7 @@ if(${ARCH} STREQUAL "aarch64")
     sha1-armv8.${ASM_EXT}
     sha256-armv8.${ASM_EXT}
     sha512-armv8.${ASM_EXT}
+    vpaes-armv8.${ASM_EXT}
   )
 endif()
@@ -120,6 +121,7 @@ perlasm(sha512-586.${ASM_EXT} sha/asm/sha512-586.pl)
 perlasm(sha512-armv4.${ASM_EXT} sha/asm/sha512-armv4.pl)
 perlasm(sha512-armv8.${ASM_EXT} sha/asm/sha512-armv8.pl)
 perlasm(sha512-x86_64.${ASM_EXT} sha/asm/sha512-x86_64.pl)
+perlasm(vpaes-armv8.${ASM_EXT} aes/asm/vpaes-armv8.pl)
 perlasm(vpaes-x86_64.${ASM_EXT} aes/asm/vpaes-x86_64.pl)
 perlasm(vpaes-x86.${ASM_EXT} aes/asm/vpaes-x86.pl)
 perlasm(x86_64-mont5.${ASM_EXT} bn/asm/x86_64-mont5.pl)
@@ -250,6 +250,9 @@ TEST(AESTest, ABI) {
       SCOPED_TRACE(blocks);
       CHECK_ABI(vpaes_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key,
                 block, AES_ENCRYPT);
+#if defined(VPAES_CTR32)
+      CHECK_ABI(vpaes_ctr32_encrypt_blocks, buf, buf, blocks, &key, block);
+#endif
     }

     CHECK_ABI(vpaes_set_decrypt_key, kKey, bits, &key);
@@ -42,7 +42,7 @@ while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
 die "can't locate arm-xlate.pl";

 open OUT,"| \"$^X\" $xlate $flavour $output";
@@ -1171,7 +1171,8 @@ vpaes_cbc_decrypt:
 	ret
 .size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
 ___
-if (1) {
+# We omit vpaes_ecb_* in BoringSSL. They are unused.
+if (0) {
 $code.=<<___;
 .globl	vpaes_ecb_encrypt
 .type	vpaes_ecb_encrypt,%function
@@ -1253,7 +1254,89 @@ vpaes_ecb_decrypt:
 	ret
 .size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
 ___
-} }
+}
+
+my ($ctr, $ctr_tmp) = ("w6", "w7");
+
+# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+#                                 const AES_KEY *key, const uint8_t ivec[16]);
+$code.=<<___;
+.globl	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	$len, .Lctr32_done
+
+	// Note, unlike the other functions, $len here is measured in blocks,
+	// not bytes.
+	mov	x17, $len
+	mov	x2, $key
+
+	// Load the IV and counter portion.
+	ldr	$ctr, [$ivec, #12]
+	ld1	{v7.16b}, [$ivec]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	$ctr, $ctr		// The counter is big-endian.
+	b.eq	.Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [$inp], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [$out], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	$ctr, $ctr, #1
+	rev	$ctr_tmp, $ctr
+	mov	v7.s[3], $ctr_tmp
+	b.ls	.Lctr32_done
+
+.Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	$ctr, $ctr, #1
+	rev	$ctr_tmp, $ctr
+	mov	v15.s[3], $ctr_tmp
+
+.Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [$inp], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #2
+
+	// Update the counter.
+	add	$ctr_tmp, $ctr, #1
+	add	$ctr, $ctr, #2
+	rev	$ctr_tmp, $ctr_tmp
+	mov	v14.s[3], $ctr_tmp
+	rev	$ctr_tmp, $ctr
+	mov	v15.s[3], $ctr_tmp
+	b.hi	.Lctr32_loop
+
+.Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+___
+}

 print $code;
 close STDOUT;
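The structure of the assembly above, rendered in C for readability:
peel one block when the count is odd, then keep two counter blocks in
flight for the paired encryption. This is a hypothetical model, not
part of the CL; encrypt_one and encrypt_two are stand-ins for
_vpaes_encrypt_core and _vpaes_encrypt_2x, which the real code reaches
through registers rather than a C ABI:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Stand-ins for _vpaes_encrypt_core and _vpaes_encrypt_2x (assumed).
void encrypt_one(const uint8_t in[16], uint8_t out[16], const void *key);
void encrypt_two(const uint8_t in0[16], const uint8_t in1[16],
                 uint8_t out0[16], uint8_t out1[16], const void *key);

// The "rev" + "mov v.s[3]" pair in the assembly: write the low 32 bits
// of the counter back into the block, big-endian.
static void store_ctr32(uint8_t block[16], uint32_t ctr) {
  block[12] = (uint8_t)(ctr >> 24);
  block[13] = (uint8_t)(ctr >> 16);
  block[14] = (uint8_t)(ctr >> 8);
  block[15] = (uint8_t)ctr;
}

static void ctr32_model(const uint8_t *in, uint8_t *out, size_t blocks,
                        const void *key, const uint8_t ivec[16]) {
  if (blocks == 0) {
    return;
  }
  uint8_t c0[16], c1[16], ks0[16], ks1[16];
  memcpy(c0, ivec, 16);
  uint32_t ctr = ((uint32_t)c0[12] << 24) | ((uint32_t)c0[13] << 16) |
                 ((uint32_t)c0[14] << 8) | (uint32_t)c0[15];
  if (blocks & 1) {
    // Peel one block so the remainder pairs up for the 2x path.
    encrypt_one(c0, ks0, key);
    for (int i = 0; i < 16; i++) out[i] = in[i] ^ ks0[i];
    in += 16;
    out += 16;
    blocks--;
    store_ctr32(c0, ++ctr);
  }
  memcpy(c1, c0, 16);
  store_ctr32(c1, ctr + 1);  // Two counters in flight: n and n+1.
  while (blocks > 0) {
    encrypt_two(c0, c1, ks0, ks1, key);
    for (int i = 0; i < 16; i++) out[i] = in[i] ^ ks0[i];
    for (int i = 0; i < 16; i++) out[16 + i] = in[16 + i] ^ ks1[i];
    in += 32;
    out += 32;
    blocks -= 2;
    store_ctr32(c0, ctr + 2);  // Advance both lanes by two.
    store_ctr32(c1, ctr + 3);
    ctr += 2;
  }
}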
@@ -35,13 +35,13 @@ OPENSSL_INLINE int hwaes_capable(void) {
 }

 #define VPAES
-OPENSSL_INLINE char vpaes_capable(void) {
+OPENSSL_INLINE int vpaes_capable(void) {
   return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
 }

 #if defined(OPENSSL_X86_64)
 #define BSAES
-OPENSSL_INLINE char bsaes_capable(void) { return vpaes_capable(); }
+OPENSSL_INLINE int bsaes_capable(void) { return vpaes_capable(); }
 #endif  // X86_64

 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
@@ -51,7 +51,13 @@ OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_ARMv8_AES_capable(); }

 #if defined(OPENSSL_ARM)
 #define BSAES
-OPENSSL_INLINE char bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
+OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
+#endif
+
+#if defined(OPENSSL_AARCH64)
+#define VPAES
+#define VPAES_CTR32
+OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); }
 #endif

 #elif defined(OPENSSL_PPC64LE)
@@ -162,6 +168,10 @@ void vpaes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
 void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                        const AES_KEY *key, uint8_t *ivec, int enc);
+#if defined(VPAES_CTR32)
+void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+                                const AES_KEY *key, const uint8_t ivec[16]);
+#endif

 #else

 OPENSSL_INLINE char vpaes_capable(void) { return 0; }
@@ -143,7 +143,15 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
   } else if (vpaes_capable()) {
     ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = vpaes_encrypt;
-    dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? vpaes_cbc_encrypt : NULL;
+    dat->stream.cbc = NULL;
+    if (mode == EVP_CIPH_CBC_MODE) {
+      dat->stream.cbc = vpaes_cbc_encrypt;
+    }
+#if defined(VPAES_CTR32)
+    if (mode == EVP_CIPH_CTR_MODE) {
+      dat->stream.ctr = vpaes_ctr32_encrypt_blocks;
+    }
+#endif
   } else {
     ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = aes_nohw_encrypt;
@@ -253,7 +261,11 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
     if (gcm_key != NULL) {
       CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
     }
+#if defined(VPAES_CTR32)
+    return vpaes_ctr32_encrypt_blocks;
+#else
     return NULL;
+#endif
   }

   aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key);
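For context, a sketch of how a caller consumes the ctr128_f that
aes_ctr_set_key returns: a non-NULL value selects the 32-bit-counter
fast path, while NULL falls back to one block at a time. The
CRYPTO_ctr128_encrypt and CRYPTO_ctr128_encrypt_ctr32 names come from
BoringSSL's modes/internal.h; the wrapper itself and the exact argument
wiring here are assumptions, not part of this CL:

// Hypothetical wrapper. aes_ctr_set_key fills in |ks| and |block| and
// returns an accelerated ctr128_f, or NULL if none is available.
static void ctr_encrypt(const uint8_t *key, size_t key_len, uint8_t iv[16],
                        const uint8_t *in, uint8_t *out, size_t len) {
  AES_KEY ks;
  block128_f block;
  uint8_t ecount_buf[16] = {0};
  unsigned num = 0;
  ctr128_f ctr = aes_ctr_set_key(&ks, /*gcm_key=*/NULL, &block, key, key_len);
  if (ctr != NULL) {
    // Fast path: whole blocks through the 32-bit-counter routine, e.g.
    // vpaes_ctr32_encrypt_blocks on AArch64 without crypto extensions.
    CRYPTO_ctr128_encrypt_ctr32(in, out, len, &ks, iv, ecount_buf, &num, ctr);
  } else {
    // Generic path: one block at a time through |block|.
    CRYPTO_ctr128_encrypt(in, out, len, &ks, iv, ecount_buf, &num, block);
  }
}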