From 885a63fb7486a62575d2dc42a73cdefd1c7b0957 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 9 Jan 2019 03:48:25 +0000 Subject: [PATCH] Patch out the aes_nohw fallback in bsaes_ctr32_encrypt_blocks. bsaes_ctr32_encrypt_blocks previously fell back to the table-based aes_nohw_encrypt for inputs under 128 bytes. Instead, just run the usual bsaes code, though it means we compute more blocks than needed. This fixes some (but not all) the timing leaks and is needed for later bsaes work. Performance-wise, x86_64 actually sees a performance improvement for all but tiny inputs. ARM does see a loss at small inputs however. Cortex-A53 (Raspberry Pi 3 Model B+) Before: Did 299000 AES-128-GCM (16 bytes) seal operations in 1001123us (298664.6 ops/sec): 4.8 MB/s Did 236000 AES-128-GCM (32 bytes) seal operations in 1001611us (235620.4 ops/sec): 7.5 MB/s Did 167000 AES-128-GCM (64 bytes) seal operations in 1005706us (166052.5 ops/sec): 10.6 MB/s Did 129000 AES-128-GCM (96 bytes) seal operations in 1006129us (128214.2 ops/sec): 12.3 MB/s Did 116000 AES-128-GCM (112 bytes) seal operations in 1006302us (115273.5 ops/sec): 12.9 MB/s Did 107000 AES-128-GCM (128 bytes) seal operations in 1000986us (106894.6 ops/sec): 13.7 MB/s After: Did 132000 AES-128-GCM (16 bytes) seal operations in 1005165us (131321.7 ops/sec): 2.1 MB/s Did 128000 AES-128-GCM (32 bytes) seal operations in 1005966us (127240.9 ops/sec): 4.1 MB/s Did 120000 AES-128-GCM (64 bytes) seal operations in 1003080us (119631.5 ops/sec): 7.7 MB/s Did 113000 AES-128-GCM (96 bytes) seal operations in 1000557us (112937.1 ops/sec): 10.8 MB/s Did 110000 AES-128-GCM (112 bytes) seal operations in 1000407us (109955.2 ops/sec): 12.3 MB/s Did 108000 AES-128-GCM (128 bytes) seal operations in 1008830us (107054.7 ops/sec): 13.7 MB/s (Inputs 128 bytes and up are unaffected by this CL.) Nexus 7 Before: Did 544000 AES-128-GCM (16 bytes) seal operations in 1001282us (543303.5 ops/sec): 8.7 MB/s Did 475750 AES-128-GCM (32 bytes) seal operations in 1000244us (475633.9 ops/sec): 15.2 MB/s Did 370500 AES-128-GCM (64 bytes) seal operations in 1000519us (370307.8 ops/sec): 23.7 MB/s Did 300750 AES-128-GCM (96 bytes) seal operations in 1000122us (300713.3 ops/sec): 28.9 MB/s Did 275750 AES-128-GCM (112 bytes) seal operations in 1000702us (275556.6 ops/sec): 30.9 MB/s Did 251000 AES-128-GCM (128 bytes) seal operations in 1000214us (250946.3 ops/sec): 32.1 MB/s After: Did 296000 AES-128-GCM (16 bytes) seal operations in 1001129us (295666.2 ops/sec): 4.7 MB/s Did 288750 AES-128-GCM (32 bytes) seal operations in 1000488us (288609.2 ops/sec): 9.2 MB/s Did 267250 AES-128-GCM (64 bytes) seal operations in 1000641us (267078.8 ops/sec): 17.1 MB/s Did 253250 AES-128-GCM (96 bytes) seal operations in 1000915us (253018.5 ops/sec): 24.3 MB/s Did 248000 AES-128-GCM (112 bytes) seal operations in 1000091us (247977.4 ops/sec): 27.8 MB/s Did 249000 AES-128-GCM (128 bytes) seal operations in 1000794us (248802.5 ops/sec): 31.8 MB/s Penryn (Mac mini, mid 2010) Before: Did 1331000 AES-128-GCM (16 bytes) seal operations in 1000263us (1330650.0 ops/sec): 21.3 MB/s Did 991000 AES-128-GCM (32 bytes) seal operations in 1000274us (990728.5 ops/sec): 31.7 MB/s Did 780000 AES-128-GCM (48 bytes) seal operations in 1000278us (779783.2 ops/sec): 37.4 MB/s Did 483000 AES-128-GCM (96 bytes) seal operations in 1000137us (482933.8 ops/sec): 46.4 MB/s Did 428000 AES-128-GCM (112 bytes) seal operations in 1001132us (427516.1 ops/sec): 47.9 MB/s Did 682000 AES-128-GCM (128 bytes) seal operations in 1000564us (681615.6 ops/sec): 87.2 MB/s After: Did 953000 AES-128-GCM (16 bytes) seal operations in 1000385us (952633.2 ops/sec): 15.2 MB/s Did 903000 AES-128-GCM (32 bytes) seal operations in 1000998us (902099.7 ops/sec): 28.9 MB/s Did 850000 AES-128-GCM (48 bytes) seal operations in 1000938us (849203.4 ops/sec): 40.8 MB/s Did 736000 AES-128-GCM (96 bytes) seal operations in 1000886us (735348.5 ops/sec): 70.6 MB/s Did 702000 AES-128-GCM (112 bytes) seal operations in 1000657us (701539.1 ops/sec): 78.6 MB/s Did 676000 AES-128-GCM (128 bytes) seal operations in 1000405us (675726.3 ops/sec): 86.5 MB/s Bug: 256 Change-Id: I9403da607dd1feaff7b3c9b76fe78b66018fb753 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35166 Reviewed-by: Adam Langley --- crypto/fipsmodule/aes/asm/bsaes-armv7.pl | 52 ++--------------------- crypto/fipsmodule/aes/asm/bsaes-x86_64.pl | 27 ++---------- 2 files changed, 8 insertions(+), 71 deletions(-) diff --git a/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/crypto/fipsmodule/aes/asm/bsaes-armv7.pl index 11607d11..c6e0b173 100644 --- a/crypto/fipsmodule/aes/asm/bsaes-armv7.pl +++ b/crypto/fipsmodule/aes/asm/bsaes-armv7.pl @@ -1393,14 +1393,12 @@ my $const = "r6"; # shared with _bsaes_encrypt8_alt my $keysched = "sp"; $code.=<<___; -.extern aes_nohw_encrypt .global bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp $len, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4-r10, lr} VFP_ABI_PUSH @@ -1576,50 +1574,8 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4-r8, lr} - - mov r4, $inp @ copy arguments - mov r5, $out - mov r6, $len - mov r7, $key - ldr r8, [ip, #12] @ load counter LSW - vld1.8 {@XMM[1]}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl aes_nohw_encrypt - - vld1.8 {@XMM[0]}, [r4]! @ load input - vld1.8 {@XMM[1]}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor @XMM[0],@XMM[0],@XMM[1] - vst1.8 {@XMM[0]}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0-q1} - - ldmia sp!, {r4-r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks ___ } diff --git a/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl index 81331bfa..899490f7 100644 --- a/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl +++ b/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl @@ -811,7 +811,6 @@ ___ $code.=<<___; .text -.extern aes_nohw_encrypt .extern aes_nohw_decrypt .type _bsaes_encrypt8,\@abi-omnipotent @@ -1968,8 +1967,8 @@ $code.=<<___; mov $arg3, $len mov $arg4, $key movdqa %xmm0, 0x20(%rbp) # copy counter - cmp \$8, $arg3 - jb .Lctr_enc_short + # In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + # out to retain a constant-time implementation. mov %eax, %ebx # rounds shl \$7, %rax # 128 bytes per inner round key @@ -2103,27 +2102,9 @@ $code.=<<___; movdqu 0x60($inp), @XMM[14] pxor @XMM[14], @XMM[2] movdqu @XMM[2], 0x60($out) - jmp .Lctr_enc_done - -.align 16 -.Lctr_enc_short: - lea 0x20(%rbp), $arg1 - lea 0x30(%rbp), $arg2 - lea ($key), $arg3 - call aes_nohw_encrypt - movdqu ($inp), @XMM[1] - lea 16($inp), $inp - mov 0x2c(%rbp), %eax # load 32-bit counter - bswap %eax - pxor 0x30(%rbp), @XMM[1] - inc %eax # increment - movdqu @XMM[1], ($out) - bswap %eax - lea 16($out), $out - mov %eax, 0x2c(%rsp) # save 32-bit counter - dec $len - jnz .Lctr_enc_short + # OpenSSL contains aes_nohw_* fallback code here. We patch this + # out to retain a constant-time implementation. .Lctr_enc_done: lea (%rsp), %rax pxor %xmm0, %xmm0