Patch out the aes_nohw fallback in bsaes_ctr32_encrypt_blocks.

bsaes_ctr32_encrypt_blocks previously fell back to the table-based
aes_nohw_encrypt for inputs under 128 bytes. Instead, just run the usual
bsaes code, though it means we compute more blocks than needed.

This fixes some (but not all) of the timing leaks and is needed for later
bsaes work.

Performance-wise, x86_64 actually sees an improvement for all but tiny
inputs. ARM does see a loss at small inputs, however.

Cortex-A53 (Raspberry Pi 3 Model B+)
Before:
Did 299000 AES-128-GCM (16 bytes) seal operations in 1001123us (298664.6 ops/sec): 4.8 MB/s
Did 236000 AES-128-GCM (32 bytes) seal operations in 1001611us (235620.4 ops/sec): 7.5 MB/s
Did 167000 AES-128-GCM (64 bytes) seal operations in 1005706us (166052.5 ops/sec): 10.6 MB/s
Did 129000 AES-128-GCM (96 bytes) seal operations in 1006129us (128214.2 ops/sec): 12.3 MB/s
Did 116000 AES-128-GCM (112 bytes) seal operations in 1006302us (115273.5 ops/sec): 12.9 MB/s
Did 107000 AES-128-GCM (128 bytes) seal operations in 1000986us (106894.6 ops/sec): 13.7 MB/s
After:
Did 132000 AES-128-GCM (16 bytes) seal operations in 1005165us (131321.7 ops/sec): 2.1 MB/s
Did 128000 AES-128-GCM (32 bytes) seal operations in 1005966us (127240.9 ops/sec): 4.1 MB/s
Did 120000 AES-128-GCM (64 bytes) seal operations in 1003080us (119631.5 ops/sec): 7.7 MB/s
Did 113000 AES-128-GCM (96 bytes) seal operations in 1000557us (112937.1 ops/sec): 10.8 MB/s
Did 110000 AES-128-GCM (112 bytes) seal operations in 1000407us (109955.2 ops/sec): 12.3 MB/s
Did 108000 AES-128-GCM (128 bytes) seal operations in 1008830us (107054.7 ops/sec): 13.7 MB/s
(Inputs 128 bytes and up are unaffected by this CL.)

Nexus 7
Before:
Did 544000 AES-128-GCM (16 bytes) seal operations in 1001282us (543303.5 ops/sec): 8.7 MB/s
Did 475750 AES-128-GCM (32 bytes) seal operations in 1000244us (475633.9 ops/sec): 15.2 MB/s
Did 370500 AES-128-GCM (64 bytes) seal operations in 1000519us (370307.8 ops/sec): 23.7 MB/s
Did 300750 AES-128-GCM (96 bytes) seal operations in 1000122us (300713.3 ops/sec): 28.9 MB/s
Did 275750 AES-128-GCM (112 bytes) seal operations in 1000702us (275556.6 ops/sec): 30.9 MB/s
Did 251000 AES-128-GCM (128 bytes) seal operations in 1000214us (250946.3 ops/sec): 32.1 MB/s
After:
Did 296000 AES-128-GCM (16 bytes) seal operations in 1001129us (295666.2 ops/sec): 4.7 MB/s
Did 288750 AES-128-GCM (32 bytes) seal operations in 1000488us (288609.2 ops/sec): 9.2 MB/s
Did 267250 AES-128-GCM (64 bytes) seal operations in 1000641us (267078.8 ops/sec): 17.1 MB/s
Did 253250 AES-128-GCM (96 bytes) seal operations in 1000915us (253018.5 ops/sec): 24.3 MB/s
Did 248000 AES-128-GCM (112 bytes) seal operations in 1000091us (247977.4 ops/sec): 27.8 MB/s
Did 249000 AES-128-GCM (128 bytes) seal operations in 1000794us (248802.5 ops/sec): 31.8 MB/s

Penryn (Mac mini, mid 2010)
Before:
Did 1331000 AES-128-GCM (16 bytes) seal operations in 1000263us (1330650.0 ops/sec): 21.3 MB/s
Did 991000 AES-128-GCM (32 bytes) seal operations in 1000274us (990728.5 ops/sec): 31.7 MB/s
Did 780000 AES-128-GCM (48 bytes) seal operations in 1000278us (779783.2 ops/sec): 37.4 MB/s
Did 483000 AES-128-GCM (96 bytes) seal operations in 1000137us (482933.8 ops/sec): 46.4 MB/s
Did 428000 AES-128-GCM (112 bytes) seal operations in 1001132us (427516.1 ops/sec): 47.9 MB/s
Did 682000 AES-128-GCM (128 bytes) seal operations in 1000564us (681615.6 ops/sec): 87.2 MB/s
After:
Did 953000 AES-128-GCM (16 bytes) seal operations in 1000385us (952633.2 ops/sec): 15.2 MB/s
Did 903000 AES-128-GCM (32 bytes) seal operations in 1000998us (902099.7 ops/sec): 28.9 MB/s
Did 850000 AES-128-GCM (48 bytes) seal operations in 1000938us (849203.4 ops/sec): 40.8 MB/s
Did 736000 AES-128-GCM (96 bytes) seal operations in 1000886us (735348.5 ops/sec): 70.6 MB/s
Did 702000 AES-128-GCM (112 bytes) seal operations in 1000657us (701539.1 ops/sec): 78.6 MB/s
Did 676000 AES-128-GCM (128 bytes) seal operations in 1000405us (675726.3 ops/sec): 86.5 MB/s

Bug: 256
Change-Id: I9403da607dd1feaff7b3c9b76fe78b66018fb753
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/35166
Reviewed-by: Adam Langley <agl@google.com>
This commit is contained in:
David Benjamin 2019-01-09 03:48:25 +00:00 committed by Adam Langley
parent aadcce380f
commit 885a63fb74
2 changed files with 8 additions and 71 deletions

View File

@ -1393,14 +1393,12 @@ my $const = "r6"; # shared with _bsaes_encrypt8_alt
my $keysched = "sp"; my $keysched = "sp";
$code.=<<___; $code.=<<___;
.extern aes_nohw_encrypt
.global bsaes_ctr32_encrypt_blocks .global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function .type bsaes_ctr32_encrypt_blocks,%function
.align 5 .align 5
bsaes_ctr32_encrypt_blocks: bsaes_ctr32_encrypt_blocks:
cmp $len, #8 @ use plain AES for @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
blo .Lctr_enc_short @ small sizes @ out to retain a constant-time implementation.
mov ip, sp mov ip, sp
stmdb sp!, {r4-r10, lr} stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH VFP_ABI_PUSH
@ -1576,50 +1574,8 @@ bsaes_ctr32_encrypt_blocks:
VFP_ABI_POP VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return ldmia sp!, {r4-r10, pc} @ return
.align 4 @ OpenSSL contains aes_nohw_* fallback code here. We patch this
.Lctr_enc_short: @ out to retain a constant-time implementation.
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4-r8, lr}
mov r4, $inp @ copy arguments
mov r5, $out
mov r6, $len
mov r7, $key
ldr r8, [ip, #12] @ load counter LSW
vld1.8 {@XMM[1]}, [ip] @ load whole counter value
#ifdef __ARMEL__
rev r8, r8
#endif
sub sp, sp, #0x10
vst1.8 {@XMM[1]}, [sp] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
mov r2, r7 @ key
bl aes_nohw_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
str r0, [sp, #0x1c] @ next counter value
#else
str r8, [sp, #0x1c] @ next counter value
#endif
veor @XMM[0],@XMM[0],@XMM[1]
vst1.8 {@XMM[0]}, [r5]! @ store output
subs r6, r6, #1
bne .Lctr_enc_short_loop
vmov.i32 q0, #0
vmov.i32 q1, #0
vstmia sp!, {q0-q1}
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___ ___
} }

View File

@ -811,7 +811,6 @@ ___
$code.=<<___; $code.=<<___;
.text .text
.extern aes_nohw_encrypt
.extern aes_nohw_decrypt .extern aes_nohw_decrypt
.type _bsaes_encrypt8,\@abi-omnipotent .type _bsaes_encrypt8,\@abi-omnipotent
@ -1968,8 +1967,8 @@ $code.=<<___;
mov $arg3, $len mov $arg3, $len
mov $arg4, $key mov $arg4, $key
movdqa %xmm0, 0x20(%rbp) # copy counter movdqa %xmm0, 0x20(%rbp) # copy counter
cmp \$8, $arg3 # In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
jb .Lctr_enc_short # out to retain a constant-time implementation.
mov %eax, %ebx # rounds mov %eax, %ebx # rounds
shl \$7, %rax # 128 bytes per inner round key shl \$7, %rax # 128 bytes per inner round key
@ -2103,27 +2102,9 @@ $code.=<<___;
movdqu 0x60($inp), @XMM[14] movdqu 0x60($inp), @XMM[14]
pxor @XMM[14], @XMM[2] pxor @XMM[14], @XMM[2]
movdqu @XMM[2], 0x60($out) movdqu @XMM[2], 0x60($out)
jmp .Lctr_enc_done
.align 16
.Lctr_enc_short:
lea 0x20(%rbp), $arg1
lea 0x30(%rbp), $arg2
lea ($key), $arg3
call aes_nohw_encrypt
movdqu ($inp), @XMM[1]
lea 16($inp), $inp
mov 0x2c(%rbp), %eax # load 32-bit counter
bswap %eax
pxor 0x30(%rbp), @XMM[1]
inc %eax # increment
movdqu @XMM[1], ($out)
bswap %eax
lea 16($out), $out
mov %eax, 0x2c(%rsp) # save 32-bit counter
dec $len
jnz .Lctr_enc_short
# OpenSSL contains aes_nohw_* fallback code here. We patch this
# out to retain a constant-time implementation.
.Lctr_enc_done: .Lctr_enc_done:
lea (%rsp), %rax lea (%rsp), %rax
pxor %xmm0, %xmm0 pxor %xmm0, %xmm0