Add CFI information to the x86-64 X25519 asm.

This change serves to check that all our consumers can process assembly
with CFI directives in it.

For the first change I picked a file that's not perlasm to keep things
slightly simpler, but that might have been a mistake:

DJB's tooling always aligns the stack to 32 bytes and it's not possible
to express this in DWARF format (without using a register to store the
old stack pointer).

Since none of the functions here appear to care about that alignment, I
removed it from each of them. I also trimmed the set of saved registers
where possible and used the redzone for functions that didn't need much
stack.

Overall, this appears to have slightly improved the performance (by
about 0.7%):

Before:

Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec)

After:

Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec)
Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec)
Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec)

Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691
Reviewed-on: https://boringssl-review.googlesource.com/13200
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
This commit is contained in:
Adam Langley 2017-01-19 10:40:47 -08:00 committed by CQ bot account: commit-bot@chromium.org
parent 8c2480f740
commit 3f38d80b2f

View File

@ -60,17 +60,10 @@ x25519_x86_64__38: .quad 38
.globl C_ABI(x25519_x86_64_freeze)
HIDDEN C_ABI(x25519_x86_64_freeze)
C_ABI(x25519_x86_64_freeze):
mov %rsp,%r11
and $31,%r11
add $64,%r11
sub %r11,%rsp
movq %r11,0(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
.cfi_startproc
/* This is a leaf function and uses the redzone for saving registers. */
movq %r12,-8(%rsp)
.cfi_rel_offset r12, -8
movq 0(%rdi),%rsi
movq 8(%rdi),%rdx
movq 16(%rdi),%rcx
@ -128,44 +121,40 @@ movq %rdx,8(%rdi)
movq %rcx,16(%rdi)
movq %r8,24(%rdi)
movq %r9,32(%rdi)
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
movq -8(%rsp),%r12
ret
.cfi_endproc
.p2align 5
.globl C_ABI(x25519_x86_64_mul)
HIDDEN C_ABI(x25519_x86_64_mul)
C_ABI(x25519_x86_64_mul):
mov %rsp,%r11
and $31,%r11
add $96,%r11
sub %r11,%rsp
movq %r11,0(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
movq %rdi,56(%rsp)
.cfi_startproc
/* This is a leaf function and uses the redzone for saving registers. */
movq %r12,-8(%rsp)
.cfi_rel_offset r12, -8
movq %r13,-16(%rsp)
.cfi_rel_offset r13, -16
movq %r14,-24(%rsp)
.cfi_rel_offset r14, -24
movq %r15,-32(%rsp)
.cfi_rel_offset r15, -32
movq %rbx,-40(%rsp)
.cfi_rel_offset rbx, -40
movq %rbp,-48(%rsp)
.cfi_rel_offset rbp, -48
movq %rdi,-56(%rsp)
.cfi_rel_offset rdi, -56
mov %rdx,%rcx
movq 24(%rsi),%rdx
imulq $19,%rdx,%rax
movq %rax,64(%rsp)
movq %rax,-64(%rsp)
mulq 16(%rcx)
mov %rax,%r8
mov %rdx,%r9
movq 32(%rsi),%rdx
imulq $19,%rdx,%rax
movq %rax,72(%rsp)
movq %rax,-72(%rsp)
mulq 8(%rcx)
add %rax,%r8
adc %rdx,%r9
@ -240,11 +229,11 @@ movq 24(%rsi),%rax
mulq 8(%rcx)
add %rax,%rbx
adc %rdx,%rbp
movq 64(%rsp),%rax
movq -64(%rsp),%rax
mulq 24(%rcx)
add %rax,%r10
adc %rdx,%r11
movq 64(%rsp),%rax
movq -64(%rsp),%rax
mulq 32(%rcx)
add %rax,%r12
adc %rdx,%r13
@ -252,15 +241,15 @@ movq 32(%rsi),%rax
mulq 0(%rcx)
add %rax,%rbx
adc %rdx,%rbp
movq 72(%rsp),%rax
movq -72(%rsp),%rax
mulq 16(%rcx)
add %rax,%r10
adc %rdx,%r11
movq 72(%rsp),%rax
movq -72(%rsp),%rax
mulq 24(%rcx)
add %rax,%r12
adc %rdx,%r13
movq 72(%rsp),%rax
movq -72(%rsp),%rax
mulq 32(%rcx)
add %rax,%r14
adc %rdx,%r15
@ -307,33 +296,31 @@ movq %rcx,8(%rdi)
movq %r9,16(%rdi)
movq %rax,24(%rdi)
movq %r10,32(%rdi)
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
movq -8(%rsp),%r12
movq -16(%rsp),%r13
movq -24(%rsp),%r14
movq -32(%rsp),%r15
movq -40(%rsp),%rbx
movq -48(%rsp),%rbp
ret
.cfi_endproc
.p2align 5
.globl C_ABI(x25519_x86_64_square)
HIDDEN C_ABI(x25519_x86_64_square)
C_ABI(x25519_x86_64_square):
mov %rsp,%r11
and $31,%r11
add $64,%r11
sub %r11,%rsp
movq %r11,0(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
.cfi_startproc
/* This is a leaf function and uses the redzone for saving registers. */
movq %r12,-8(%rsp)
.cfi_rel_offset r12, -8
movq %r13,-16(%rsp)
.cfi_rel_offset r13, -16
movq %r14,-24(%rsp)
.cfi_rel_offset r14, -24
movq %r15,-32(%rsp)
.cfi_rel_offset r15, -32
movq %rbx,-40(%rsp)
.cfi_rel_offset rbx, -40
movq 0(%rsi),%rax
mulq 0(%rsi)
mov %rax,%rcx
@ -449,33 +436,33 @@ movq %r8,8(%rdi)
movq %r9,16(%rdi)
movq %rax,24(%rdi)
movq %r10,32(%rdi)
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
movq -8(%rsp),%r12
movq -16(%rsp),%r13
movq -24(%rsp),%r14
movq -32(%rsp),%r15
movq -40(%rsp),%rbx
ret
.cfi_endproc
.p2align 5
.globl C_ABI(x25519_x86_64_ladderstep)
HIDDEN C_ABI(x25519_x86_64_ladderstep)
C_ABI(x25519_x86_64_ladderstep):
mov %rsp,%r11
and $31,%r11
add $352,%r11
sub %r11,%rsp
movq %r11,0(%rsp)
.cfi_startproc
sub $352,%rsp
.cfi_adjust_cfa_offset 352
movq %r12,8(%rsp)
.cfi_rel_offset r12, 8
movq %r13,16(%rsp)
.cfi_rel_offset r13, 16
movq %r14,24(%rsp)
.cfi_rel_offset r14, 24
movq %r15,32(%rsp)
.cfi_rel_offset r15, 32
movq %rbx,40(%rsp)
.cfi_rel_offset rbx, 40
movq %rbp,48(%rsp)
.cfi_rel_offset rbp, 48
movq 40(%rdi),%rsi
movq 48(%rdi),%rdx
movq 56(%rdi),%rcx
@ -1837,26 +1824,22 @@ movq %r8,88(%rdi)
movq %r9,96(%rdi)
movq %rax,104(%rdi)
movq %r10,112(%rdi)
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
add $352,%rsp
.cfi_adjust_cfa_offset -352
ret
.cfi_endproc
.p2align 5
.globl C_ABI(x25519_x86_64_work_cswap)
HIDDEN C_ABI(x25519_x86_64_work_cswap)
C_ABI(x25519_x86_64_work_cswap):
mov %rsp,%r11
and $31,%r11
add $0,%r11
sub %r11,%rsp
.cfi_startproc
cmp $1,%rsi
movq 0(%rdi),%rsi
movq 80(%rdi),%rdx
@ -1928,10 +1911,10 @@ movq %rsi,64(%rdi)
movq %rdx,144(%rdi)
movq %rcx,72(%rdi)
movq %r8,152(%rdi)
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
.cfi_endproc
#endif /* __x86_64__ */
#endif /* !OPENSSL_NO_ASM */