Add CFI information to the x86-64 X25519 asm.

This change serves to check that all our consumers can process assembly
with CFI directives in it.

For the first change I picked a file that's not perlasm to keep things
slightly simpler, but that might have been a mistake:

DJB's tooling always aligns the stack to 32 bytes, and the resulting
variable-sized stack adjustment can't be expressed in DWARF CFI (without
dedicating a register to hold the old stack pointer).
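
For reference, the prologue that tooling emits looks like this (these are
the lines removed below); after the sub, the distance from %rsp back to
the caller's frame depends on the run-time alignment, so no fixed
.cfi_def_cfa_offset can describe it, and the adjustment is kept on the
stack rather than in a register that .cfi_def_cfa_register could name:

    mov %rsp,%r11
    and $31,%r11          /* r11 = %rsp mod 32 */
    add $64,%r11          /* plus 64 bytes of frame */
    sub %r11,%rsp         /* CFA offset from %rsp is now data-dependent */
    movq %r11,0(%rsp)     /* adjustment saved on the stack, not in a register */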

Since none of the functions here appear to care about that alignment, I
removed it from each of them. I also trimmed the set of saved registers
where possible and used the redzone for functions that didn't need much
stack.
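
Concretely, the leaf functions now follow this pattern (condensed from the
x25519_x86_64_freeze change below): %rsp never moves, the callee-saved
register is stashed in the red zone, and a .cfi_rel_offset records where
it was saved:

    .cfi_startproc
    /* This is a leaf function and uses the redzone for saving registers. */
    movq %r12,-8(%rsp)
    .cfi_rel_offset r12, -8
    /* ... limb arithmetic ... */
    movq -8(%rsp),%r12
    ret
    .cfi_endproc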

Overall, this appears to have slightly improved the performance (by
about 0.7%):

Before:

Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec)

After:

Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec)
Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec)
Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec)

Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691
Reviewed-on: https://boringssl-review.googlesource.com/13200
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>

@@ -60,17 +60,10 @@ x25519_x86_64__38: .quad 38
 .globl C_ABI(x25519_x86_64_freeze)
 HIDDEN C_ABI(x25519_x86_64_freeze)
 C_ABI(x25519_x86_64_freeze):
-mov %rsp,%r11
-and $31,%r11
-add $64,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
-movq %r12,8(%rsp)
-movq %r13,16(%rsp)
-movq %r14,24(%rsp)
-movq %r15,32(%rsp)
-movq %rbx,40(%rsp)
-movq %rbp,48(%rsp)
+.cfi_startproc
+/* This is a leaf function and uses the redzone for saving registers. */
+movq %r12,-8(%rsp)
+.cfi_rel_offset r12, -8
 movq 0(%rdi),%rsi
 movq 8(%rdi),%rdx
 movq 16(%rdi),%rcx
@@ -128,44 +121,40 @@ movq %rdx,8(%rdi)
 movq %rcx,16(%rdi)
 movq %r8,24(%rdi)
 movq %r9,32(%rdi)
-movq 0(%rsp),%r11
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+movq -8(%rsp),%r12
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_mul)
 HIDDEN C_ABI(x25519_x86_64_mul)
 C_ABI(x25519_x86_64_mul):
-mov %rsp,%r11
-and $31,%r11
-add $96,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
-movq %r12,8(%rsp)
-movq %r13,16(%rsp)
-movq %r14,24(%rsp)
-movq %r15,32(%rsp)
-movq %rbx,40(%rsp)
-movq %rbp,48(%rsp)
-movq %rdi,56(%rsp)
+.cfi_startproc
+/* This is a leaf function and uses the redzone for saving registers. */
+movq %r12,-8(%rsp)
+.cfi_rel_offset r12, -8
+movq %r13,-16(%rsp)
+.cfi_rel_offset r13, -16
+movq %r14,-24(%rsp)
+.cfi_rel_offset r14, -24
+movq %r15,-32(%rsp)
+.cfi_rel_offset r15, -32
+movq %rbx,-40(%rsp)
+.cfi_rel_offset rbx, -40
+movq %rbp,-48(%rsp)
+.cfi_rel_offset rbp, -48
+movq %rdi,-56(%rsp)
+.cfi_rel_offset rdi, -56
 mov %rdx,%rcx
 movq 24(%rsi),%rdx
 imulq $19,%rdx,%rax
-movq %rax,64(%rsp)
+movq %rax,-64(%rsp)
 mulq 16(%rcx)
 mov %rax,%r8
 mov %rdx,%r9
 movq 32(%rsi),%rdx
 imulq $19,%rdx,%rax
-movq %rax,72(%rsp)
+movq %rax,-72(%rsp)
 mulq 8(%rcx)
 add %rax,%r8
 adc %rdx,%r9
@@ -240,11 +229,11 @@ movq 24(%rsi),%rax
 mulq 8(%rcx)
 add %rax,%rbx
 adc %rdx,%rbp
-movq 64(%rsp),%rax
+movq -64(%rsp),%rax
 mulq 24(%rcx)
 add %rax,%r10
 adc %rdx,%r11
-movq 64(%rsp),%rax
+movq -64(%rsp),%rax
 mulq 32(%rcx)
 add %rax,%r12
 adc %rdx,%r13
@@ -252,15 +241,15 @@ movq 32(%rsi),%rax
 mulq 0(%rcx)
 add %rax,%rbx
 adc %rdx,%rbp
-movq 72(%rsp),%rax
+movq -72(%rsp),%rax
 mulq 16(%rcx)
 add %rax,%r10
 adc %rdx,%r11
-movq 72(%rsp),%rax
+movq -72(%rsp),%rax
 mulq 24(%rcx)
 add %rax,%r12
 adc %rdx,%r13
-movq 72(%rsp),%rax
+movq -72(%rsp),%rax
 mulq 32(%rcx)
 add %rax,%r14
 adc %rdx,%r15
@@ -307,33 +296,31 @@ movq %rcx,8(%rdi)
 movq %r9,16(%rdi)
 movq %rax,24(%rdi)
 movq %r10,32(%rdi)
-movq 0(%rsp),%r11
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+movq -8(%rsp),%r12
+movq -16(%rsp),%r13
+movq -24(%rsp),%r14
+movq -32(%rsp),%r15
+movq -40(%rsp),%rbx
+movq -48(%rsp),%rbp
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_square)
 HIDDEN C_ABI(x25519_x86_64_square)
 C_ABI(x25519_x86_64_square):
-mov %rsp,%r11
-and $31,%r11
-add $64,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
-movq %r12,8(%rsp)
-movq %r13,16(%rsp)
-movq %r14,24(%rsp)
-movq %r15,32(%rsp)
-movq %rbx,40(%rsp)
-movq %rbp,48(%rsp)
+.cfi_startproc
+/* This is a leaf function and uses the redzone for saving registers. */
+movq %r12,-8(%rsp)
+.cfi_rel_offset r12, -8
+movq %r13,-16(%rsp)
+.cfi_rel_offset r13, -16
+movq %r14,-24(%rsp)
+.cfi_rel_offset r14, -24
+movq %r15,-32(%rsp)
+.cfi_rel_offset r15, -32
+movq %rbx,-40(%rsp)
+.cfi_rel_offset rbx, -40
 movq 0(%rsi),%rax
 mulq 0(%rsi)
 mov %rax,%rcx
@@ -449,33 +436,33 @@ movq %r8,8(%rdi)
 movq %r9,16(%rdi)
 movq %rax,24(%rdi)
 movq %r10,32(%rdi)
-movq 0(%rsp),%r11
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+movq -8(%rsp),%r12
+movq -16(%rsp),%r13
+movq -24(%rsp),%r14
+movq -32(%rsp),%r15
+movq -40(%rsp),%rbx
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_ladderstep)
 HIDDEN C_ABI(x25519_x86_64_ladderstep)
 C_ABI(x25519_x86_64_ladderstep):
-mov %rsp,%r11
-and $31,%r11
-add $352,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
+.cfi_startproc
+sub $352,%rsp
+.cfi_adjust_cfa_offset 352
 movq %r12,8(%rsp)
+.cfi_rel_offset r12, 8
 movq %r13,16(%rsp)
+.cfi_rel_offset r13, 16
 movq %r14,24(%rsp)
+.cfi_rel_offset r14, 24
 movq %r15,32(%rsp)
+.cfi_rel_offset r15, 32
 movq %rbx,40(%rsp)
+.cfi_rel_offset rbx, 40
 movq %rbp,48(%rsp)
+.cfi_rel_offset rbp, 48
 movq 40(%rdi),%rsi
 movq 48(%rdi),%rdx
 movq 56(%rdi),%rcx
@@ -1837,26 +1824,22 @@ movq %r8,88(%rdi)
 movq %r9,96(%rdi)
 movq %rax,104(%rdi)
 movq %r10,112(%rdi)
-movq 0(%rsp),%r11
 movq 8(%rsp),%r12
 movq 16(%rsp),%r13
 movq 24(%rsp),%r14
 movq 32(%rsp),%r15
 movq 40(%rsp),%rbx
 movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+add $352,%rsp
+.cfi_adjust_cfa_offset -352
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_work_cswap)
 HIDDEN C_ABI(x25519_x86_64_work_cswap)
 C_ABI(x25519_x86_64_work_cswap):
-mov %rsp,%r11
-and $31,%r11
-add $0,%r11
-sub %r11,%rsp
+.cfi_startproc
 cmp $1,%rsi
 movq 0(%rdi),%rsi
 movq 80(%rdi),%rdx
@@ -1928,10 +1911,10 @@ movq %rsi,64(%rdi)
 movq %rdx,144(%rdi)
 movq %rcx,72(%rdi)
 movq %r8,152(%rdi)
-add %r11,%rsp
 mov %rdi,%rax
 mov %rsi,%rdx
 ret
+.cfi_endproc
 
 #endif /* __x86_64__ */
 #endif /* !OPENSSL_NO_ASM */