boringssl/crypto/curve25519/asm/x25519-asm-x86_64.S

1919 lines
34 KiB
ArmAsm
Raw Normal View History

/* Copyright (c) 2015, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
/* This file is adapted from crypto_scalarmult/curve25519/amd64-51/ in
* SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is in the
* public domain, but the standard ISC license is included above to keep
* licensing simple. */
#if !defined(OPENSSL_NO_ASM)
#if defined(__x86_64__)
/* Constant pool for the routines in this file. Each constant is one 64-bit
 * word (.quad); the pool is aligned to 16 bytes (.p2align 4). */
.data
.p2align 4
/* C_ABI(x) yields the symbol name used for function x on the target platform
 * and HIDDEN is the assembler directive used to restrict its visibility. */
#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif
/* 2^51 - 1: mask that keeps the low 51 bits of a limb. */
x25519_x86_64_REDMASK51: .quad 0x0007FFFFFFFFFFFF
/* 996687872 = 121666 * 2^13. */
x25519_x86_64_121666_213: .quad 996687872
/* 2 * (2^51 - 19) and 2 * (2^51 - 1): going by the names, limb 0 and limbs
 * 1..4 of 2*p for p = 2^255 - 19 in radix 2^51. */
x25519_x86_64_2P0: .quad 0xFFFFFFFFFFFDA
x25519_x86_64_2P1234: .quad 0xFFFFFFFFFFFFE
/* 4 * (2^51 - 19) and 4 * (2^51 - 1): the same layout for 4*p. */
x25519_x86_64_4P0: .quad 0x1FFFFFFFFFFFB4
x25519_x86_64_4P1234: .quad 0x1FFFFFFFFFFFFC
/* NOTE(review): MU0..MU4 are presumably the words of a Barrett-reduction
 * constant for the value stored in ORDER0..ORDER3; no user is visible in this
 * part of the file -- confirm against the code that references them. */
x25519_x86_64_MU0: .quad 0xED9CE5A30A2C131B
x25519_x86_64_MU1: .quad 0x2106215D086329A7
x25519_x86_64_MU2: .quad 0xFFFFFFFFFFFFFFEB
x25519_x86_64_MU3: .quad 0xFFFFFFFFFFFFFFFF
x25519_x86_64_MU4: .quad 0x000000000000000F
/* NOTE(review): presumably the group order
 * 2^252 + 0x14DEF9DEA2F79CD65812631A5CF5D3ED as four 64-bit words, least
 * significant word first -- confirm against the code that references them. */
x25519_x86_64_ORDER0: .quad 0x5812631A5CF5D3ED
x25519_x86_64_ORDER1: .quad 0x14DEF9DEA2F79CD6
x25519_x86_64_ORDER2: .quad 0x0000000000000000
x25519_x86_64_ORDER3: .quad 0x1000000000000000
/* NOTE(review): presumably the five radix-2^51 limbs of the Edwards curve
 * constant 2*d -- confirm against the code that references them. */
x25519_x86_64_EC2D0: .quad 1859910466990425
x25519_x86_64_EC2D1: .quad 932731440258426
x25519_x86_64_EC2D2: .quad 1072319116312658
x25519_x86_64_EC2D3: .quad 1815898335770999
x25519_x86_64_EC2D4: .quad 633789495995903
/* The literal 38 (= 2 * 19) as a memory operand. */
x25519_x86_64__38: .quad 38
.text
.p2align 5
/* x25519_x86_64_freeze
 *
 * Reduces, in place, a field element stored as five 64-bit limbs in radix
 * 2^51 to its canonical representative modulo p = 2^255 - 19.
 *
 * ABI:      System V AMD64 (leaf function; uses the red zone).
 * In:       %rdi = pointer to the five limbs at offsets 0, 8, 16, 24 and 32.
 * Out:      the five limbs at (%rdi) are overwritten with the reduced value.
 *           No return value is produced; %rax holds scratch data on exit.
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11, flags.
 *           %r12 is used but saved to and restored from -8(%rsp).
 *
 * The final subtraction of p is selected with cmov and a mask rather than a
 * data-dependent branch. */
.globl C_ABI(x25519_x86_64_freeze)
HIDDEN C_ABI(x25519_x86_64_freeze)
C_ABI(x25519_x86_64_freeze):
.cfi_startproc
/* This is a leaf function and uses the redzone for saving registers. */
movq %r12,-8(%rsp)
.cfi_rel_offset r12, -8
/* Load the limbs: rsi, rdx, rcx, r8, r9 = limb 0 .. limb 4. */
movq 0(%rdi),%rsi
movq 8(%rdi),%rdx
movq 16(%rdi),%rcx
movq 24(%rdi),%r8
movq 32(%rdi),%r9
/* rax = 2^51 - 1 (limb mask, also limbs 1..4 of p);
 * r10 = 2^51 - 19 (limb 0 of p). */
movq x25519_x86_64_REDMASK51(%rip),%rax
mov %rax,%r10
sub $18,%r10
/* r11 counts three carry-propagation passes and is zero afterwards. */
mov $3,%r11
._reduceloop:
/* One pass: move the bits above bit 50 of each limb into the next limb;
 * the carry out of limb 4 wraps into limb 0 multiplied by 19, since
 * 2^255 = 19 (mod p). r12 is the carry scratch register. */
mov %rsi,%r12
shr $51,%r12
and %rax,%rsi
add %r12,%rdx
mov %rdx,%r12
shr $51,%r12
and %rax,%rdx
add %r12,%rcx
mov %rcx,%r12
shr $51,%r12
and %rax,%rcx
add %r12,%r8
mov %r8,%r12
shr $51,%r12
and %rax,%r8
add %r12,%r9
mov %r9,%r12
shr $51,%r12
and %rax,%r9
imulq $19,%r12,%r12
add %r12,%rsi
sub $1,%r11
/* Loop while the counter is still non-zero after the decrement. */
ja ._reduceloop
/* r12 = 1 if the value is >= p, i.e. limb 0 >= 2^51 - 19 and limbs 1..4 all
 * equal 2^51 - 1; otherwise r12 = 0. r11 is zero here and is the cmov source
 * that clears the flag. */
mov $1,%r12
cmp %r10,%rsi
cmovl %r11,%r12
cmp %rax,%rdx
cmovne %r11,%r12
cmp %rax,%rcx
cmovne %r11,%r12
cmp %rax,%r8
cmovne %r11,%r12
cmp %rax,%r9
cmovne %r11,%r12
/* Turn the flag into an all-ones or all-zeros mask and use it to select
 * either the limbs of p or zero as the subtrahend. */
neg %r12
and %r12,%rax
and %r12,%r10
sub %r10,%rsi
sub %rax,%rdx
sub %rax,%rcx
sub %rax,%r8
sub %rax,%r9
/* Store the reduced limbs back over the input. */
movq %rsi,0(%rdi)
movq %rdx,8(%rdi)
movq %rcx,16(%rdi)
movq %r8,24(%rdi)
movq %r9,32(%rdi)
/* Restore the callee-saved register from the red zone. */
movq -8(%rsp),%r12
ret
.cfi_endproc
.p2align 5
.globl C_ABI(x25519_x86_64_mul)
HIDDEN C_ABI(x25519_x86_64_mul)
C_ABI(x25519_x86_64_mul):
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
.cfi_startproc
/* This is a leaf function and uses the redzone for saving registers. */
movq %r12,-8(%rsp)
.cfi_rel_offset r12, -8
movq %r13,-16(%rsp)
.cfi_rel_offset r13, -16
movq %r14,-24(%rsp)
.cfi_rel_offset r14, -24
movq %r15,-32(%rsp)
.cfi_rel_offset r15, -32
movq %rbx,-40(%rsp)
.cfi_rel_offset rbx, -40
movq %rbp,-48(%rsp)
.cfi_rel_offset rbp, -48
mov %rdx,%rcx
movq 24(%rsi),%rdx
imulq $19,%rdx,%rax
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq %rax,-64(%rsp)
mulq 16(%rcx)
mov %rax,%r8
mov %rdx,%r9
movq 32(%rsi),%rdx
imulq $19,%rdx,%rax
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq %rax,-72(%rsp)
mulq 8(%rcx)
add %rax,%r8
adc %rdx,%r9
movq 0(%rsi),%rax
mulq 0(%rcx)
add %rax,%r8
adc %rdx,%r9
movq 0(%rsi),%rax
mulq 8(%rcx)
mov %rax,%r10
mov %rdx,%r11
movq 0(%rsi),%rax
mulq 16(%rcx)
mov %rax,%r12
mov %rdx,%r13
movq 0(%rsi),%rax
mulq 24(%rcx)
mov %rax,%r14
mov %rdx,%r15
movq 0(%rsi),%rax
mulq 32(%rcx)
mov %rax,%rbx
mov %rdx,%rbp
movq 8(%rsi),%rax
mulq 0(%rcx)
add %rax,%r10
adc %rdx,%r11
movq 8(%rsi),%rax
mulq 8(%rcx)
add %rax,%r12
adc %rdx,%r13
movq 8(%rsi),%rax
mulq 16(%rcx)
add %rax,%r14
adc %rdx,%r15
movq 8(%rsi),%rax
mulq 24(%rcx)
add %rax,%rbx
adc %rdx,%rbp
movq 8(%rsi),%rdx
imulq $19,%rdx,%rax
mulq 32(%rcx)
add %rax,%r8
adc %rdx,%r9
movq 16(%rsi),%rax
mulq 0(%rcx)
add %rax,%r12
adc %rdx,%r13
movq 16(%rsi),%rax
mulq 8(%rcx)
add %rax,%r14
adc %rdx,%r15
movq 16(%rsi),%rax
mulq 16(%rcx)
add %rax,%rbx
adc %rdx,%rbp
movq 16(%rsi),%rdx
imulq $19,%rdx,%rax
mulq 24(%rcx)
add %rax,%r8
adc %rdx,%r9
movq 16(%rsi),%rdx
imulq $19,%rdx,%rax
mulq 32(%rcx)
add %rax,%r10
adc %rdx,%r11
movq 24(%rsi),%rax
mulq 0(%rcx)
add %rax,%r14
adc %rdx,%r15
movq 24(%rsi),%rax
mulq 8(%rcx)
add %rax,%rbx
adc %rdx,%rbp
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq -64(%rsp),%rax
mulq 24(%rcx)
add %rax,%r10
adc %rdx,%r11
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq -64(%rsp),%rax
mulq 32(%rcx)
add %rax,%r12
adc %rdx,%r13
movq 32(%rsi),%rax
mulq 0(%rcx)
add %rax,%rbx
adc %rdx,%rbp
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq -72(%rsp),%rax
mulq 16(%rcx)
add %rax,%r10
adc %rdx,%r11
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq -72(%rsp),%rax
mulq 24(%rcx)
add %rax,%r12
adc %rdx,%r13
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simplier, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
movq -72(%rsp),%rax
mulq 32(%rcx)
add %rax,%r14
adc %rdx,%r15
movq x25519_x86_64_REDMASK51(%rip),%rsi
shld $13,%r8,%r9
and %rsi,%r8
shld $13,%r10,%r11
and %rsi,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rsi,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rsi,%r14
add %r13,%r14
shld $13,%rbx,%rbp
and %rsi,%rbx
add %r15,%rbx
imulq $19,%rbp,%rdx
add %rdx,%r8
mov %r8,%rdx
shr $51,%rdx
add %r10,%rdx
mov %rdx,%rcx
shr $51,%rdx
and %rsi,%r8
add %r12,%rdx
mov %rdx,%r9
shr $51,%rdx
and %rsi,%rcx
add %r14,%rdx
mov %rdx,%rax
shr $51,%rdx
and %rsi,%r9
add %rbx,%rdx
mov %rdx,%r10
shr $51,%rdx
and %rsi,%rax
imulq $19,%rdx,%rdx
add %rdx,%r8
and %rsi,%r10
movq %r8,0(%rdi)
movq %rcx,8(%rdi)
movq %r9,16(%rdi)
movq %rax,24(%rdi)
movq %r10,32(%rdi)
/*
 * [git blame annotation captured with this listing; not part of the assembly]
 *
 * Add CFI information to the x86-64 X25519 asm.
 *
 * This change serves to check that all our consumers can process assembly
 * with CFI directives in it.
 *
 * For the first change I picked a file that's not perlasm to keep things
 * slightly simpler, but that might have been a mistake:
 *
 * DJB's tooling always aligns the stack to 32 bytes and it's not possible to
 * express this in DWARF format (without using a register to store the old
 * stack pointer).
 *
 * Since none of the functions here appear to care about that alignment, I
 * removed it from each of them. I also trimmed the set of saved registers
 * where possible and used the redzone for functions that didn't need much
 * stack.
 *
 * Overall, this appears to have slightly improved the performance (by about
 * 0.7%):
 *
 * Before:
 * Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec)
 *
 * After:
 * Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec)
 * Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec)
 * Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec)
 *
 * Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691
 * Reviewed-on: https://boringssl-review.googlesource.com/13200
 * CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
 * Reviewed-by: David Benjamin <davidben@google.com>
 * Reviewed-by: Adam Langley <agl@google.com>
 * Commit-Queue: Adam Langley <agl@google.com>
 * 2017-01-19 18:40:47 +00:00
 */
movq -8(%rsp),%r12
movq -16(%rsp),%r13
movq -24(%rsp),%r14
movq -32(%rsp),%r15
movq -40(%rsp),%rbx
movq -48(%rsp),%rbp
ret
/*
 * [git blame annotation captured with this listing; not part of the assembly]
 *
 * Add CFI information to the x86-64 X25519 asm.
 *
 * This change serves to check that all our consumers can process assembly
 * with CFI directives in it.
 *
 * For the first change I picked a file that's not perlasm to keep things
 * slightly simpler, but that might have been a mistake:
 *
 * DJB's tooling always aligns the stack to 32 bytes and it's not possible to
 * express this in DWARF format (without using a register to store the old
 * stack pointer).
 *
 * Since none of the functions here appear to care about that alignment, I
 * removed it from each of them. I also trimmed the set of saved registers
 * where possible and used the redzone for functions that didn't need much
 * stack.
 *
 * Overall, this appears to have slightly improved the performance (by about
 * 0.7%):
 *
 * Before:
 * Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec)
 *
 * After:
 * Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec)
 * Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec)
 * Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec)
 *
 * Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691
 * Reviewed-on: https://boringssl-review.googlesource.com/13200
 * CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
 * Reviewed-by: David Benjamin <davidben@google.com>
 * Reviewed-by: Adam Langley <agl@google.com>
 * Commit-Queue: Adam Langley <agl@google.com>
 * 2017-01-19 18:40:47 +00:00
 */
.cfi_endproc
.p2align 5
/*
 * x25519_x86_64_square: square a five-limb field element.
 *
 * ABI:  System V AMD64 (relies on the red zone; %rsp is never adjusted).
 * In:   %rdi = destination; five quadwords are written at 0..32(%rdi).
 *       %rsi = source; five quadwords a0..a4 are read from 0..32(%rsi).
 *       NOTE(review): the C prototype is not visible in this part of the
 *       file; confirm the pointer types against the C caller.
 * Out:  result limbs stored at 0..32(%rdi); nothing is returned in a register.
 * Uses: %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags as scratch.
 *       %r12-%r15 and %rbx are spilled to -8..-40(%rsp) and reloaded before
 *       ret, so callee-saved registers are preserved.
 *
 * All loads from (%rsi) happen before the first store to (%rdi), so the
 * destination may be the same buffer as the source.
 *
 * Representation: limb i carries weight 2^(51*i); limbs are masked with
 * x25519_x86_64_REDMASK51 (2^51 - 1). A partial product a_i*a_j with
 * i+j >= 5 is scaled by 19 and added to accumulator i+j-5, which matches
 * reduction modulo 2^255 - 19 (the Curve25519 field this file is named for).
 * Doubled cross terms therefore use 2 (shl $1) or 38 (= 2*19).
 * NOTE(review): the 64-bit imulq by 19/38 and the shl $1 assume the input
 * limbs are small enough not to overflow 64 bits; the bound is set by the
 * callers and is not visible here.
 *
 * 128-bit accumulators (high:low) after the multiply phase:
 *   r0 = %r8:%rcx   = a0*a0   + 38*a1*a4 + 38*a2*a3
 *   r1 = %r10:%r9   = 2*a0*a1 + 38*a2*a4 + 19*a3*a3
 *   r2 = %r12:%r11  = 2*a0*a2 + a1*a1    + 38*a3*a4
 *   r3 = %r14:%r13  = 2*a0*a3 + 2*a1*a2  + 19*a4*a4
 *   r4 = %rbx:%r15  = 2*a0*a4 + 2*a1*a3  + a2*a2
 *
 * History: the change that added the CFI directives (Change-Id
 * I31df11c45b2ea0bf44dde861d52c27f848331691, 2017-01-19) also removed the
 * 32-byte stack alignment inherited from the original tooling, because it
 * cannot be expressed in DWARF without a spare register and none of these
 * functions appeared to need it; small functions such as this one were
 * switched to the red zone at the same time.
 */
.globl C_ABI(x25519_x86_64_square)
HIDDEN C_ABI(x25519_x86_64_square)
C_ABI(x25519_x86_64_square):
.cfi_startproc
/* This is a leaf function and uses the redzone for saving registers. */
/* Each .cfi_rel_offset records, relative to %rsp, where the register that was
 * just spilled can be found by an unwinder. */
movq %r12,-8(%rsp)
.cfi_rel_offset r12, -8
movq %r13,-16(%rsp)
.cfi_rel_offset r13, -16
movq %r14,-24(%rsp)
.cfi_rel_offset r14, -24
movq %r15,-32(%rsp)
.cfi_rel_offset r15, -32
movq %rbx,-40(%rsp)
.cfi_rel_offset rbx, -40

/* r0 = a0*a0 */
movq 0(%rsi),%rax
mulq 0(%rsi)
mov %rax,%rcx
mov %rdx,%r8
/* r1 = 2*a0*a1 */
movq 0(%rsi),%rax
shl $1,%rax
mulq 8(%rsi)
mov %rax,%r9
mov %rdx,%r10
/* r2 = 2*a0*a2 */
movq 0(%rsi),%rax
shl $1,%rax
mulq 16(%rsi)
mov %rax,%r11
mov %rdx,%r12
/* r3 = 2*a0*a3 */
movq 0(%rsi),%rax
shl $1,%rax
mulq 24(%rsi)
mov %rax,%r13
mov %rdx,%r14
/* r4 = 2*a0*a4 */
movq 0(%rsi),%rax
shl $1,%rax
mulq 32(%rsi)
mov %rax,%r15
mov %rdx,%rbx
/* r2 += a1*a1 */
movq 8(%rsi),%rax
mulq 8(%rsi)
add %rax,%r11
adc %rdx,%r12
/* r3 += 2*a1*a2 */
movq 8(%rsi),%rax
shl $1,%rax
mulq 16(%rsi)
add %rax,%r13
adc %rdx,%r14
/* r4 += 2*a1*a3 */
movq 8(%rsi),%rax
shl $1,%rax
mulq 24(%rsi)
add %rax,%r15
adc %rdx,%rbx
/* r0 += 38*a1*a4 (weight index 5 wraps to 0) */
movq 8(%rsi),%rdx
imulq $38,%rdx,%rax
mulq 32(%rsi)
add %rax,%rcx
adc %rdx,%r8
/* r4 += a2*a2 */
movq 16(%rsi),%rax
mulq 16(%rsi)
add %rax,%r15
adc %rdx,%rbx
/* r0 += 38*a2*a3 */
movq 16(%rsi),%rdx
imulq $38,%rdx,%rax
mulq 24(%rsi)
add %rax,%rcx
adc %rdx,%r8
/* r1 += 38*a2*a4 */
movq 16(%rsi),%rdx
imulq $38,%rdx,%rax
mulq 32(%rsi)
add %rax,%r9
adc %rdx,%r10
/* r1 += 19*a3*a3 */
movq 24(%rsi),%rdx
imulq $19,%rdx,%rax
mulq 24(%rsi)
add %rax,%r9
adc %rdx,%r10
/* r2 += 38*a3*a4 */
movq 24(%rsi),%rdx
imulq $38,%rdx,%rax
mulq 32(%rsi)
add %rax,%r11
adc %rdx,%r12
/* r3 += 19*a4*a4 */
movq 32(%rsi),%rdx
imulq $19,%rdx,%rax
mulq 32(%rsi)
add %rax,%r13
adc %rdx,%r14

/* The source pointer is no longer needed; %rsi now holds the 51-bit mask. */
movq x25519_x86_64_REDMASK51(%rip),%rsi
/* Split every 128-bit accumulator at bit 51: shld $13 leaves (hi:lo) >> 51 in
 * the high register, the mask keeps the low 51 bits. The part above bit 51 of
 * accumulator i is then added to the low part of accumulator i+1. */
shld $13,%rcx,%r8
and %rsi,%rcx
shld $13,%r9,%r10
and %rsi,%r9
add %r8,%r9
shld $13,%r11,%r12
and %rsi,%r11
add %r10,%r11
shld $13,%r13,%r14
and %rsi,%r13
add %r12,%r13
shld $13,%r15,%rbx
and %rsi,%r15
add %r14,%r15
/* The part above bit 51 of r4 wraps around to limb 0 with a factor of 19. */
imulq $19,%rbx,%rdx
add %rdx,%rcx

/* Sequential carry pass over the 64-bit limbs; %rdx carries the running
 * value. Results: limb0=%rcx, limb1=%r8, limb2=%r9, limb3=%rax, limb4=%r10. */
mov %rcx,%rdx
shr $51,%rdx
add %r9,%rdx
and %rsi,%rcx
mov %rdx,%r8
shr $51,%rdx
add %r11,%rdx
and %rsi,%r8
mov %rdx,%r9
shr $51,%rdx
add %r13,%rdx
and %rsi,%r9
mov %rdx,%rax
shr $51,%rdx
add %r15,%rdx
and %rsi,%rax
mov %rdx,%r10
shr $51,%rdx
/* Carry out of limb 4 is folded into limb 0 as 19*carry. Limb 0 is not masked
 * again afterwards, so it may end up slightly above 51 bits. */
imulq $19,%rdx,%rdx
add %rdx,%rcx
and %rsi,%r10

/* Store the five result limbs. */
movq %rcx,0(%rdi)
movq %r8,8(%rdi)
movq %r9,16(%rdi)
movq %rax,24(%rdi)
movq %r10,32(%rdi)

/* Reload the callee-saved registers from the red zone. */
movq -8(%rsp),%r12
movq -16(%rsp),%r13
movq -24(%rsp),%r14
movq -32(%rsp),%r15
movq -40(%rsp),%rbx
ret
.cfi_endproc
.p2align 5
.globl C_ABI(x25519_x86_64_ladderstep)
HIDDEN C_ABI(x25519_x86_64_ladderstep)
C_ABI(x25519_x86_64_ladderstep):
/*
 * [git blame annotation captured with this listing; not part of the assembly]
 *
 * Add CFI information to the x86-64 X25519 asm.
 *
 * This change serves to check that all our consumers can process assembly
 * with CFI directives in it.
 *
 * For the first change I picked a file that's not perlasm to keep things
 * slightly simpler, but that might have been a mistake:
 *
 * DJB's tooling always aligns the stack to 32 bytes and it's not possible to
 * express this in DWARF format (without using a register to store the old
 * stack pointer).
 *
 * Since none of the functions here appear to care about that alignment, I
 * removed it from each of them. I also trimmed the set of saved registers
 * where possible and used the redzone for functions that didn't need much
 * stack.
 *
 * Overall, this appears to have slightly improved the performance (by about
 * 0.7%):
 *
 * Before:
 * Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec)
 *
 * After:
 * Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec)
 * Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec)
 * Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec)
 * Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec)
 * Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec)
 *
 * Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691
 * Reviewed-on: https://boringssl-review.googlesource.com/13200
 * CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
 * Reviewed-by: David Benjamin <davidben@google.com>
 * Reviewed-by: Adam Langley <agl@google.com>
 * Commit-Queue: Adam Langley <agl@google.com>
 * 2017-01-19 18:40:47 +00:00
 */
.cfi_startproc
sub $344,%rsp
.cfi_adjust_cfa_offset 344
movq %r12,296(%rsp)
.cfi_rel_offset r12, 296
movq %r13,304(%rsp)
.cfi_rel_offset r13, 304
movq %r14,312(%rsp)
.cfi_rel_offset r14, 312
movq %r15,320(%rsp)
.cfi_rel_offset r15, 320
movq %rbx,328(%rsp)
.cfi_rel_offset rbx, 328
movq %rbp,336(%rsp)
.cfi_rel_offset rbp, 336
movq 40(%rdi),%rsi
movq 48(%rdi),%rdx
movq 56(%rdi),%rcx
movq 64(%rdi),%r8
movq 72(%rdi),%r9
mov %rsi,%rax
mov %rdx,%r10
mov %rcx,%r11
mov %r8,%r12
mov %r9,%r13
add x25519_x86_64_2P0(%rip),%rax
add x25519_x86_64_2P1234(%rip),%r10
add x25519_x86_64_2P1234(%rip),%r11
add x25519_x86_64_2P1234(%rip),%r12
add x25519_x86_64_2P1234(%rip),%r13
addq 80(%rdi),%rsi
addq 88(%rdi),%rdx
addq 96(%rdi),%rcx
addq 104(%rdi),%r8
addq 112(%rdi),%r9
subq 80(%rdi),%rax
subq 88(%rdi),%r10
subq 96(%rdi),%r11
subq 104(%rdi),%r12
subq 112(%rdi),%r13
movq %rsi,0(%rsp)
movq %rdx,8(%rsp)
movq %rcx,16(%rsp)
movq %r8,24(%rsp)
movq %r9,32(%rsp)
movq %rax,40(%rsp)
movq %r10,48(%rsp)
movq %r11,56(%rsp)
movq %r12,64(%rsp)
movq %r13,72(%rsp)
movq 40(%rsp),%rax
mulq 40(%rsp)
mov %rax,%rsi
mov %rdx,%rcx
movq 40(%rsp),%rax
shl $1,%rax
mulq 48(%rsp)
mov %rax,%r8
mov %rdx,%r9
movq 40(%rsp),%rax
shl $1,%rax
mulq 56(%rsp)
mov %rax,%r10
mov %rdx,%r11
movq 40(%rsp),%rax
shl $1,%rax
mulq 64(%rsp)
mov %rax,%r12
mov %rdx,%r13
movq 40(%rsp),%rax
shl $1,%rax
mulq 72(%rsp)
mov %rax,%r14
mov %rdx,%r15
movq 48(%rsp),%rax
mulq 48(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 48(%rsp),%rax
shl $1,%rax
mulq 56(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 48(%rsp),%rax
shl $1,%rax
mulq 64(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 48(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 72(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 56(%rsp),%rax
mulq 56(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 56(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 64(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 56(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 72(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 64(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 64(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 64(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 72(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 72(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 72(%rsp)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
and %rdx,%rsi
mov %rcx,%r8
shr $51,%rcx
add %r10,%rcx
and %rdx,%r8
mov %rcx,%r9
shr $51,%rcx
add %r12,%rcx
and %rdx,%r9
mov %rcx,%rax
shr $51,%rcx
add %r14,%rcx
and %rdx,%rax
mov %rcx,%r10
shr $51,%rcx
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,80(%rsp)
movq %r8,88(%rsp)
movq %r9,96(%rsp)
movq %rax,104(%rsp)
movq %r10,112(%rsp)
movq 0(%rsp),%rax
mulq 0(%rsp)
mov %rax,%rsi
mov %rdx,%rcx
movq 0(%rsp),%rax
shl $1,%rax
mulq 8(%rsp)
mov %rax,%r8
mov %rdx,%r9
movq 0(%rsp),%rax
shl $1,%rax
mulq 16(%rsp)
mov %rax,%r10
mov %rdx,%r11
movq 0(%rsp),%rax
shl $1,%rax
mulq 24(%rsp)
mov %rax,%r12
mov %rdx,%r13
movq 0(%rsp),%rax
shl $1,%rax
mulq 32(%rsp)
mov %rax,%r14
mov %rdx,%r15
movq 8(%rsp),%rax
mulq 8(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 8(%rsp),%rax
shl $1,%rax
mulq 16(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 8(%rsp),%rax
shl $1,%rax
mulq 24(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 8(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 32(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 16(%rsp),%rax
mulq 16(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 16(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 24(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 16(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 32(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 24(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 24(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 24(%rsp),%rdx
imulq $38,%rdx,%rax
mulq 32(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 32(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 32(%rsp)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
and %rdx,%rsi
mov %rcx,%r8
shr $51,%rcx
add %r10,%rcx
and %rdx,%r8
mov %rcx,%r9
shr $51,%rcx
add %r12,%rcx
and %rdx,%r9
mov %rcx,%rax
shr $51,%rcx
add %r14,%rcx
and %rdx,%rax
mov %rcx,%r10
shr $51,%rcx
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,120(%rsp)
movq %r8,128(%rsp)
movq %r9,136(%rsp)
movq %rax,144(%rsp)
movq %r10,152(%rsp)
mov %rsi,%rsi
mov %r8,%rdx
mov %r9,%rcx
mov %rax,%r8
mov %r10,%r9
add x25519_x86_64_2P0(%rip),%rsi
add x25519_x86_64_2P1234(%rip),%rdx
add x25519_x86_64_2P1234(%rip),%rcx
add x25519_x86_64_2P1234(%rip),%r8
add x25519_x86_64_2P1234(%rip),%r9
subq 80(%rsp),%rsi
subq 88(%rsp),%rdx
subq 96(%rsp),%rcx
subq 104(%rsp),%r8
subq 112(%rsp),%r9
movq %rsi,160(%rsp)
movq %rdx,168(%rsp)
movq %rcx,176(%rsp)
movq %r8,184(%rsp)
movq %r9,192(%rsp)
movq 120(%rdi),%rsi
movq 128(%rdi),%rdx
movq 136(%rdi),%rcx
movq 144(%rdi),%r8
movq 152(%rdi),%r9
mov %rsi,%rax
mov %rdx,%r10
mov %rcx,%r11
mov %r8,%r12
mov %r9,%r13
add x25519_x86_64_2P0(%rip),%rax
add x25519_x86_64_2P1234(%rip),%r10
add x25519_x86_64_2P1234(%rip),%r11
add x25519_x86_64_2P1234(%rip),%r12
add x25519_x86_64_2P1234(%rip),%r13
addq 160(%rdi),%rsi
addq 168(%rdi),%rdx
addq 176(%rdi),%rcx
addq 184(%rdi),%r8
addq 192(%rdi),%r9
subq 160(%rdi),%rax
subq 168(%rdi),%r10
subq 176(%rdi),%r11
subq 184(%rdi),%r12
subq 192(%rdi),%r13
movq %rsi,200(%rsp)
movq %rdx,208(%rsp)
movq %rcx,216(%rsp)
movq %r8,224(%rsp)
movq %r9,232(%rsp)
movq %rax,240(%rsp)
movq %r10,248(%rsp)
movq %r11,256(%rsp)
movq %r12,264(%rsp)
movq %r13,272(%rsp)
movq 224(%rsp),%rsi
imulq $19,%rsi,%rax
movq %rax,280(%rsp)
mulq 56(%rsp)
mov %rax,%rsi
mov %rdx,%rcx
movq 232(%rsp),%rdx
imulq $19,%rdx,%rax
movq %rax,288(%rsp)
mulq 48(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 200(%rsp),%rax
mulq 40(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 200(%rsp),%rax
mulq 48(%rsp)
mov %rax,%r8
mov %rdx,%r9
movq 200(%rsp),%rax
mulq 56(%rsp)
mov %rax,%r10
mov %rdx,%r11
movq 200(%rsp),%rax
mulq 64(%rsp)
mov %rax,%r12
mov %rdx,%r13
movq 200(%rsp),%rax
mulq 72(%rsp)
mov %rax,%r14
mov %rdx,%r15
movq 208(%rsp),%rax
mulq 40(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 208(%rsp),%rax
mulq 48(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 208(%rsp),%rax
mulq 56(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 208(%rsp),%rax
mulq 64(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 208(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 72(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 216(%rsp),%rax
mulq 40(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 216(%rsp),%rax
mulq 48(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 216(%rsp),%rax
mulq 56(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 216(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 64(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 216(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 72(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 224(%rsp),%rax
mulq 40(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 224(%rsp),%rax
mulq 48(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 280(%rsp),%rax
mulq 64(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 280(%rsp),%rax
mulq 72(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 232(%rsp),%rax
mulq 40(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 288(%rsp),%rax
mulq 56(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 288(%rsp),%rax
mulq 64(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 288(%rsp),%rax
mulq 72(%rsp)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
mov %rcx,%r8
shr $51,%rcx
and %rdx,%rsi
add %r10,%rcx
mov %rcx,%r9
shr $51,%rcx
and %rdx,%r8
add %r12,%rcx
mov %rcx,%rax
shr $51,%rcx
and %rdx,%r9
add %r14,%rcx
mov %rcx,%r10
shr $51,%rcx
and %rdx,%rax
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,40(%rsp)
movq %r8,48(%rsp)
movq %r9,56(%rsp)
movq %rax,64(%rsp)
movq %r10,72(%rsp)
movq 264(%rsp),%rsi
imulq $19,%rsi,%rax
movq %rax,200(%rsp)
mulq 16(%rsp)
mov %rax,%rsi
mov %rdx,%rcx
movq 272(%rsp),%rdx
imulq $19,%rdx,%rax
movq %rax,208(%rsp)
mulq 8(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 240(%rsp),%rax
mulq 0(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 240(%rsp),%rax
mulq 8(%rsp)
mov %rax,%r8
mov %rdx,%r9
movq 240(%rsp),%rax
mulq 16(%rsp)
mov %rax,%r10
mov %rdx,%r11
movq 240(%rsp),%rax
mulq 24(%rsp)
mov %rax,%r12
mov %rdx,%r13
movq 240(%rsp),%rax
mulq 32(%rsp)
mov %rax,%r14
mov %rdx,%r15
movq 248(%rsp),%rax
mulq 0(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 248(%rsp),%rax
mulq 8(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 248(%rsp),%rax
mulq 16(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 248(%rsp),%rax
mulq 24(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 248(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 32(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 256(%rsp),%rax
mulq 0(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 256(%rsp),%rax
mulq 8(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 256(%rsp),%rax
mulq 16(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 256(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 24(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 256(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 32(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 264(%rsp),%rax
mulq 0(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 264(%rsp),%rax
mulq 8(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 200(%rsp),%rax
mulq 24(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 200(%rsp),%rax
mulq 32(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 272(%rsp),%rax
mulq 0(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 208(%rsp),%rax
mulq 16(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 208(%rsp),%rax
mulq 24(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 208(%rsp),%rax
mulq 32(%rsp)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
mov %rcx,%r8
shr $51,%rcx
and %rdx,%rsi
add %r10,%rcx
mov %rcx,%r9
shr $51,%rcx
and %rdx,%r8
add %r12,%rcx
mov %rcx,%rax
shr $51,%rcx
and %rdx,%r9
add %r14,%rcx
mov %rcx,%r10
shr $51,%rcx
and %rdx,%rax
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
mov %rsi,%rdx
mov %r8,%rcx
mov %r9,%r11
mov %rax,%r12
mov %r10,%r13
add x25519_x86_64_2P0(%rip),%rdx
add x25519_x86_64_2P1234(%rip),%rcx
add x25519_x86_64_2P1234(%rip),%r11
add x25519_x86_64_2P1234(%rip),%r12
add x25519_x86_64_2P1234(%rip),%r13
addq 40(%rsp),%rsi
addq 48(%rsp),%r8
addq 56(%rsp),%r9
addq 64(%rsp),%rax
addq 72(%rsp),%r10
subq 40(%rsp),%rdx
subq 48(%rsp),%rcx
subq 56(%rsp),%r11
subq 64(%rsp),%r12
subq 72(%rsp),%r13
movq %rsi,120(%rdi)
movq %r8,128(%rdi)
movq %r9,136(%rdi)
movq %rax,144(%rdi)
movq %r10,152(%rdi)
movq %rdx,160(%rdi)
movq %rcx,168(%rdi)
movq %r11,176(%rdi)
movq %r12,184(%rdi)
movq %r13,192(%rdi)
movq 120(%rdi),%rax
mulq 120(%rdi)
mov %rax,%rsi
mov %rdx,%rcx
movq 120(%rdi),%rax
shl $1,%rax
mulq 128(%rdi)
mov %rax,%r8
mov %rdx,%r9
movq 120(%rdi),%rax
shl $1,%rax
mulq 136(%rdi)
mov %rax,%r10
mov %rdx,%r11
movq 120(%rdi),%rax
shl $1,%rax
mulq 144(%rdi)
mov %rax,%r12
mov %rdx,%r13
movq 120(%rdi),%rax
shl $1,%rax
mulq 152(%rdi)
mov %rax,%r14
mov %rdx,%r15
movq 128(%rdi),%rax
mulq 128(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 128(%rdi),%rax
shl $1,%rax
mulq 136(%rdi)
add %rax,%r12
adc %rdx,%r13
movq 128(%rdi),%rax
shl $1,%rax
mulq 144(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 128(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 152(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 136(%rdi),%rax
mulq 136(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 136(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 144(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 136(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 152(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 144(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 144(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 144(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 152(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 152(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 152(%rdi)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
and %rdx,%rsi
mov %rcx,%r8
shr $51,%rcx
add %r10,%rcx
and %rdx,%r8
mov %rcx,%r9
shr $51,%rcx
add %r12,%rcx
and %rdx,%r9
mov %rcx,%rax
shr $51,%rcx
add %r14,%rcx
and %rdx,%rax
mov %rcx,%r10
shr $51,%rcx
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,120(%rdi)
movq %r8,128(%rdi)
movq %r9,136(%rdi)
movq %rax,144(%rdi)
movq %r10,152(%rdi)
movq 160(%rdi),%rax
mulq 160(%rdi)
mov %rax,%rsi
mov %rdx,%rcx
movq 160(%rdi),%rax
shl $1,%rax
mulq 168(%rdi)
mov %rax,%r8
mov %rdx,%r9
movq 160(%rdi),%rax
shl $1,%rax
mulq 176(%rdi)
mov %rax,%r10
mov %rdx,%r11
movq 160(%rdi),%rax
shl $1,%rax
mulq 184(%rdi)
mov %rax,%r12
mov %rdx,%r13
movq 160(%rdi),%rax
shl $1,%rax
mulq 192(%rdi)
mov %rax,%r14
mov %rdx,%r15
movq 168(%rdi),%rax
mulq 168(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 168(%rdi),%rax
shl $1,%rax
mulq 176(%rdi)
add %rax,%r12
adc %rdx,%r13
movq 168(%rdi),%rax
shl $1,%rax
mulq 184(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 168(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 192(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 176(%rdi),%rax
mulq 176(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 176(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 184(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 176(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 192(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 184(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 184(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 184(%rdi),%rdx
imulq $38,%rdx,%rax
mulq 192(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 192(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 192(%rdi)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
and %rdx,%rsi
mov %rcx,%r8
shr $51,%rcx
add %r10,%rcx
and %rdx,%r8
mov %rcx,%r9
shr $51,%rcx
add %r12,%rcx
and %rdx,%r9
mov %rcx,%rax
shr $51,%rcx
add %r14,%rcx
and %rdx,%rax
mov %rcx,%r10
shr $51,%rcx
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,160(%rdi)
movq %r8,168(%rdi)
movq %r9,176(%rdi)
movq %rax,184(%rdi)
movq %r10,192(%rdi)
movq 184(%rdi),%rsi
imulq $19,%rsi,%rax
movq %rax,0(%rsp)
mulq 16(%rdi)
mov %rax,%rsi
mov %rdx,%rcx
movq 192(%rdi),%rdx
imulq $19,%rdx,%rax
movq %rax,8(%rsp)
mulq 8(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 160(%rdi),%rax
mulq 0(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 160(%rdi),%rax
mulq 8(%rdi)
mov %rax,%r8
mov %rdx,%r9
movq 160(%rdi),%rax
mulq 16(%rdi)
mov %rax,%r10
mov %rdx,%r11
movq 160(%rdi),%rax
mulq 24(%rdi)
mov %rax,%r12
mov %rdx,%r13
movq 160(%rdi),%rax
mulq 32(%rdi)
mov %rax,%r14
mov %rdx,%r15
movq 168(%rdi),%rax
mulq 0(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 168(%rdi),%rax
mulq 8(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 168(%rdi),%rax
mulq 16(%rdi)
add %rax,%r12
adc %rdx,%r13
movq 168(%rdi),%rax
mulq 24(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 168(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 32(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 176(%rdi),%rax
mulq 0(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 176(%rdi),%rax
mulq 8(%rdi)
add %rax,%r12
adc %rdx,%r13
movq 176(%rdi),%rax
mulq 16(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 176(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 24(%rdi)
add %rax,%rsi
adc %rdx,%rcx
movq 176(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 32(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 184(%rdi),%rax
mulq 0(%rdi)
add %rax,%r12
adc %rdx,%r13
movq 184(%rdi),%rax
mulq 8(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 0(%rsp),%rax
mulq 24(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 0(%rsp),%rax
mulq 32(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 192(%rdi),%rax
mulq 0(%rdi)
add %rax,%r14
adc %rdx,%r15
movq 8(%rsp),%rax
mulq 16(%rdi)
add %rax,%r8
adc %rdx,%r9
movq 8(%rsp),%rax
mulq 24(%rdi)
add %rax,%r10
adc %rdx,%r11
movq 8(%rsp),%rax
mulq 32(%rdi)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
mov %rcx,%r8
shr $51,%rcx
and %rdx,%rsi
add %r10,%rcx
mov %rcx,%r9
shr $51,%rcx
and %rdx,%r8
add %r12,%rcx
mov %rcx,%rax
shr $51,%rcx
and %rdx,%r9
add %r14,%rcx
mov %rcx,%r10
shr $51,%rcx
and %rdx,%rax
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,160(%rdi)
movq %r8,168(%rdi)
movq %r9,176(%rdi)
movq %rax,184(%rdi)
movq %r10,192(%rdi)
movq 144(%rsp),%rsi
imulq $19,%rsi,%rax
movq %rax,0(%rsp)
mulq 96(%rsp)
mov %rax,%rsi
mov %rdx,%rcx
movq 152(%rsp),%rdx
imulq $19,%rdx,%rax
movq %rax,8(%rsp)
mulq 88(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 120(%rsp),%rax
mulq 80(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 120(%rsp),%rax
mulq 88(%rsp)
mov %rax,%r8
mov %rdx,%r9
movq 120(%rsp),%rax
mulq 96(%rsp)
mov %rax,%r10
mov %rdx,%r11
movq 120(%rsp),%rax
mulq 104(%rsp)
mov %rax,%r12
mov %rdx,%r13
movq 120(%rsp),%rax
mulq 112(%rsp)
mov %rax,%r14
mov %rdx,%r15
movq 128(%rsp),%rax
mulq 80(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 128(%rsp),%rax
mulq 88(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 128(%rsp),%rax
mulq 96(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 128(%rsp),%rax
mulq 104(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 128(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 112(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 136(%rsp),%rax
mulq 80(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 136(%rsp),%rax
mulq 88(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 136(%rsp),%rax
mulq 96(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 136(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 104(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 136(%rsp),%rdx
imulq $19,%rdx,%rax
mulq 112(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 144(%rsp),%rax
mulq 80(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 144(%rsp),%rax
mulq 88(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 0(%rsp),%rax
mulq 104(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 0(%rsp),%rax
mulq 112(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 152(%rsp),%rax
mulq 80(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 8(%rsp),%rax
mulq 96(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 8(%rsp),%rax
mulq 104(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 8(%rsp),%rax
mulq 112(%rsp)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
mov %rcx,%r8
shr $51,%rcx
and %rdx,%rsi
add %r10,%rcx
mov %rcx,%r9
shr $51,%rcx
and %rdx,%r8
add %r12,%rcx
mov %rcx,%rax
shr $51,%rcx
and %rdx,%r9
add %r14,%rcx
mov %rcx,%r10
shr $51,%rcx
and %rdx,%rax
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,40(%rdi)
movq %r8,48(%rdi)
movq %r9,56(%rdi)
movq %rax,64(%rdi)
movq %r10,72(%rdi)
movq 160(%rsp),%rax
mulq x25519_x86_64_121666_213(%rip)
shr $13,%rax
mov %rax,%rsi
mov %rdx,%rcx
movq 168(%rsp),%rax
mulq x25519_x86_64_121666_213(%rip)
shr $13,%rax
add %rax,%rcx
mov %rdx,%r8
movq 176(%rsp),%rax
mulq x25519_x86_64_121666_213(%rip)
shr $13,%rax
add %rax,%r8
mov %rdx,%r9
movq 184(%rsp),%rax
mulq x25519_x86_64_121666_213(%rip)
shr $13,%rax
add %rax,%r9
mov %rdx,%r10
movq 192(%rsp),%rax
mulq x25519_x86_64_121666_213(%rip)
shr $13,%rax
add %rax,%r10
imulq $19,%rdx,%rdx
add %rdx,%rsi
addq 80(%rsp),%rsi
addq 88(%rsp),%rcx
addq 96(%rsp),%r8
addq 104(%rsp),%r9
addq 112(%rsp),%r10
movq %rsi,80(%rdi)
movq %rcx,88(%rdi)
movq %r8,96(%rdi)
movq %r9,104(%rdi)
movq %r10,112(%rdi)
movq 104(%rdi),%rsi
imulq $19,%rsi,%rax
movq %rax,0(%rsp)
mulq 176(%rsp)
mov %rax,%rsi
mov %rdx,%rcx
movq 112(%rdi),%rdx
imulq $19,%rdx,%rax
movq %rax,8(%rsp)
mulq 168(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 80(%rdi),%rax
mulq 160(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 80(%rdi),%rax
mulq 168(%rsp)
mov %rax,%r8
mov %rdx,%r9
movq 80(%rdi),%rax
mulq 176(%rsp)
mov %rax,%r10
mov %rdx,%r11
movq 80(%rdi),%rax
mulq 184(%rsp)
mov %rax,%r12
mov %rdx,%r13
movq 80(%rdi),%rax
mulq 192(%rsp)
mov %rax,%r14
mov %rdx,%r15
movq 88(%rdi),%rax
mulq 160(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 88(%rdi),%rax
mulq 168(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 88(%rdi),%rax
mulq 176(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 88(%rdi),%rax
mulq 184(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 88(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 192(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 96(%rdi),%rax
mulq 160(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 96(%rdi),%rax
mulq 168(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 96(%rdi),%rax
mulq 176(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 96(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 184(%rsp)
add %rax,%rsi
adc %rdx,%rcx
movq 96(%rdi),%rdx
imulq $19,%rdx,%rax
mulq 192(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 104(%rdi),%rax
mulq 160(%rsp)
add %rax,%r12
adc %rdx,%r13
movq 104(%rdi),%rax
mulq 168(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 0(%rsp),%rax
mulq 184(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 0(%rsp),%rax
mulq 192(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 112(%rdi),%rax
mulq 160(%rsp)
add %rax,%r14
adc %rdx,%r15
movq 8(%rsp),%rax
mulq 176(%rsp)
add %rax,%r8
adc %rdx,%r9
movq 8(%rsp),%rax
mulq 184(%rsp)
add %rax,%r10
adc %rdx,%r11
movq 8(%rsp),%rax
mulq 192(%rsp)
add %rax,%r12
adc %rdx,%r13
movq x25519_x86_64_REDMASK51(%rip),%rdx
shld $13,%rsi,%rcx
and %rdx,%rsi
shld $13,%r8,%r9
and %rdx,%r8
add %rcx,%r8
shld $13,%r10,%r11
and %rdx,%r10
add %r9,%r10
shld $13,%r12,%r13
and %rdx,%r12
add %r11,%r12
shld $13,%r14,%r15
and %rdx,%r14
add %r13,%r14
imulq $19,%r15,%rcx
add %rcx,%rsi
mov %rsi,%rcx
shr $51,%rcx
add %r8,%rcx
mov %rcx,%r8
shr $51,%rcx
and %rdx,%rsi
add %r10,%rcx
mov %rcx,%r9
shr $51,%rcx
and %rdx,%r8
add %r12,%rcx
mov %rcx,%rax
shr $51,%rcx
and %rdx,%r9
add %r14,%rcx
mov %rcx,%r10
shr $51,%rcx
and %rdx,%rax
imulq $19,%rcx,%rcx
add %rcx,%rsi
and %rdx,%r10
movq %rsi,80(%rdi)
movq %r8,88(%rdi)
movq %r9,96(%rdi)
movq %rax,104(%rdi)
movq %r10,112(%rdi)
movq 296(%rsp),%r12
movq 304(%rsp),%r13
movq 312(%rsp),%r14
movq 320(%rsp),%r15
movq 328(%rsp),%rbx
movq 336(%rsp),%rbp
add $344,%rsp
.cfi_adjust_cfa_offset -344
ret
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simpler, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
.cfi_endproc
/*
 * x25519_x86_64_work_cswap: conditional swap of two adjacent 80-byte regions.
 *
 * In:       %rdi = base address of a 160-byte area (twenty quadwords)
 *           %rsi = swap flag
 * Effect:   if %rsi == 1, the ten quadwords at 0..72(%rdi) are exchanged with
 *           the ten quadwords at 80..152(%rdi). For any other flag value each
 *           quadword is written back with the value it already held.
 * Out:      %rax = %rdi; %rdx = final contents of 64(%rdi).
 * Clobbers: %rsi, %rdx, %rcx, %r8, %r9 and the flags.
 *
 * The body is straight-line code: the decision is made with conditional
 * moves only, and the same loads and stores execute for either flag value.
 *
 * NOTE(review): presumably called from C under the SysV AMD64 convention with
 * a (pointer, 64-bit flag) prototype; the prototype is not in this file, so
 * confirm against the caller.
 */
/* Start the routine on a 32-byte boundary. */
.p2align 5
/*
 * Global so other objects in the library can link to it, but with
 * hidden / private-extern visibility (HIDDEN). C_ABI prepends the leading
 * underscore on Apple targets.
 */
.globl C_ABI(x25519_x86_64_work_cswap)
HIDDEN C_ABI(x25519_x86_64_work_cswap)
C_ABI(x25519_x86_64_work_cswap):
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simpler, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
.cfi_startproc
        /*
         * Conditionally exchange the ten quadwords at 0..72(%rdi) with the
         * ten quadwords at 80..152(%rdi). The exchange takes effect iff
         * %rsi == 1 on entry; otherwise every quadword is stored back with
         * the value it already had.
         *
         * The flag is tested exactly once. Everything after the compare is
         * a mov or cmov, none of which modify the flags, so ZF stays valid
         * for every cmoveq below. There are no branches, and the same loads
         * and stores are issued for either flag value.
         *
         * Each group handles two quadword pairs: %rsi/%rcx carry the values
         * from the low region, %rdx/%r8 the values from the high region, and
         * %r9 holds the saved low value while the pair is exchanged.
         *
         * Returns %rax = %rdi and %rdx = final contents of 64(%rdi).
         */
        cmpq    $1, %rsi                /* ZF = (flag == 1); %rsi is reused below */

        /* pairs 0(%rdi) <-> 80(%rdi) and 8(%rdi) <-> 88(%rdi) */
        movq    0(%rdi), %rsi
        movq    80(%rdi), %rdx
        movq    8(%rdi), %rcx
        movq    88(%rdi), %r8
        movq    %rsi, %r9
        cmoveq  %rdx, %rsi
        cmoveq  %r9, %rdx
        movq    %rcx, %r9
        cmoveq  %r8, %rcx
        cmoveq  %r9, %r8
        movq    %rsi, 0(%rdi)
        movq    %rdx, 80(%rdi)
        movq    %rcx, 8(%rdi)
        movq    %r8, 88(%rdi)

        /* pairs 16(%rdi) <-> 96(%rdi) and 24(%rdi) <-> 104(%rdi) */
        movq    16(%rdi), %rsi
        movq    96(%rdi), %rdx
        movq    24(%rdi), %rcx
        movq    104(%rdi), %r8
        movq    %rsi, %r9
        cmoveq  %rdx, %rsi
        cmoveq  %r9, %rdx
        movq    %rcx, %r9
        cmoveq  %r8, %rcx
        cmoveq  %r9, %r8
        movq    %rsi, 16(%rdi)
        movq    %rdx, 96(%rdi)
        movq    %rcx, 24(%rdi)
        movq    %r8, 104(%rdi)

        /* pairs 32(%rdi) <-> 112(%rdi) and 40(%rdi) <-> 120(%rdi) */
        movq    32(%rdi), %rsi
        movq    112(%rdi), %rdx
        movq    40(%rdi), %rcx
        movq    120(%rdi), %r8
        movq    %rsi, %r9
        cmoveq  %rdx, %rsi
        cmoveq  %r9, %rdx
        movq    %rcx, %r9
        cmoveq  %r8, %rcx
        cmoveq  %r9, %r8
        movq    %rsi, 32(%rdi)
        movq    %rdx, 112(%rdi)
        movq    %rcx, 40(%rdi)
        movq    %r8, 120(%rdi)

        /* pairs 48(%rdi) <-> 128(%rdi) and 56(%rdi) <-> 136(%rdi) */
        movq    48(%rdi), %rsi
        movq    128(%rdi), %rdx
        movq    56(%rdi), %rcx
        movq    136(%rdi), %r8
        movq    %rsi, %r9
        cmoveq  %rdx, %rsi
        cmoveq  %r9, %rdx
        movq    %rcx, %r9
        cmoveq  %r8, %rcx
        cmoveq  %r9, %r8
        movq    %rsi, 48(%rdi)
        movq    %rdx, 128(%rdi)
        movq    %rcx, 56(%rdi)
        movq    %r8, 136(%rdi)

        /* pairs 64(%rdi) <-> 144(%rdi) and 72(%rdi) <-> 152(%rdi) */
        movq    64(%rdi), %rsi
        movq    144(%rdi), %rdx
        movq    72(%rdi), %rcx
        movq    152(%rdi), %r8
        movq    %rsi, %r9
        cmoveq  %rdx, %rsi
        cmoveq  %r9, %rdx
        movq    %rcx, %r9
        cmoveq  %r8, %rcx
        cmoveq  %r9, %r8
        movq    %rsi, 64(%rdi)
        movq    %rdx, 144(%rdi)
        movq    %rcx, 72(%rdi)
        movq    %r8, 152(%rdi)

        /* Return values: the base pointer, and the value just stored at 64(%rdi). */
        movq    %rdi, %rax
        movq    %rsi, %rdx
        ret
Add CFI information to the x86-64 X25519 asm. This change serves to check that all our consumers can process assembly with CFI directives in it. For the first change I picked a file that's not perlasm to keep things slightly simpler, but that might have been a mistake: DJB's tooling always aligns the stack to 32 bytes and it's not possible to express this in DWARF format (without using a register to store the old stack pointer). Since none of the functions here appear to care about that alignment, I removed it from each of them. I also trimmed the set of saved registers where possible and used the redzone for functions that didn't need much stack. Overall, this appears to have slightly improved the performance (by about 0.7%): Before: Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec) After: Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec) Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec) Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec) Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec) Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691 Reviewed-on: https://boringssl-review.googlesource.com/13200 CQ-Verified: CQ bot account: 
commit-bot@chromium.org <commit-bot@chromium.org> Reviewed-by: David Benjamin <davidben@google.com> Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
2017-01-19 18:40:47 +00:00
.cfi_endproc
#endif /* __x86_64__ */
#endif /* !OPENSSL_NO_ASM */