Enable AVX2 and ADX in p256-x86_64-asm.pl.

We can test these code paths with Intel SDE now. The AVX2 code only affects
the two select functions (ecp_nistz256_select_w5 and ecp_nistz256_select_w7),
while the ADX code is a separate implementation.

Haswell numbers:

Before:
Did 84630 ECDH P-256 operations in 10031494us (8436.4 ops/sec)
Did 206000 ECDSA P-256 signing operations in 10015055us (20569.0 ops/sec)
Did 77256 ECDSA P-256 verify operations in 10064556us (7676.0 ops/sec)

After:
Did 86112 ECDH P-256 operations in 10015008us (8598.3 ops/sec)
Did 211000 ECDSA P-256 signing operations in 10025104us (21047.2 ops/sec)
Did 79344 ECDSA P-256 verify operations in 10017076us (7920.9 ops/sec)

Skylake numbers:

Before:
Did 75684 ECDH P-256 operations in 10016019us (7556.3 ops/sec)
Did 185000 ECDSA P-256 signing operations in 10012090us (18477.7 ops/sec)
Did 72885 ECDSA P-256 verify operations in 10027154us (7268.8 ops/sec)

After:
Did 89598 ECDH P-256 operations in 10032162us (8931.1 ops/sec)
Did 203000 ECDSA P-256 signing operations in 10019739us (20260.0 ops/sec)
Did 87040 ECDSA P-256 verify operations in 10000441us (8703.6 ops/sec)

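The code was slightly patched for delocate.go compatibility: each read of
OPENSSL_ia32cap_P+8(%rip) is replaced by a leaq of OPENSSL_ia32cap_P(%rip)
into a register followed by a load from offset 8 of that register.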

Change-Id: Ic44ced4eca65c656bbe07d5a7fee91ec6925eb59
Reviewed-on: https://boringssl-review.googlesource.com/18967
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
This commit is contained in:
David Benjamin 2017-08-08 18:19:37 -04:00
parent 488ca0eace
commit 78f5e75739

View File

@ -54,9 +54,8 @@ die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT; *STDOUT=*OUT;
# TODO: enable these after testing. $avx goes to two and $addx to one. $avx = 2;
$avx=0; $addx = 1;
$addx=0;
$code.=<<___; $code.=<<___;
.text .text
@ -150,8 +149,9 @@ $code.=<<___;
ecp_nistz256_mul_mont: ecp_nistz256_mul_mont:
___ ___
$code.=<<___ if ($addx); $code.=<<___ if ($addx);
mov \$0x80100, %ecx leaq OPENSSL_ia32cap_P(%rip), %rcx
and OPENSSL_ia32cap_P+8(%rip), %ecx mov 8(%rcx), %rcx
and \$0x80100, %ecx
___ ___
$code.=<<___; $code.=<<___;
.Lmul_mont: .Lmul_mont:
@ -431,8 +431,9 @@ __ecp_nistz256_mul_montq:
ecp_nistz256_sqr_mont: ecp_nistz256_sqr_mont:
___ ___
$code.=<<___ if ($addx); $code.=<<___ if ($addx);
mov \$0x80100, %ecx leaq OPENSSL_ia32cap_P(%rip), %rcx
and OPENSSL_ia32cap_P+8(%rip), %ecx mov 8(%rcx), %rcx
and \$0x80100, %ecx
___ ___
$code.=<<___; $code.=<<___;
push %rbp push %rbp
@ -955,7 +956,8 @@ $code.=<<___;
ecp_nistz256_select_w5: ecp_nistz256_select_w5:
___ ___
$code.=<<___ if ($avx>1); $code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax leaq OPENSSL_ia32cap_P(%rip), %rax
mov 8(%rax), %rax
test \$`1<<5`, %eax test \$`1<<5`, %eax
jnz .Lavx2_select_w5 jnz .Lavx2_select_w5
___ ___
@ -1052,7 +1054,8 @@ $code.=<<___;
ecp_nistz256_select_w7: ecp_nistz256_select_w7:
___ ___
$code.=<<___ if ($avx>1); $code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax leaq OPENSSL_ia32cap_P(%rip), %rax
mov 8(%rax), %rax
test \$`1<<5`, %eax test \$`1<<5`, %eax
jnz .Lavx2_select_w7 jnz .Lavx2_select_w7
___ ___
@ -1555,8 +1558,9 @@ $code.=<<___;
ecp_nistz256_point_double: ecp_nistz256_point_double:
___ ___
$code.=<<___ if ($addx); $code.=<<___ if ($addx);
mov \$0x80100, %ecx leaq OPENSSL_ia32cap_P(%rip), %rcx
and OPENSSL_ia32cap_P+8(%rip), %ecx mov 8(%rcx), %rcx
and \$0x80100, %ecx
cmp \$0x80100, %ecx cmp \$0x80100, %ecx
je .Lpoint_doublex je .Lpoint_doublex
___ ___
@ -1785,8 +1789,9 @@ $code.=<<___;
ecp_nistz256_point_add: ecp_nistz256_point_add:
___ ___
$code.=<<___ if ($addx); $code.=<<___ if ($addx);
mov \$0x80100, %ecx leaq OPENSSL_ia32cap_P(%rip), %rcx
and OPENSSL_ia32cap_P+8(%rip), %ecx mov 8(%rcx), %rcx
and \$0x80100, %ecx
cmp \$0x80100, %ecx cmp \$0x80100, %ecx
je .Lpoint_addx je .Lpoint_addx
___ ___
@ -2152,8 +2157,9 @@ $code.=<<___;
ecp_nistz256_point_add_affine: ecp_nistz256_point_add_affine:
___ ___
$code.=<<___ if ($addx); $code.=<<___ if ($addx);
mov \$0x80100, %ecx leaq OPENSSL_ia32cap_P(%rip), %rcx
and OPENSSL_ia32cap_P+8(%rip), %ecx mov 8(%rcx), %rcx
and \$0x80100, %ecx
cmp \$0x80100, %ecx cmp \$0x80100, %ecx
je .Lpoint_add_affinex je .Lpoint_add_affinex
___ ___