From 78f5e7573902345a9354385739cb3776ab29ae7d Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Tue, 8 Aug 2017 18:19:37 -0400 Subject: [PATCH] Enable AVX2 and ADX in p256-x86_64-asm.pl. We can test these with Intel SDE now. The AVX2 code just affects the two select functions while the ADX code is a separate implementation. Haswell numbers: Before: Did 84630 ECDH P-256 operations in 10031494us (8436.4 ops/sec) Did 206000 ECDSA P-256 signing operations in 10015055us (20569.0 ops/sec) Did 77256 ECDSA P-256 verify operations in 10064556us (7676.0 ops/sec) After: Did 86112 ECDH P-256 operations in 10015008us (8598.3 ops/sec) Did 211000 ECDSA P-256 signing operations in 10025104us (21047.2 ops/sec) Did 79344 ECDSA P-256 verify operations in 10017076us (7920.9 ops/sec) Skylake numbers: Before: Did 75684 ECDH P-256 operations in 10016019us (7556.3 ops/sec) Did 185000 ECDSA P-256 signing operations in 10012090us (18477.7 ops/sec) Did 72885 ECDSA P-256 verify operations in 10027154us (7268.8 ops/sec) After: Did 89598 ECDH P-256 operations in 10032162us (8931.1 ops/sec) Did 203000 ECDSA P-256 signing operations in 10019739us (20260.0 ops/sec) Did 87040 ECDSA P-256 verify operations in 10000441us (8703.6 ops/sec) The code was slightly patched for delocate.go compatibility. Change-Id: Ic44ced4eca65c656bbe07d5a7fee91ec6925eb59 Reviewed-on: https://boringssl-review.googlesource.com/18967 Reviewed-by: Adam Langley Commit-Queue: David Benjamin --- crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl | 36 ++++++++++++--------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl index 1ac3d211..ff5756f2 100755 --- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl +++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl @@ -54,9 +54,8 @@ die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; -# TODO: enable these after testing. $avx goes to two and $addx to one. -$avx=0; -$addx=0; +$avx = 2; +$addx = 1; $code.=<<___; .text @@ -150,8 +149,9 @@ $code.=<<___; ecp_nistz256_mul_mont: ___ $code.=<<___ if ($addx); - mov \$0x80100, %ecx - and OPENSSL_ia32cap_P+8(%rip), %ecx + leaq OPENSSL_ia32cap_P(%rip), %rcx + mov 8(%rcx), %rcx + and \$0x80100, %ecx ___ $code.=<<___; .Lmul_mont: @@ -431,8 +431,9 @@ __ecp_nistz256_mul_montq: ecp_nistz256_sqr_mont: ___ $code.=<<___ if ($addx); - mov \$0x80100, %ecx - and OPENSSL_ia32cap_P+8(%rip), %ecx + leaq OPENSSL_ia32cap_P(%rip), %rcx + mov 8(%rcx), %rcx + and \$0x80100, %ecx ___ $code.=<<___; push %rbp @@ -955,7 +956,8 @@ $code.=<<___; ecp_nistz256_select_w5: ___ $code.=<<___ if ($avx>1); - mov OPENSSL_ia32cap_P+8(%rip), %eax + leaq OPENSSL_ia32cap_P(%rip), %rax + mov 8(%rax), %rax test \$`1<<5`, %eax jnz .Lavx2_select_w5 ___ @@ -1052,7 +1054,8 @@ $code.=<<___; ecp_nistz256_select_w7: ___ $code.=<<___ if ($avx>1); - mov OPENSSL_ia32cap_P+8(%rip), %eax + leaq OPENSSL_ia32cap_P(%rip), %rax + mov 8(%rax), %rax test \$`1<<5`, %eax jnz .Lavx2_select_w7 ___ @@ -1555,8 +1558,9 @@ $code.=<<___; ecp_nistz256_point_double: ___ $code.=<<___ if ($addx); - mov \$0x80100, %ecx - and OPENSSL_ia32cap_P+8(%rip), %ecx + leaq OPENSSL_ia32cap_P(%rip), %rcx + mov 8(%rcx), %rcx + and \$0x80100, %ecx cmp \$0x80100, %ecx je .Lpoint_doublex ___ @@ -1785,8 +1789,9 @@ $code.=<<___; ecp_nistz256_point_add: ___ $code.=<<___ if ($addx); - mov \$0x80100, %ecx - and OPENSSL_ia32cap_P+8(%rip), %ecx + leaq OPENSSL_ia32cap_P(%rip), %rcx + mov 8(%rcx), %rcx + and \$0x80100, %ecx cmp \$0x80100, %ecx je .Lpoint_addx ___ @@ -2152,8 +2157,9 @@ $code.=<<___; ecp_nistz256_point_add_affine: ___ $code.=<<___ if ($addx); - mov \$0x80100, %ecx - and OPENSSL_ia32cap_P+8(%rip), %ecx + leaq OPENSSL_ia32cap_P(%rip), %rcx + mov 8(%rcx), %rcx + and \$0x80100, %ecx cmp \$0x80100, %ecx je .Lpoint_add_affinex ___