From 78f5e7573902345a9354385739cb3776ab29ae7d Mon Sep 17 00:00:00 2001
From: David Benjamin <davidben@google.com>
Date: Tue, 8 Aug 2017 18:19:37 -0400
Subject: [PATCH] Enable AVX2 and ADX in p256-x86_64-asm.pl.

We can test these with Intel SDE now. The AVX2 code only affects the two
select functions (ecp_nistz256_select_w5 and ecp_nistz256_select_w7),
while the ADX code is a separate implementation.

Haswell numbers:

Before:
Did 84630 ECDH P-256 operations in 10031494us (8436.4 ops/sec)
Did 206000 ECDSA P-256 signing operations in 10015055us (20569.0 ops/sec)
Did 77256 ECDSA P-256 verify operations in 10064556us (7676.0 ops/sec)

After:
Did 86112 ECDH P-256 operations in 10015008us (8598.3 ops/sec)
Did 211000 ECDSA P-256 signing operations in 10025104us (21047.2 ops/sec)
Did 79344 ECDSA P-256 verify operations in 10017076us (7920.9 ops/sec)

Skylake numbers:

Before:
Did 75684 ECDH P-256 operations in 10016019us (7556.3 ops/sec)
Did 185000 ECDSA P-256 signing operations in 10012090us (18477.7 ops/sec)
Did 72885 ECDSA P-256 verify operations in 10027154us (7268.8 ops/sec)

After:
Did 89598 ECDH P-256 operations in 10032162us (8931.1 ops/sec)
Did 203000 ECDSA P-256 signing operations in 10019739us (20260.0 ops/sec)
Did 87040 ECDSA P-256 verify operations in 10000441us (8703.6 ops/sec)

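The code was slightly patched for delocate.go compatibility: each
OPENSSL_ia32cap_P+8(%rip) memory operand is replaced by a leaq of
OPENSSL_ia32cap_P followed by a load from offset 8.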

Change-Id: Ic44ced4eca65c656bbe07d5a7fee91ec6925eb59
Reviewed-on: https://boringssl-review.googlesource.com/18967
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
---
 crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl | 36 ++++++++++++---------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
index 1ac3d211..ff5756f2 100755
--- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
@@ -54,9 +54,8 @@ die "can't locate x86_64-xlate.pl";
 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
-# TODO: enable these after testing. $avx goes to two and $addx to one.
-$avx=0;
-$addx=0;
+$avx = 2;
+$addx = 1;
 
 $code.=<<___;
 .text
@@ -150,8 +149,9 @@ $code.=<<___;
 ecp_nistz256_mul_mont:
 ___
 $code.=<<___	if ($addx);
-	mov	\$0x80100, %ecx
-	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
 ___
 $code.=<<___;
 .Lmul_mont:
@@ -431,8 +431,9 @@ __ecp_nistz256_mul_montq:
 ecp_nistz256_sqr_mont:
 ___
 $code.=<<___	if ($addx);
-	mov	\$0x80100, %ecx
-	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
 ___
 $code.=<<___;
 	push	%rbp
@@ -955,7 +956,8 @@ $code.=<<___;
 ecp_nistz256_select_w5:
 ___
 $code.=<<___	if ($avx>1);
-	mov	OPENSSL_ia32cap_P+8(%rip), %eax
+	leaq	OPENSSL_ia32cap_P(%rip), %rax
+	mov	8(%rax), %rax
 	test	\$`1<<5`, %eax
 	jnz	.Lavx2_select_w5
 ___
@@ -1052,7 +1054,8 @@ $code.=<<___;
 ecp_nistz256_select_w7:
 ___
 $code.=<<___	if ($avx>1);
-	mov	OPENSSL_ia32cap_P+8(%rip), %eax
+	leaq	OPENSSL_ia32cap_P(%rip), %rax
+	mov	8(%rax), %rax
 	test	\$`1<<5`, %eax
 	jnz	.Lavx2_select_w7
 ___
@@ -1555,8 +1558,9 @@ $code.=<<___;
 ecp_nistz256_point_double:
 ___
 $code.=<<___	if ($addx);
-	mov	\$0x80100, %ecx
-	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
 	cmp	\$0x80100, %ecx
 	je	.Lpoint_doublex
 ___
@@ -1785,8 +1789,9 @@ $code.=<<___;
 ecp_nistz256_point_add:
 ___
 $code.=<<___	if ($addx);
-	mov	\$0x80100, %ecx
-	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
 	cmp	\$0x80100, %ecx
 	je	.Lpoint_addx
 ___
@@ -2152,8 +2157,9 @@ $code.=<<___;
 ecp_nistz256_point_add_affine:
 ___
 $code.=<<___	if ($addx);
-	mov	\$0x80100, %ecx
-	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
 	cmp	\$0x80100, %ecx
 	je	.Lpoint_add_affinex
 ___