Przeglądaj źródła

x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results.

The changes to the assembly files are synced from upstream OpenSSL commit
64d92d7498. The cpu-intel.c changes are a C translation of the corresponding
logic in that commit and in d84df59440.

Change-Id: I02c8f83aa4780df301c21f011ef2d8d8300e2f2a
Reviewed-on: https://boringssl-review.googlesource.com/18411
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
kris/onging/CECPQ3_patch15
David Benjamin 7 lat temu
rodzic
commit
d4e37951b4
8 zmienionych plików z 64 dodań i 9 usunięć
  1. +4
    -2
      crypto/chacha/asm/chacha-x86_64.pl
  2. +14
    -0
      crypto/cpu-intel.c
  3. +2
    -0
      crypto/fipsmodule/aes/asm/aesni-x86_64.pl
  4. +10
    -1
      crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
  5. +11
    -1
      crypto/fipsmodule/modes/asm/ghash-x86_64.pl
  6. +12
    -3
      crypto/fipsmodule/sha/asm/sha1-x86_64.pl
  7. +10
    -2
      crypto/fipsmodule/sha/asm/sha512-x86_64.pl
  8. +1
    -0
      util/all_tests.go

+ 4
- 2
crypto/chacha/asm/chacha-x86_64.pl Wyświetl plik

@@ -24,7 +24,7 @@
#
# Performance in cycles per byte out of large buffer.
#
# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2
# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v)
#
# P4 9.48/+99% -/22.7(ii) -
# Core2 7.83/+55% 7.90/8.08 4.35
@@ -32,11 +32,13 @@
# Sandy Bridge 8.31/+42% 5.45/6.76 2.72
# Ivy Bridge 6.71/+46% 5.40/6.49 2.41
# Haswell 5.92/+43% 5.20/6.45 2.42 1.23
# Skylake 5.87/+39% 4.70/- 2.31 1.19
# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57]
# Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
# Knights L 11.7/- - 9.60(iii) 0.80
# Goldmont 10.6/+17% 5.10/- 3.28
# Sledgehammer 7.28/+52% -/14.2(ii) -
# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
# Ryzen 5.96/+50% 5.19/- 2.40 2.09
# VIA Nano 10.5/+46% 6.72/8.60 6.05
#
# (i) compared to older gcc 3.x one can observe >2x improvement on


+ 14
- 0
crypto/cpu-intel.c Wyświetl plik

@@ -207,6 +207,14 @@ void OPENSSL_cpuid_setup(void) {
/* Reserved bit #30 is repurposed to signal an Intel CPU. */
if (is_intel) {
edx |= (1 << 30);

/* Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
* some Silvermont-specific codepaths which perform better. See OpenSSL
* commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. */
if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
(eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
ecx &= ~(1 << 26);
}
} else {
edx &= ~(1 << 30);
}
@@ -243,6 +251,12 @@ void OPENSSL_cpuid_setup(void) {
extended_features &= ~(1 << 16);
}

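/* Disable ADX instructions on Knights Landing. The XSAVE bit (ecx bit 26) was
* cleared above for these CPUs, so a clear bit is used as the marker here. See
* OpenSSL commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. */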
if ((ecx & (1 << 26)) == 0) {
extended_features &= ~(1 << 19);
}

OPENSSL_ia32cap_P[0] = edx;
OPENSSL_ia32cap_P[1] = ecx;
OPENSSL_ia32cap_P[2] = extended_features;


+ 2
- 0
crypto/fipsmodule/aes/asm/aesni-x86_64.pl Wyświetl plik

@@ -179,8 +179,10 @@
# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
# Skylake 2.62/0.63 0.63 0.63 0.63
# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
# Knights L 2.54/0.77 0.78 0.85 - 1.50
# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
#
# (*) Atom Silvermont ECB result is suboptimal because of penalties
# incurred by operations on %xmm8-15. As ECB is not considered


+ 10
- 1
crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl Wyświetl plik

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -28,6 +35,8 @@
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf



+ 11
- 1
crypto/fipsmodule/modes/asm/ghash-x86_64.pl Wyświetl plik

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -67,6 +74,7 @@
# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
# Knights L 2.12(-) (if system doesn't support AVX)
# Goldmont 1.08(+24%)

# March 2013
@@ -79,6 +87,8 @@
# it performs in 0.41 cycles per byte on Haswell processor, in
# 0.29 on Broadwell, and in 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

$flavour = shift;


+ 12
- 3
crypto/fipsmodule/sha/asm/sha1-x86_64.pl Wyświetl plik

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -75,9 +82,11 @@
# Haswell 5.45 4.15/+31% 3.57/+53%
# Skylake 5.18 4.06/+28% 3.54/+46%
# Bulldozer 9.11 5.95/+53%
# Ryzen 4.75 3.80/+24% 1.93/+150%(**)
# VIA Nano 9.32 7.15/+30%
# Atom 10.3 9.17/+12%
# Silvermont 13.1(*) 9.37/+40%
# Knights L 13.2(*) 9.68/+36% 8.30/+59%
# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
#
# (*) obviously suboptimal result, nothing was done about it,
@@ -537,7 +546,7 @@ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1458,7 +1467,7 @@ sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
)
}

sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4
sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions


+ 10
- 2
crypto/fipsmodule/sha/asm/sha512-x86_64.pl Wyświetl plik

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -88,9 +95,11 @@
# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
# VIA Nano 23.0 16.5(+39%) - 14.7 -
# Atom 23.0 18.9(+22%) - 14.7 -
# Silvermont 27.4 20.6(+33%) - 17.5 -
# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%)
# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
#
# (*) whichever best applicable, including SHAEXT;
@@ -311,7 +320,6 @@ $code.=<<___;
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H

jmp .Lloop

.align 16


+ 1
- 0
util/all_tests.go Wyświetl plik

@@ -96,6 +96,7 @@ var sdeCPUs = []string{
"slt", // Saltwell
"slm", // Silvermont
"glm", // Goldmont
"knm", // Knights Mill
}

func newTestOutput() *testOutput {


Ładowanie…
Anuluj
Zapisz