#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

# Command line is "[flavour] output".  When the first argument contains a dot
# it is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by a nasm/masm/mingw64 flavour, or by an
# output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl: first in the directory this script was invoked
# from, then in ../../../perlasm/ relative to it.  $dir keeps its trailing
# path separator so it can be prepended directly.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
#
# $avx gates the dispatch stubs emitted in sha1_block_data_order: any true
# value emits the AVX check, a value greater than 1 additionally emits the
# AVX2+BMI check.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
# With SHA Extensions disabled, clamp a non-zero $avx to 1 so that the AVX2
# dispatch is not emitted either.
$avx=1 if (!$shaext && $avx);

# Pipe everything subsequently printed to STDOUT into x86_64-xlate.pl (run
# with the perl interpreter executing this script, $^X), passing it $flavour
# and $output.  A failure to start the translator is fatal: without the
# check the generated code would be silently discarded.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names used by the integer-only (".Lialu") code path.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers used by the BODY_* round generators.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
# Three-register rotating window of message-schedule words: each BODY_* call
# consumes $xi[0], prepares $xi[1] for the following round and then rotates
# the array by one position.
@xi=("%edx","%ebp","%r14d");
# The five SHA-1 working variables.
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

# Rotated by the round-generation loops after every round, so the registers
# swap roles instead of being moved at run time.
@V=($A,$B,$C,$D,$E);

# BODY_00_19($i,$a,$b,$c,$d,$e) - append the integer-unit code for SHA-1
# round $i (the caller uses it for rounds 0..19) to $code.
#
#   $i       round number; selects which of the templates below is emitted
#   $a..$e   names of the registers holding the working variables this round
#
# The emitted round adds rol($a,5), (($c^$d)&$b)^$d, the constant 0x5a827999
# and the current message word $xi[0] into $e, and rotates $b left by 30.
# $xi[0] is also stored into the 16-dword ring buffer at (%rsp).  $xi[1] is
# prepared for the next round: while $i<15 it is loaded from ($inp) and byte
# swapped, afterwards it is XORed with three ring-buffer slots and rotated
# left by 1.  @xi is rotated on exit so that the prepared word becomes $xi[0]
# of the next call.  Expressions in backticks are left in $code as-is here.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
# Only the very first round has to fetch its own message word.
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

# BODY_20_39($i,$a,$b,$c,$d,$e) - append the integer-unit code for a SHA-1
# round whose boolean function is $b^$c^$d.  The caller uses it for rounds
# 20..39 and again for rounds 60..79.
#
#   $i       round number; $i<40 selects constant 0x6ed9eba1, otherwise
#            0xca62c1d6
#   $a..$e   names of the registers holding the working variables this round
#
# For $i<79 the template also prepares the next message word in $xi[1]
# (three XORs against the ring buffer at (%rsp), then rotate-left by 1); the
# store of $xi[0] back into the ring buffer is emitted only while $i<72.
# Round 79 emits just the round arithmetic.  @xi is rotated on exit.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

# BODY_40_59($i,$a,$b,$c,$d,$e) - append the integer-unit code for SHA-1
# round $i (the caller uses it for rounds 40..59) to $code.
#
#   $i       round number, used to index the ring buffer at (%rsp)
#   $a..$e   names of the registers holding the working variables this round
#
# The boolean function is accumulated as two separate terms, $c&$d (in $t0)
# and ($c^$d)&$b (in $t1), both added into $e together with rol($a,5), the
# constant 0x8f1bbcdc and the message word $xi[0].  The next message word is
# prepared in $xi[1] and @xi is rotated on exit, as in the other generators.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

# sha1_block_data_order entry point.  Three dwords of OPENSSL_ia32cap_P are
# loaded and used to pick an implementation: without the SSSE3 bit control
# goes to the integer-only code at .Lialu; otherwise the SHA-extension,
# AVX2+BMI and AVX stubs are tried in that order (each one emitted only when
# enabled by $shaext/$avx) and the SSSE3 code is the fallback.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
# Integer-only path: save the callee-saved registers it uses, carve out a
# 64-byte aligned frame holding the 16-dword message ring buffer, and keep
# the original stack pointer at 64(%rsp) for the epilogue and the unwinder.
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Fully unroll the 80 rounds.  @V is rotated after every round so that the
# registers change roles rather than being copied at run time.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the block's result into the context, advance to the next 64-byte block
# and loop while $num is non-zero; then restore registers and return.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___

if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
# Emitted only when $shaext is set.  The function works directly on the
# incoming argument registers.  A,B,C,D live in one %xmm register and E in
# another; both are kept in flipped word order inside the loop and flipped
# back before being stored.  Four message vectors rotate through @MSG.
#
# NOTE(review): the Win64 save/restore code addresses its save area through
# %rax, which is not set up in this block -- presumably the prologue that
# x86_64-xlate.pl generates for Win64 leaves the entry %rsp there; confirm.
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
# Rounds 0..63: each iteration emits two groups of four rounds and advances
# the message schedule; @MSG is rotated by two vectors per iteration.
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
# Rounds 64..79, interleaved with loading and byte-swapping the next block.
# The flags tested by jnz are those set by "dec $num" at the loop head.
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
# Close the CFI region after the return instruction so that "ret" itself is
# covered by the unwind information, as in sha1_block_data_order.
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}

{{{
|
2014-06-20 20:00:00 +01:00
|
|
|
# Lexical state shared by the SIMD code generators that follow.
my $Xi=4;				# message-schedule vector counter; the Xupdate_* subs increment it
my @X=map("%xmm$_",(4..7,0..3));	# rotating window of message-schedule vectors
my @Tx=map("%xmm$_",(8..10));		# rotating SIMD temporaries
my $Kx="%xmm11";			# NOTE(review): not referenced in this part of the file
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");			# integer temporaries
my $j=0;				# NOTE(review): consumed by code further down the file
my $rx=0;				# NOTE(review): consumed by code further down the file
my $K_XX_XX="%r14";			# register pointing into the K_XX_XX constant table
my $fp="%r11";				# register holding the caller's %rsp (frame pointer)

# Thunks for rol/ror; no such subs are defined in this part of the file, so
# the calls presumably resolve through AUTOLOAD -- confirm.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

# align32() - append a jump over a 32-byte alignment gap to $code.  Every
# call uses a fresh label, numbered by the private counter $sn.
{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

# sha1_block_data_order_ssse3 prologue.  Saves callee-saved registers (and,
# on Win64, %xmm6-%xmm11), aligns a 64-byte scratch area on the stack,
# converts the block count in $num into an end-of-input pointer, loads the
# context, and byte-swaps the first block.  The first three vectors plus
# K_00_19 are stored at (%rsp) for the integer rounds.
$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# AUTOLOAD - catch calls to otherwise undefined subs and turn them into one
# line of assembly appended to $code.  The sub name becomes the mnemonic.
# Arguments are given destination-first (32-bit perlasm style) and emitted
# in reverse order; a purely numeric last argument is treated as an
# immediate and prefixed with '$'.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    # numeric check: a number survives the round trip through arithmetic
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Xupdate_ssse3_16_31($body) - emit the SIMD message-schedule update for one
# vector (four message words) interleaved with four integer rounds.
#
#   $body   code ref called four times; the strings it returns are collected
#           in @insns and eval'ed one at a time between the SIMD
#           instructions below
#
# The interleaving order is deliberate and must not be changed.  The new
# vector is built in @X[0], the previous vector plus its round constant is
# stored at (%rsp) for the integer rounds, and the next constant is loaded
# from the table at $K_XX_XX.  On exit $Xi is incremented and @X and @Tx are
# rotated by one position.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Emit one group of four rounds for the SSSE3 path, message-schedule
# words 32..79.
#
# Argument: a code reference ($body) that returns a list of Perl snippets
# for one scalar round.  It is called four times and the resulting
# snippets are eval'ed one by one, interleaved with the SSE instructions
# below.  The order of the interleave determines the order of the emitted
# instructions, so do not reorder statements here.
#
# The SSE side xors @X[-4&7], @X[-7&7] and the composed "X[-6]" into
# @X[0], rotates the result left by 2 (pslld 2 / psrld 30 / por), adds
# the previous vector to the round constant held in @Tx[1] and stores
# that sum to the stack slot 16*(($Xi-1)&3)(%rsp) for the integer rounds.
# On exit $Xi is incremented and both @X and @Tx are rotated by one.
sub Xupdate_ssse3_32_79()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	eval(shift(@insns)) if ($Xi==8);
	&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
	eval(shift(@insns)) if ($Xi==8);
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns));
	# pull a pending rotate forward if one is next or next-but-one
	eval(shift(@insns)) if (@insns[1] =~ /_ror/);
	eval(shift(@insns)) if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	eval(shift(@insns));
	eval(shift(@insns)); # rol

	&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
	eval(shift(@insns));
	eval(shift(@insns));
	if ($Xi%5) {
		&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else { # ... or load next one
		&movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	eval(shift(@insns)); # ror
	&paddd (@Tx[1],@X[-1&7]);
	eval(shift(@insns));

	&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol
	eval(shift(@insns)) if (@insns[0] =~ /_ror/);

	&movdqa (@Tx[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
	eval(shift(@insns)); # ror
	eval(shift(@insns));
	eval(shift(@insns)); # body_20_39

	&pslld (@X[0],2);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld (@Tx[0],30);
	eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # ror

	&por (@X[0],@Tx[0]); # "X[0]"<<<=2
	eval(shift(@insns));
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns)) if (@insns[1] =~ /_rol/);
	eval(shift(@insns)) if (@insns[0] =~ /_rol/);
	&pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
	eval(shift(@insns));
	eval(shift(@insns)); # rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol
	eval(shift(@insns));

	foreach (@insns) { eval; } # remaining instructions

	$Xi++; push(@X,shift(@X)); # "rotate" X[]
	push(@Tx,shift(@Tx));
}
|
|
|
|
|
|
|
|
# Emit the last scheduled four-round group of the SSSE3 path.
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times.  No new schedule vector is
# computed here: the last X[]+K sum is formed in @Tx[1] and stored to the
# stack, the remaining round snippets are flushed, and then a compare of
# $inp against $num branches to .Ldone_ssse3 when they are equal.
# Otherwise @Tx is rotated back by one, the byte-swap mask and K_00_19
# are reloaded, the next 64 input bytes are loaded, $inp is advanced by
# 64 and $Xi is reset to 0 for the Xloop_ssse3 calls that follow.
sub Xuplast_ssse3_80()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd (@Tx[1],@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));

	&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU

	foreach (@insns) { eval; } # remaining instructions

	&cmp ($inp,$num);
	&je (".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
	&movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19
	&movdqu (@X[-4&7],"0($inp)"); # load input
	&movdqu (@X[-3&7],"16($inp)");
	&movdqu (@X[-2&7],"32($inp)");
	&movdqu (@X[-1&7],"48($inp)");
	&pshufb (@X[-4&7],@X[2]); # byte swap
	&add ($inp,64);

	$Xi=0;
}
|
|
|
|
|
|
|
|
# Emit four scalar rounds interleaved with preparation of one vector of
# the next input block (SSSE3 path).
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times.  Per call, vector ($Xi-3)&7 is
# byte-swapped with the mask in @X[2], vector ($Xi-4)&7 has the constant
# in @Tx[1] added, is stored to 16*$Xi(%rsp), and then has the constant
# subtracted again so the register keeps the plain message words.
# $Xi is incremented on exit; @X is not rotated.
sub Xloop_ssse3()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&pshufb (@X[($Xi-3)&7],@X[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd (@X[($Xi-4)&7],@Tx[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&psubd (@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
	$Xi++;
}
|
|
|
|
|
|
|
|
# Emit four scalar rounds with no vector work interleaved (SSSE3 path).
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times and every snippet is eval'ed in
# order.
sub Xtail_ssse3()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
|
|
|
|
|
|
|
|
# Return the list of Perl snippets that emit one scalar round using the
# ((c^d)&b)^d function.  Each list element is eval'ed separately by the
# caller, so an element is the unit of interleaving; strings joined with
# '.' form a single element.
#
# $rx counts how many round bodies have been handed out.  Once it has
# reached 19 this sub delegates to body_20_39 instead (without touching
# $rx itself).  The last snippet advances $j and rotates @V and @T.
sub body_00_19 () { # ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror ($b,$j?7:2)', # $b>>>2
	'&xor (@T[0],$d)',
	'&mov (@T[1],$a)', # $b for next round

	'&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
	'&xor ($b,$c)', # $c^$d for next round

	'&$_rol ($a,5)',
	'&add ($e,@T[0])',
	'&and (@T[1],$b)', # ($b&($c^$d)) for next round

	'&xor ($b,$c)', # restore $b
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
|
|
|
|
|
|
|
|
# Return the list of Perl snippets that emit one scalar round using the
# b^d^c function.  Each list element is eval'ed separately by the
# caller; strings joined with '.' form a single element.
#
# Once $rx has reached 39 this sub delegates to body_40_59 instead
# (without touching $rx itself).  The snippets test $j at eval time
# to handle the transition rounds ($j==19) and the final round ($j==79).
# The last snippet advances $j and rotates @V and @T.
sub body_20_39 () { # b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
	'&xor (@T[0],$d) if($j==19);'.
	'&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c)
	'&mov (@T[1],$a)', # $b for next round

	'&$_rol ($a,5)',
	'&add ($e,@T[0])',
	'&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round

	'&$_ror ($b,7)', # $b>>>2
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
|
|
|
|
|
|
|
|
# Return the list of Perl snippets that emit one scalar round using the
# ((b^c)&(c^d))^c function.  Each list element is eval'ed separately by
# the caller; strings joined with '.' form a single element (eleven
# elements in total).
#
# $rx is incremented unconditionally.  The snippets test $j at eval time
# so that the first and last rounds of the range skip or substitute the
# set-up for the neighbouring round function.  The last snippet advances
# $j and rotates @V and @T.
sub body_40_59 () { # ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
	'&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d)
	'&xor ($c,$d) if ($j>=40)', # restore $c

	'&$_ror ($b,7)', # $b>>>2
	'&mov (@T[1],$a)', # $b for next round
	'&xor (@T[0],$c)',

	'&$_rol ($a,5)',
	'&add ($e,@T[0])',
	'&xor (@T[1],$c) if ($j==59);'.
	'&xor (@T[1],$b) if ($j< 59)', # b^c for next round

	'&xor ($b,$c) if ($j< 59)', # c^d for next round
	'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
|
|
|
|
# Main loop of the SSSE3 code path: 80 rounds generated four at a time.
$code.=<<___;
.align 16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39); # can jump to "done"

	# The last 12 rounds are generated twice (loop-back variant here,
	# final-block variant after .Ldone_ssse3), so remember the generator
	# state at the fork.
	$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add 0($ctx),$A # update context
	add 4($ctx),@T[0]
	add 8($ctx),$C
	add 12($ctx),$D
	mov $A,0($ctx)
	add 16($ctx),$E
	mov @T[0],4($ctx)
	mov @T[0],$B # magic seed
	mov $C,8($ctx)
	mov $C,@T[1]
	mov $D,12($ctx)
	xor $D,@T[1]
	mov $E,16($ctx)
	and @T[1],@T[0]
	jmp .Loop_ssse3

.align 16
.Ldone_ssse3:
___
	# Restore the generator state saved at the fork and emit the same
	# 12 rounds without any vector work.
	$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add 0($ctx),$A # update context
	add 4($ctx),@T[0]
	add 8($ctx),$C
	mov $A,0($ctx)
	add 12($ctx),$D
	mov @T[0],4($ctx)
	add 16($ctx),$E
	mov $C,8($ctx)
	mov $D,12($ctx)
	mov $E,16($ctx)
___
# Win64 only: reload %xmm6-%xmm11 from the frame.
$code.=<<___ if ($win64);
	movaps -40-6*16($fp),%xmm6
	movaps -40-5*16($fp),%xmm7
	movaps -40-4*16($fp),%xmm8
	movaps -40-3*16($fp),%xmm9
	movaps -40-2*16($fp),%xmm10
	movaps -40-1*16($fp),%xmm11
___
# Common epilogue: reload the saved general-purpose registers and return.
$code.=<<___;
	mov -40($fp),%r14
.cfi_restore %r14
	mov -32($fp),%r13
.cfi_restore %r13
	mov -24($fp),%r12
.cfi_restore %r12
	mov -16($fp),%rbp
.cfi_restore %rbp
	mov -8($fp),%rbx
.cfi_restore %rbx
	lea ($fp),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
|
|
|
|
|
|
|
|
# AVX code path.  This opens a conditional block whose closing brace lies
# further down the file.
if ($avx) {
$Xi=4; # reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

# Rotate helpers referenced as &$_rol / &$_ror by the round-body snippets;
# here they are expressed with shld/shrd, passing the register as both
# destination and source.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

# Function entry: save callee-saved registers and reserve the frame.
$code.=<<___;
.type sha1_block_data_order_avx,\@function,3
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
.cfi_startproc
	mov %rsp,$fp
.cfi_def_cfa_register $fp
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
	push %r12
.cfi_push %r12
	push %r13 # redundant, done to share Win64 SE handler
.cfi_push %r13
	push %r14
.cfi_push %r14
	lea `-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
# Win64 only: spill %xmm6-%xmm11 into the frame.
$code.=<<___ if ($win64);
	vmovaps %xmm6,-40-6*16($fp)
	vmovaps %xmm7,-40-5*16($fp)
	vmovaps %xmm8,-40-4*16($fp)
	vmovaps %xmm9,-40-3*16($fp)
	vmovaps %xmm10,-40-2*16($fp)
	vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx:
___
# Align the stack, load the context and the first input block, and
# pre-compute the first three X[]+K vectors.
$code.=<<___;
	and \$-64,%rsp
	mov %rdi,$ctx # reassigned argument
	mov %rsi,$inp # reassigned argument
	mov %rdx,$num # reassigned argument

	shl \$6,$num
	add $inp,$num
	lea K_XX_XX+64(%rip),$K_XX_XX

	mov 0($ctx),$A # load context
	mov 4($ctx),$B
	mov 8($ctx),$C
	mov 12($ctx),$D
	mov $B,@T[0] # magic seed
	mov 16($ctx),$E
	mov $C,@T[1]
	xor $D,@T[1]
	and @T[1],@T[0]

	vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
	vmovdqa -64($K_XX_XX),$Kx # K_00_19
	vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
	vmovdqu 16($inp),@X[-3&7]
	vmovdqu 32($inp),@X[-2&7]
	vmovdqu 48($inp),@X[-1&7]
	vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
	add \$64,$inp
	vpshufb @X[2],@X[-3&7],@X[-3&7]
	vpshufb @X[2],@X[-2&7],@X[-2&7]
	vpshufb @X[2],@X[-1&7],@X[-1&7]
	vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
	vpaddd $Kx,@X[-3&7],@X[1]
	vpaddd $Kx,@X[-2&7],@X[2]
	vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
	vmovdqa @X[1],16(%rsp)
	vmovdqa @X[2],32(%rsp)
	jmp .Loop_avx
___
|
|
|
|
|
2018-08-07 19:26:15 +01:00
|
|
|
# Emit one group of four rounds for the AVX path, message-schedule
# words 16..31.
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times and the snippets are eval'ed one
# by one, interleaved with the AVX instructions below.  The interleave
# fixes the order of the emitted instructions, so do not reorder.
#
# The vector side builds the next schedule vector in @X[0] (see the
# per-line comments), stores the previous vector plus $Kx to the stack
# slot 16*(($Xi-1)&3)(%rsp), and reloads $Kx from the constant table
# when $Xi is a multiple of 5.  On exit $Xi is incremented and @X is
# rotated by one.
sub Xupdate_avx_16_31() # recall that $Xi starts with 4
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddd (@Tx[1],$Kx,@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));

	&vpsrld (@Tx[0],@X[0],31);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
	&vpaddd (@X[0],@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpsrld (@Tx[1],@Tx[2],30);
	&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpslld (@Tx[2],@Tx[2],2);
	&vpxor (@X[0],@X[0],@Tx[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
	eval(shift(@insns));
	eval(shift(@insns));


	foreach (@insns) { eval; } # remaining instructions [if any]

	$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
|
|
|
|
|
|
|
|
# Emit one group of four rounds for the AVX path, message-schedule
# words 32..79.
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times and the snippets are eval'ed one
# by one, interleaved with the AVX instructions below.  The interleave
# fixes the order of the emitted instructions, so do not reorder.
#
# The vector side xors @X[-4&7], @X[-7&7] and the composed "X[-6]" into
# @X[0], rotates the result left by 2, stores the previous vector plus
# $Kx to the stack slot 16*(($Xi-1)&3)(%rsp), and reloads $Kx from the
# constant table when $Xi is a multiple of 5.  On exit $Xi is
# incremented and @X is rotated by one.
sub Xupdate_avx_32_79()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
	&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol

	&vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
	eval(shift(@insns));
	# NOTE(review): the round bodies in this file spell their rotates
	# '&$_rol' / '&$_ror', which /&ro[rl]/ does not match, so as written
	# this guard always holds.  The SSSE3 counterpart tests /_ror/ and
	# /_rol/ instead.  Left unchanged because altering it would change the
	# emitted instruction order -- confirm intent before touching.
	eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
	&vpaddd (@Tx[1],$Kx,@X[-1&7]);
	&vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
	eval(shift(@insns)); # ror
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol

	&vpsrld (@Tx[0],@X[0],30);
	&vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # ror
	eval(shift(@insns));

	&vpslld (@X[0],@X[0],2);
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # ror
	eval(shift(@insns));

	&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
	eval(shift(@insns)); # body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns)); # rol
	eval(shift(@insns));

	foreach (@insns) { eval; } # remaining instructions

	$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
|
|
|
|
|
|
|
|
# Emit the last scheduled four-round group of the AVX path.
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times.  The last X[]+K sum is formed
# in @Tx[1] and stored to the stack, the remaining round snippets are
# flushed, and then a compare of $inp against $num branches to the label
# held in $done_avx_label when they are equal.  Otherwise the byte-swap
# mask and K_00_19 are reloaded, the next 64 input bytes are loaded,
# $inp is advanced by 64 and $Xi is reset to 0 for the Xloop_avx calls
# that follow.
sub Xuplast_avx_80()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	&vpaddd (@Tx[1],$Kx,@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU

	foreach (@insns) { eval; } # remaining instructions

	&cmp ($inp,$num);
	&je ($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19
	&vmovdqu(@X[-4&7],"0($inp)"); # load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
	&add ($inp,64);

	$Xi=0;
}
|
|
|
|
|
|
|
|
# Emit four scalar rounds interleaved with preparation of one vector of
# the next input block (AVX path).
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times.  Per call, vector ($Xi-3)&7 is
# byte-swapped with the mask in @X[2], and vector ($Xi-4)&7 plus $Kx is
# written into @X[$Xi&7] and stored to 16*$Xi(%rsp).  $Xi is incremented
# on exit; @X is not rotated.
sub Xloop_avx()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));

	foreach (@insns) { eval; }
	$Xi++;
}
|
|
|
|
|
|
|
|
# Emit four scalar rounds with no vector work interleaved (AVX path).
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called four times and every snippet is eval'ed in
# order.
sub Xtail_avx()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
	# targets of the '($a,$b,$c,$d,$e)=@V;' assignment inside the snippets
	my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
|
|
|
|
|
|
|
|
# Main loop of the AVX code path: 80 rounds generated four at a time.
$code.=<<___;
.align 16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39); # can jump to "done"

	# The last 12 rounds are generated twice (loop-back variant here,
	# final-block variant after the done label), so remember the
	# generator state at the fork.
	$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add 0($ctx),$A # update context
	add 4($ctx),@T[0]
	add 8($ctx),$C
	add 12($ctx),$D
	mov $A,0($ctx)
	add 16($ctx),$E
	mov @T[0],4($ctx)
	mov @T[0],$B # magic seed
	mov $C,8($ctx)
	mov $C,@T[1]
	mov $D,12($ctx)
	xor $D,@T[1]
	mov $E,16($ctx)
	and @T[1],@T[0]
	jmp .Loop_avx

.align 16
$done_avx_label:
___
	# Restore the generator state saved at the fork and emit the same
	# 12 rounds without any vector work.
	$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add 0($ctx),$A # update context
	add 4($ctx),@T[0]
	add 8($ctx),$C
	mov $A,0($ctx)
	add 12($ctx),$D
	mov @T[0],4($ctx)
	add 16($ctx),$E
	mov $C,8($ctx)
	mov $D,12($ctx)
	mov $E,16($ctx)
___
# Win64 only: reload %xmm6-%xmm11 from the frame.
$code.=<<___ if ($win64);
	movaps -40-6*16($fp),%xmm6
	movaps -40-5*16($fp),%xmm7
	movaps -40-4*16($fp),%xmm8
	movaps -40-3*16($fp),%xmm9
	movaps -40-2*16($fp),%xmm10
	movaps -40-1*16($fp),%xmm11
___
# Common epilogue: reload the saved general-purpose registers and return.
$code.=<<___;
	mov -40($fp),%r14
.cfi_restore %r14
	mov -32($fp),%r13
.cfi_restore %r13
	mov -24($fp),%r12
.cfi_restore %r12
	mov -16($fp),%rbp
.cfi_restore %rbp
	mov -8($fp),%rbx
.cfi_restore %rbx
	lea ($fp),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
|
|
|
|
|
|
|
|
# AVX2 code path.  This opens a conditional block whose closing brace
# lies further down the file.
if ($avx>1) {
use integer;
$Xi=4; # reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

# Six-register rotation used by the bodyx_* round snippets, plus two
# scratch registers.
my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

# Function entry: save callee-saved registers.
$code.=<<___;
.type sha1_block_data_order_avx2,\@function,3
.align 16
sha1_block_data_order_avx2:
_avx2_shortcut:
.cfi_startproc
	mov %rsp,$fp
.cfi_def_cfa_register $fp
	push %rbx
.cfi_push %rbx
	push %rbp
.cfi_push %rbp
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	vzeroupper
___
# Win64 only: reserve room for and spill %xmm6-%xmm11.
$code.=<<___ if ($win64);
	lea -6*16(%rsp),%rsp
	vmovaps %xmm6,-40-6*16($fp)
	vmovaps %xmm7,-40-5*16($fp)
	vmovaps %xmm8,-40-4*16($fp)
	vmovaps %xmm9,-40-3*16($fp)
	vmovaps %xmm10,-40-2*16($fp)
	vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
# Reserve and align the frame, load the context, load one block into the
# low 128-bit lanes and the block addressed by $frame into the high
# lanes, byte-swap, and pre-compute the first four X[]+K vectors.
$code.=<<___;
	mov %rdi,$ctx # reassigned argument
	mov %rsi,$inp # reassigned argument
	mov %rdx,$num # reassigned argument

	lea -640(%rsp),%rsp
	shl \$6,$num
	lea 64($inp),$frame
	and \$-128,%rsp
	add $inp,$num
	lea K_XX_XX+64(%rip),$K_XX_XX

	mov 0($ctx),$A # load context
	cmp $num,$frame
	cmovae $inp,$frame # next or same block
	mov 4($ctx),$F
	mov 8($ctx),$C
	mov 12($ctx),$D
	mov 16($ctx),$E
	vmovdqu 64($K_XX_XX),@X[2] # pbswap mask

	vmovdqu ($inp),%xmm0
	vmovdqu 16($inp),%xmm1
	vmovdqu 32($inp),%xmm2
	vmovdqu 48($inp),%xmm3
	lea 64($inp),$inp
	vinserti128 \$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb @X[2],@X[-4&7],@X[-4&7]
	vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb @X[2],@X[-3&7],@X[-3&7]
	vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb @X[2],@X[-2&7],@X[-2&7]
	vmovdqu -64($K_XX_XX),$Kx # K_00_19
	vpshufb @X[2],@X[-1&7],@X[-1&7]

	vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
	vpaddd $Kx,@X[-3&7],@X[1]
	vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU
	vpaddd $Kx,@X[-2&7],@X[2]
	vmovdqu @X[1],32(%rsp)
	vpaddd $Kx,@X[-1&7],@X[3]
	vmovdqu @X[2],64(%rsp)
	vmovdqu @X[3],96(%rsp)
___
|
|
|
|
# Schedule vectors for $Xi = 4..7 are computed up front, with no scalar
# rounds interleaved.  Each iteration stores X[]+K at 32*$Xi(%rsp) and
# rotates @X by one; $Kx is reloaded when $Xi is a multiple of 5.
for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31
	use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
	&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
	&vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
	&vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld (@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
	&vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
	&vpaddd (@X[0],@X[0],@X[0]);
	&vpsrld (@Tx[1],@Tx[2],30);
	&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
	&vpslld (@Tx[2],@Tx[2],2);
	&vpxor (@X[0],@X[0],@Tx[1]);
	&vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
	&vpaddd (@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU

	push(@X,shift(@X)); # "rotate" X[]
}
# Loop head: point $frame into the X[]+K area and set up $F and $B in
# the form bodyx_00_19 expects on entry.
$code.=<<___;
	lea 128(%rsp),$frame
	jmp .Loop_avx2
.align 32
.Loop_avx2:
	rorx \$2,$F,$B
	andn $D,$F,$t0
	and $C,$F
	xor $t0,$F
___
|
|
|
|
# Return the list of Perl snippets that emit one AVX2-path scalar round
# using the (b&c)^(~b&d) function.  Each list element is eval'ed
# separately by the caller; strings joined with '.' form one element.
#
# Once $rx has reached 19 this sub delegates to bodyx_20_39 instead
# (without touching $rx itself).  The first element reads X[]+K relative
# to $frame and advances $frame by 256 after every 32nd round; the last
# element rotates @ROTX and advances $j.
sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
	'&lea ($frame,"256($frame)") if ($j%32==31);',
	'&andn ($t0,$a,$c)', # ~b&d for next round

	'&add ($e,$f)', # e+=(b&c)^(~b&d)
	'&rorx ($a5,$a,27)', # a<<<5
	'&rorx ($f,$a,2)', # b>>>2 for next round
	'&and ($a,$b)', # b&c for next round

	'&add ($e,$a5)', # e+=a<<<5
	'&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
|
|
|
|
|
|
|
|
# Return the list of Perl snippets that emit one AVX2-path scalar round
# using the b^c^d function.  Each list element is eval'ed separately by
# the caller; strings joined with '.' form one element.
#
# Once $rx has reached 39 this sub delegates to bodyx_40_59 instead
# (without touching $rx itself).  The set-up for the following round is
# skipped when $j has reached 79.  The last element rotates @ROTX and
# advances $j.
sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
	'&lea ($frame,"256($frame)") if ($j%32==31);',

	'&lea ($e,"($e,$f)")', # e+=b^c^d
	'&rorx ($a5,$a,27)', # a<<<5
	'&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round
	'&xor ($a,$b) if ($j<79)', # b^c for next round

	'&add ($e,$a5)', # e+=a<<<5
	'&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
|
|
|
|
|
|
|
|
# Return the list of Perl snippets that emit one AVX2-path scalar round
# using the ((b^c)&(c^d))^c function.  Each list element is eval'ed
# separately by the caller; strings joined with '.' form one element
# (ten elements in total).
#
# $rx is incremented unconditionally.  The snippets test $j at eval time
# so the first and last rounds of the range adjust the set-up for the
# neighbouring round function.  The last element rotates @ROTX and
# advances $j.
sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
	'&lea ($frame,"256($frame)") if ($j%32==31);',
	'&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c
	'&mov ($t0,$b) if ($j<59)', # count on zero latency
	'&xor ($t0,$c) if ($j<59)', # c^d for next round

	'&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c
	'&rorx ($a5,$a,27)', # a<<<5
	'&rorx ($f,$a,2)', # b>>>2 in next round
	'&xor ($a,$b)', # b^c for next round

	'&add ($e,$a5)', # e+=a<<<5
	'&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round
	'&xor ($a,$c) if ($j==59);'. # f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
|
|
|
|
|
2017-07-25 21:59:58 +01:00
|
|
|
# Emit one group of five rounds for the AVX2 path, message-schedule
# words 16..31, interleaved with the vector schedule update.
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called five times.  The interleave fixes the order
# of the emitted instructions, so do not reorder.  X[]+K is stored at
# 32*$Xi(%rsp); on exit $Xi is incremented and @X is rotated by one.
#
# NOTE(review): no call to this sub is visible in this part of the file;
# the 16..31 vectors are produced by an inline loop instead.  Confirm
# whether it is still used before relying on it.
sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body,&$body); # five round bodies
	# Only these five are lexical here; the bodyx_* snippets also assign
	# $f, which therefore resolves outside this sub.
	my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
	&vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpsrld (@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
	&vpaddd (@X[0],@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));

	&vpsrld (@Tx[1],@Tx[2],30);
	&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
	eval(shift(@insns));
	eval(shift(@insns));

	&vpslld (@Tx[2],@Tx[2],2);
	&vpxor (@X[0],@X[0],@Tx[1]);
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddd (@Tx[1],@X[0],$Kx);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU

	foreach (@insns) { eval; } # remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X)); # "rotate" X[]
}
|
|
|
|
|
|
|
|
# Emit one group of five rounds for the AVX2 path, message-schedule
# words 32..79, interleaved with the vector schedule update.
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called five times.  The interleave fixes the order
# of the emitted instructions, so do not reorder.
#
# The vector side xors @X[-4&7], @X[-7&7] and the composed "X[-6]" into
# @X[0], rotates the result left by 2, adds $Kx and stores the sum at
# 32*$Xi(%rsp).  $Kx is reloaded from the constant table when $Xi is a
# multiple of 5.  On exit $Xi is incremented and @X is rotated by one.
sub Xupdate_avx2_32_79()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions
	# Only these five are lexical here; the bodyx_* snippets also assign
	# $f, which therefore resolves outside this sub.
	my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
	&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpsrld (@Tx[0],@X[0],30);
	&vpslld (@X[0],@X[0],2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	#&vpslld (@X[0],@X[0],2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddd (@Tx[1],@X[0],$Kx);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	# the offset is left as the text "32*N" for the assembler to evaluate
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU

	foreach (@insns) { eval; } # remaining instructions

	$Xi++;
	push(@X,shift(@X)); # "rotate" X[]
}
|
|
|
|
|
|
|
|
# Emit five scalar rounds with no vector work interleaved (AVX2 path).
#
# Argument: a code reference ($body) returning the snippets for one
# scalar round; it is called five times and every snippet is eval'ed in
# order.
sub Xloop_avx2()
{ use integer;
	my $body = shift;
	my @insns = (&$body,&$body,&$body,&$body,&$body); # five round bodies
	# Only these five are lexical here; the bodyx_* snippets also assign
	# $f, which therefore resolves outside this sub.
	my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
|
|
|
|
|
|
|
|
&align32();
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_00_19);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_00_19);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_00_19);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_00_19);
|
|
|
|
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_20_39);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_20_39);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_20_39);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_20_39);
|
|
|
|
|
|
|
|
&align32();
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_40_59);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_40_59);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_40_59);
|
|
|
|
&Xupdate_avx2_32_79(\&bodyx_40_59);
|
|
|
|
|
|
|
|
&Xloop_avx2(\&bodyx_20_39);
|
|
|
|
&Xloop_avx2(\&bodyx_20_39);
|
|
|
|
&Xloop_avx2(\&bodyx_20_39);
|
|
|
|
&Xloop_avx2(\&bodyx_20_39);
|
|
|
|
|
|
|
|
$code.=<<___;
|
|
|
|
lea 128($inp),$frame
|
|
|
|
lea 128($inp),%rdi # borrow $t0
|
|
|
|
cmp $num,$frame
|
|
|
|
cmovae $inp,$frame # next or previous block
|
|
|
|
|
|
|
|
# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
|
|
|
|
add 0($ctx),@ROTX[0] # update context
|
|
|
|
add 4($ctx),@ROTX[1]
|
|
|
|
add 8($ctx),@ROTX[3]
|
|
|
|
mov @ROTX[0],0($ctx)
|
|
|
|
add 12($ctx),@ROTX[4]
|
|
|
|
mov @ROTX[1],4($ctx)
|
|
|
|
mov @ROTX[0],$A # A=d
|
|
|
|
add 16($ctx),@ROTX[5]
|
|
|
|
mov @ROTX[3],$a5
|
|
|
|
mov @ROTX[3],8($ctx)
|
|
|
|
mov @ROTX[4],$D # D=b
|
|
|
|
#xchg @ROTX[5],$F # F=c, C=f
|
|
|
|
mov @ROTX[4],12($ctx)
|
|
|
|
mov @ROTX[1],$F # F=e
|
|
|
|
mov @ROTX[5],16($ctx)
|
|
|
|
#mov $F,16($ctx)
|
|
|
|
mov @ROTX[5],$E # E=c
|
|
|
|
mov $a5,$C # C=f
|
|
|
|
#xchg $F,$E # E=c, F=e
|
|
|
|
|
|
|
|
cmp $num,$inp
|
|
|
|
je .Ldone_avx2
|
|
|
|
___
|
|
|
|
|
|
|
|
$Xi=4; # reset variables
|
|
|
|
@X=map("%ymm$_",(4..7,0..3));
|
|
|
|
|
|
|
|
$code.=<<___;
|
|
|
|
vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
|
|
|
|
cmp $num,%rdi # borrowed $t0
|
|
|
|
ja .Last_avx2
|
|
|
|
|
|
|
|
vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7]
|
|
|
|
vmovdqu -48(%rdi),%xmm1
|
|
|
|
vmovdqu -32(%rdi),%xmm2
|
|
|
|
vmovdqu -16(%rdi),%xmm3
|
|
|
|
vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7]
|
|
|
|
vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
|
|
|
|
vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
|
|
|
|
vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
|
|
|
|
jmp .Last_avx2
|
|
|
|
|
|
|
|
.align 32
|
|
|
|
.Last_avx2:
|
|
|
|
lea 128+16(%rsp),$frame
|
|
|
|
rorx \$2,$F,$B
|
|
|
|
andn $D,$F,$t0
|
|
|
|
and $C,$F
|
|
|
|
xor $t0,$F
|
|
|
|
sub \$-128,$inp
|
|
|
|
___
|
|
|
|
$rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E);
|
|
|
|
|
|
|
|
&Xloop_avx2 (\&bodyx_00_19);
|
|
|
|
&Xloop_avx2 (\&bodyx_00_19);
|
|
|
|
&Xloop_avx2 (\&bodyx_00_19);
|
|
|
|
&Xloop_avx2 (\&bodyx_00_19);
|
|
|
|
|
|
|
|
&Xloop_avx2 (\&bodyx_20_39);
|
|
|
|
&vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19
|
|
|
|
&vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap
|
|
|
|
&Xloop_avx2 (\&bodyx_20_39);
|
|
|
|
&vpshufb (@X[-3&7],@X[-3&7],@X[2]);
|
|
|
|
&vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19
|
|
|
|
&Xloop_avx2 (\&bodyx_20_39);
|
|
|
|
&vmovdqu ("0(%rsp)",@Tx[0]);
|
|
|
|
&vpshufb (@X[-2&7],@X[-2&7],@X[2]);
|
|
|
|
&vpaddd (@Tx[1],@X[-3&7],$Kx);
|
|
|
|
&Xloop_avx2 (\&bodyx_20_39);
|
|
|
|
&vmovdqu ("32(%rsp)",@Tx[1]);
|
|
|
|
&vpshufb (@X[-1&7],@X[-1&7],@X[2]);
|
|
|
|
&vpaddd (@X[2],@X[-2&7],$Kx);
|
|
|
|
|
|
|
|
&Xloop_avx2 (\&bodyx_40_59);
|
|
|
|
&align32 ();
|
|
|
|
&vmovdqu ("64(%rsp)",@X[2]);
|
|
|
|
&vpaddd (@X[3],@X[-1&7],$Kx);
|
|
|
|
&Xloop_avx2 (\&bodyx_40_59);
|
|
|
|
&vmovdqu ("96(%rsp)",@X[3]);
|
|
|
|
&Xloop_avx2 (\&bodyx_40_59);
|
|
|
|
&Xupdate_avx2_16_31(\&bodyx_40_59);
|
|
|
|
|
|
|
|
&Xupdate_avx2_16_31(\&bodyx_20_39);
|
|
|
|
&Xupdate_avx2_16_31(\&bodyx_20_39);
|
|
|
|
&Xupdate_avx2_16_31(\&bodyx_20_39);
|
|
|
|
&Xloop_avx2 (\&bodyx_20_39);
|
|
|
|
|
|
|
|
$code.=<<___;
|
|
|
|
lea 128(%rsp),$frame
|
|
|
|
|
|
|
|
# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
|
|
|
|
add 0($ctx),@ROTX[0] # update context
|
|
|
|
add 4($ctx),@ROTX[1]
|
|
|
|
add 8($ctx),@ROTX[3]
|
|
|
|
mov @ROTX[0],0($ctx)
|
|
|
|
add 12($ctx),@ROTX[4]
|
|
|
|
mov @ROTX[1],4($ctx)
|
|
|
|
mov @ROTX[0],$A # A=d
|
|
|
|
add 16($ctx),@ROTX[5]
|
|
|
|
mov @ROTX[3],$a5
|
|
|
|
mov @ROTX[3],8($ctx)
|
|
|
|
mov @ROTX[4],$D # D=b
|
|
|
|
#xchg @ROTX[5],$F # F=c, C=f
|
|
|
|
mov @ROTX[4],12($ctx)
|
|
|
|
mov @ROTX[1],$F # F=e
|
|
|
|
mov @ROTX[5],16($ctx)
|
|
|
|
#mov $F,16($ctx)
|
|
|
|
mov @ROTX[5],$E # E=c
|
|
|
|
mov $a5,$C # C=f
|
|
|
|
#xchg $F,$E # E=c, F=e
|
|
|
|
|
|
|
|
cmp $num,$inp
|
|
|
|
jbe .Loop_avx2
|
|
|
|
|
|
|
|
.Ldone_avx2:
|
|
|
|
vzeroupper
|
|
|
|
___
|
|
|
|
$code.=<<___ if ($win64);
|
2017-02-09 22:17:39 +00:00
|
|
|
movaps -40-6*16($fp),%xmm6
|
|
|
|
movaps -40-5*16($fp),%xmm7
|
|
|
|
movaps -40-4*16($fp),%xmm8
|
|
|
|
movaps -40-3*16($fp),%xmm9
|
|
|
|
movaps -40-2*16($fp),%xmm10
|
|
|
|
movaps -40-1*16($fp),%xmm11
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
|
|
|
$code.=<<___;
|
2017-02-09 22:17:39 +00:00
|
|
|
mov -40($fp),%r14
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_restore %r14
|
2017-02-09 22:17:39 +00:00
|
|
|
mov -32($fp),%r13
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_restore %r13
|
2017-02-09 22:17:39 +00:00
|
|
|
mov -24($fp),%r12
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_restore %r12
|
2017-02-09 22:17:39 +00:00
|
|
|
mov -16($fp),%rbp
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_restore %rbp
|
2017-02-09 22:17:39 +00:00
|
|
|
mov -8($fp),%rbx
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_restore %rbx
|
2017-02-09 22:17:39 +00:00
|
|
|
lea ($fp),%rsp
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_def_cfa_register %rsp
|
2014-06-20 20:00:00 +01:00
|
|
|
.Lepilogue_avx2:
|
|
|
|
ret
|
2018-08-07 19:26:15 +01:00
|
|
|
.cfi_endproc
|
2014-06-20 20:00:00 +01:00
|
|
|
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
|
|
|
|
___
|
|
|
|
}
|
|
|
|
}
|
|
|
|
$code.=<<___;
|
|
|
|
.align 64
|
|
|
|
K_XX_XX:
|
|
|
|
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
|
|
|
|
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
|
|
|
|
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
|
|
|
|
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
|
|
|
|
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
|
|
|
|
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
|
|
|
|
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
|
|
|
|
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
|
|
|
|
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
|
|
|
|
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
|
2014-06-20 20:00:00 +01:00
|
|
|
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
|
|
|
}}}
|
|
|
|
$code.=<<___;
|
|
|
|
.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
|
|
|
.align 64
|
|
|
|
___
|
|
|
|
|
|
|
|
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
|
|
|
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
|
|
|
if ($win64) {
|
|
|
|
$rec="%rcx";
|
|
|
|
$frame="%rdx";
|
|
|
|
$context="%r8";
|
|
|
|
$disp="%r9";
|
|
|
|
|
|
|
|
$code.=<<___;
|
|
|
|
.extern __imp_RtlVirtualUnwind
|
|
|
|
.type se_handler,\@abi-omnipotent
|
|
|
|
.align 16
|
|
|
|
se_handler:
|
|
|
|
push %rsi
|
|
|
|
push %rdi
|
|
|
|
push %rbx
|
|
|
|
push %rbp
|
|
|
|
push %r12
|
|
|
|
push %r13
|
|
|
|
push %r14
|
|
|
|
push %r15
|
|
|
|
pushfq
|
|
|
|
sub \$64,%rsp
|
|
|
|
|
|
|
|
mov 120($context),%rax # pull context->Rax
|
|
|
|
mov 248($context),%rbx # pull context->Rip
|
|
|
|
|
|
|
|
lea .Lprologue(%rip),%r10
|
|
|
|
cmp %r10,%rbx # context->Rip<.Lprologue
|
|
|
|
jb .Lcommon_seh_tail
|
|
|
|
|
|
|
|
mov 152($context),%rax # pull context->Rsp
|
|
|
|
|
|
|
|
lea .Lepilogue(%rip),%r10
|
|
|
|
cmp %r10,%rbx # context->Rip>=.Lepilogue
|
|
|
|
jae .Lcommon_seh_tail
|
|
|
|
|
|
|
|
mov `16*4`(%rax),%rax # pull saved stack pointer
|
|
|
|
|
|
|
|
mov -8(%rax),%rbx
|
|
|
|
mov -16(%rax),%rbp
|
|
|
|
mov -24(%rax),%r12
|
|
|
|
mov -32(%rax),%r13
|
2014-06-20 20:00:00 +01:00
|
|
|
mov -40(%rax),%r14
|
2014-06-20 20:00:00 +01:00
|
|
|
mov %rbx,144($context) # restore context->Rbx
|
|
|
|
mov %rbp,160($context) # restore context->Rbp
|
|
|
|
mov %r12,216($context) # restore context->R12
|
|
|
|
mov %r13,224($context) # restore context->R13
|
2014-06-20 20:00:00 +01:00
|
|
|
mov %r14,232($context) # restore context->R14
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
jmp .Lcommon_seh_tail
|
|
|
|
.size se_handler,.-se_handler
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
$code.=<<___ if ($shaext);
|
2014-06-20 20:00:00 +01:00
|
|
|
.type shaext_handler,\@abi-omnipotent
|
|
|
|
.align 16
|
|
|
|
shaext_handler:
|
|
|
|
push %rsi
|
|
|
|
push %rdi
|
|
|
|
push %rbx
|
|
|
|
push %rbp
|
|
|
|
push %r12
|
|
|
|
push %r13
|
|
|
|
push %r14
|
|
|
|
push %r15
|
|
|
|
pushfq
|
|
|
|
sub \$64,%rsp
|
|
|
|
|
|
|
|
mov 120($context),%rax # pull context->Rax
|
|
|
|
mov 248($context),%rbx # pull context->Rip
|
|
|
|
|
|
|
|
lea .Lprologue_shaext(%rip),%r10
|
|
|
|
cmp %r10,%rbx # context->Rip<.Lprologue
|
|
|
|
jb .Lcommon_seh_tail
|
|
|
|
|
|
|
|
lea .Lepilogue_shaext(%rip),%r10
|
|
|
|
cmp %r10,%rbx # context->Rip>=.Lepilogue
|
|
|
|
jae .Lcommon_seh_tail
|
|
|
|
|
|
|
|
lea -8-4*16(%rax),%rsi
|
|
|
|
lea 512($context),%rdi # &context.Xmm6
|
|
|
|
mov \$8,%ecx
|
|
|
|
.long 0xa548f3fc # cld; rep movsq
|
|
|
|
|
|
|
|
jmp .Lcommon_seh_tail
|
|
|
|
.size shaext_handler,.-shaext_handler
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
$code.=<<___;
|
2014-06-20 20:00:00 +01:00
|
|
|
.type ssse3_handler,\@abi-omnipotent
|
|
|
|
.align 16
|
|
|
|
ssse3_handler:
|
|
|
|
push %rsi
|
|
|
|
push %rdi
|
|
|
|
push %rbx
|
|
|
|
push %rbp
|
|
|
|
push %r12
|
|
|
|
push %r13
|
|
|
|
push %r14
|
|
|
|
push %r15
|
|
|
|
pushfq
|
|
|
|
sub \$64,%rsp
|
|
|
|
|
|
|
|
mov 120($context),%rax # pull context->Rax
|
|
|
|
mov 248($context),%rbx # pull context->Rip
|
|
|
|
|
|
|
|
mov 8($disp),%rsi # disp->ImageBase
|
|
|
|
mov 56($disp),%r11 # disp->HandlerData
|
|
|
|
|
|
|
|
mov 0(%r11),%r10d # HandlerData[0]
|
|
|
|
lea (%rsi,%r10),%r10 # prologue label
|
|
|
|
cmp %r10,%rbx # context->Rip<prologue label
|
|
|
|
jb .Lcommon_seh_tail
|
|
|
|
|
2017-02-09 22:17:39 +00:00
|
|
|
mov 208($context),%rax # pull context->R11
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
mov 4(%r11),%r10d # HandlerData[1]
|
|
|
|
lea (%rsi,%r10),%r10 # epilogue label
|
|
|
|
cmp %r10,%rbx # context->Rip>=epilogue label
|
|
|
|
jae .Lcommon_seh_tail
|
|
|
|
|
|
|
|
lea -40-6*16(%rax),%rsi
|
|
|
|
lea 512($context),%rdi # &context.Xmm6
|
|
|
|
mov \$12,%ecx
|
|
|
|
.long 0xa548f3fc # cld; rep movsq
|
|
|
|
|
|
|
|
mov -8(%rax),%rbx
|
|
|
|
mov -16(%rax),%rbp
|
|
|
|
mov -24(%rax),%r12
|
|
|
|
mov -32(%rax),%r13
|
|
|
|
mov -40(%rax),%r14
|
|
|
|
mov %rbx,144($context) # restore context->Rbx
|
|
|
|
mov %rbp,160($context) # restore context->Rbp
|
2018-08-07 19:26:15 +01:00
|
|
|
mov %r12,216($context) # restore context->R12
|
|
|
|
mov %r13,224($context) # restore context->R13
|
|
|
|
mov %r14,232($context) # restore context->R14
|
2014-06-20 20:00:00 +01:00
|
|
|
|
|
|
|
.Lcommon_seh_tail:
|
|
|
|
mov 8(%rax),%rdi
|
|
|
|
mov 16(%rax),%rsi
|
|
|
|
mov %rax,152($context) # restore context->Rsp
|
|
|
|
mov %rsi,168($context) # restore context->Rsi
|
|
|
|
mov %rdi,176($context) # restore context->Rdi
|
|
|
|
|
|
|
|
mov 40($disp),%rdi # disp->ContextRecord
|
|
|
|
mov $context,%rsi # context
|
|
|
|
mov \$154,%ecx # sizeof(CONTEXT)
|
|
|
|
.long 0xa548f3fc # cld; rep movsq
|
|
|
|
|
|
|
|
mov $disp,%rsi
|
|
|
|
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
|
|
|
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
|
|
|
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
|
|
|
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
|
|
|
mov 40(%rsi),%r10 # disp->ContextRecord
|
|
|
|
lea 56(%rsi),%r11 # &disp->HandlerData
|
|
|
|
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
|
|
|
mov %r10,32(%rsp) # arg5
|
|
|
|
mov %r11,40(%rsp) # arg6
|
|
|
|
mov %r12,48(%rsp) # arg7
|
|
|
|
mov %rcx,56(%rsp) # arg8, (NULL)
|
|
|
|
call *__imp_RtlVirtualUnwind(%rip)
|
|
|
|
|
|
|
|
mov \$1,%eax # ExceptionContinueSearch
|
|
|
|
add \$64,%rsp
|
|
|
|
popfq
|
|
|
|
pop %r15
|
|
|
|
pop %r14
|
|
|
|
pop %r13
|
|
|
|
pop %r12
|
|
|
|
pop %rbp
|
|
|
|
pop %rbx
|
|
|
|
pop %rdi
|
|
|
|
pop %rsi
|
|
|
|
ret
|
|
|
|
.size ssse3_handler,.-ssse3_handler
|
|
|
|
|
|
|
|
.section .pdata
|
|
|
|
.align 4
|
|
|
|
.rva .LSEH_begin_sha1_block_data_order
|
|
|
|
.rva .LSEH_end_sha1_block_data_order
|
|
|
|
.rva .LSEH_info_sha1_block_data_order
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
|
|
|
$code.=<<___ if ($shaext);
|
2014-06-20 20:00:00 +01:00
|
|
|
.rva .LSEH_begin_sha1_block_data_order_shaext
|
|
|
|
.rva .LSEH_end_sha1_block_data_order_shaext
|
|
|
|
.rva .LSEH_info_sha1_block_data_order_shaext
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
|
|
|
$code.=<<___;
|
2014-06-20 20:00:00 +01:00
|
|
|
.rva .LSEH_begin_sha1_block_data_order_ssse3
|
|
|
|
.rva .LSEH_end_sha1_block_data_order_ssse3
|
|
|
|
.rva .LSEH_info_sha1_block_data_order_ssse3
|
|
|
|
___
|
|
|
|
$code.=<<___ if ($avx);
|
|
|
|
.rva .LSEH_begin_sha1_block_data_order_avx
|
|
|
|
.rva .LSEH_end_sha1_block_data_order_avx
|
|
|
|
.rva .LSEH_info_sha1_block_data_order_avx
|
|
|
|
___
|
|
|
|
$code.=<<___ if ($avx>1);
|
|
|
|
.rva .LSEH_begin_sha1_block_data_order_avx2
|
|
|
|
.rva .LSEH_end_sha1_block_data_order_avx2
|
|
|
|
.rva .LSEH_info_sha1_block_data_order_avx2
|
|
|
|
___
|
|
|
|
$code.=<<___;
|
|
|
|
.section .xdata
|
|
|
|
.align 8
|
|
|
|
.LSEH_info_sha1_block_data_order:
|
|
|
|
.byte 9,0,0,0
|
|
|
|
.rva se_handler
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
|
|
|
$code.=<<___ if ($shaext);
|
2014-06-20 20:00:00 +01:00
|
|
|
.LSEH_info_sha1_block_data_order_shaext:
|
|
|
|
.byte 9,0,0,0
|
|
|
|
.rva shaext_handler
|
2014-06-20 20:00:00 +01:00
|
|
|
___
|
|
|
|
$code.=<<___;
|
2014-06-20 20:00:00 +01:00
|
|
|
.LSEH_info_sha1_block_data_order_ssse3:
|
|
|
|
.byte 9,0,0,0
|
|
|
|
.rva ssse3_handler
|
|
|
|
.rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
|
|
|
|
___
|
|
|
|
$code.=<<___ if ($avx);
|
|
|
|
.LSEH_info_sha1_block_data_order_avx:
|
|
|
|
.byte 9,0,0,0
|
|
|
|
.rva ssse3_handler
|
|
|
|
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
|
|
|
|
___
|
|
|
|
$code.=<<___ if ($avx>1);
|
|
|
|
.LSEH_info_sha1_block_data_order_avx2:
|
|
|
|
.byte 9,0,0,0
|
|
|
|
.rva ssse3_handler
|
|
|
|
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
|
|
|
|
___
|
|
|
|
}
|
|
|
|
|
|
|
|
####################################################################
|
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
# sha1rnds4($operands)
#
# Turns the operand text of a "sha1rnds4 $imm,%xmmN,%xmmM" line into a
# ".byte" directive carrying the hand-assembled instruction
# (0x0f,0x3a,0xcc, ModR/M, immediate).  Presumably this lets assemblers
# without SHA-extension support build the module -- confirm with the
# perlasm documentation.
#
#   $operands  everything that followed the mnemonic on the line.
#
# Returns the ".byte\t..." string when both registers are %xmm0..%xmm7,
# otherwise the untouched instruction "sha1rnds4\t$operands" so that the
# assembler deals with it.
sub sha1rnds4 {
    my $operands = $_[0];	# was @_[0], a one-element slice

    # (?![0-9]) stops %xmm10..%xmm15 in the last operand from being
    # taken for %xmm1 and silently encoded as the wrong register.
    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])(?![0-9])/) {
	my ($imm,$rm,$reg) = ($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);

	push @opcode,0xc0|($rm&7)|(($reg&7)<<3);	# ModR/M
	# an immediate starting with 0 goes through oct() (handles 0x..
	# and 0..); anything else is copied as written
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }

    return "sha1rnds4\t".$operands;
}
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
# sha1op38($instr,$operands)
#
# Hand-assembles the register-to-register forms of sha1nexte, sha1msg1
# and sha1msg2 (opcode bytes 0x0f,0x38,0xc8..0xca) into a ".byte"
# directive, adding a REX prefix when either register is %xmm8 or above.
#
#   $instr     mnemonic as it appeared in the source line
#   $operands  everything that followed the mnemonic
#
# Returns the ".byte\t..." string, or "$instr\t$operands" unchanged when
# the mnemonic is not one of the three above, the operands are not two
# %xmm registers, or a register number is above 15 (such a number cannot
# be expressed with one REX bit plus three ModR/M bits and used to be
# masked into a different register).
sub sha1op38 {
    my ($instr,$operands) = @_;		# second operand was read as @_[0]
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);
    my $opbyte = $opcodelet{$instr};

    if (defined($opbyte) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/
	&& $1<16 && $2<16) {
	my ($rm,$reg) = ($1,$2);
	my @opcode=(0x0f,0x38);
	my $rex=0;

	$rex|=0x04			if ($reg>=8);
	$rex|=0x01			if ($rm>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opbyte;
	push @opcode,0xc0|($rm&7)|(($reg&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }

    return $instr."\t".$operands;
}
|
2014-06-20 20:00:00 +01:00
|
|
|
|
2014-06-20 20:00:00 +01:00
|
|
|
# Final pass: walk the accumulated assembly text line by line, rewrite
# what needs rewriting and print the result to STDOUT.
foreach (split("\n",$code)) {
	# replace every `...` span with the value of the Perl expression
	# between the backticks
	s/\`([^\`]*)\`/eval $1/geo;

	# sha1rnds4 has its own encoder; any other sha1* token followed by
	# whitespace is handed to sha1op38, which passes through whatever
	# it does not recognise
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}

# Buffered output errors are only reported by close, so an unchecked
# close could leave truncated output behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";
|