Browse Source

x86_64 assembly pack: Win64 SEH face-lift.

(Imports upstream's 384e6de4c7. Changes to
P-256 assembly dropped because we're so different there.)

 - harmonize handlers with guidelines and themselves;
 - fix some bugs in handlers;

Change-Id: Ic0b6a37bed6baedc50448c72fab088327f12898d
Reviewed-on: https://boringssl-review.googlesource.com/13782
Commit-Queue: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
kris/onging/CECPQ3_patch15
Adam Langley 7 years ago
committed by CQ bot account: commit-bot@chromium.org
parent
commit
cb1b333c2b
10 changed files with 1732 additions and 469 deletions
  1. +20
    -22
      crypto/aes/asm/aes-x86_64.pl
  2. +1144
    -130
      crypto/aes/asm/aesni-x86_64.pl
  3. +88
    -73
      crypto/aes/asm/bsaes-x86_64.pl
  4. +1
    -1
      crypto/bn/asm/rsaz-avx2.pl
  5. +6
    -6
      crypto/bn/asm/x86_64-mont.pl
  6. +2
    -2
      crypto/bn/asm/x86_64-mont5.pl
  7. +343
    -110
      crypto/chacha/asm/chacha-x86_64.pl
  8. +23
    -12
      crypto/modes/asm/ghash-x86_64.pl
  9. +60
    -67
      crypto/sha/asm/sha1-x86_64.pl
  10. +45
    -46
      crypto/sha/asm/sha512-x86_64.pl

+ 20
- 22
crypto/aes/asm/aes-x86_64.pl View File

@@ -590,6 +590,7 @@ $code.=<<___;
.type asm_AES_encrypt,\@function,3
.hidden asm_AES_encrypt
asm_AES_encrypt:
mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -598,7 +599,6 @@ asm_AES_encrypt:
push %r15

# allocate frame "above" key schedule
mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
@@ -608,7 +608,7 @@ asm_AES_encrypt:
sub \$32,%rsp

mov %rsi,16(%rsp) # save out
mov %r10,24(%rsp) # save real stack pointer
mov %rax,24(%rsp) # save original stack pointer
.Lenc_prologue:

mov %rdx,$key
@@ -640,13 +640,13 @@ asm_AES_encrypt:
mov $s2,8($out)
mov $s3,12($out)

mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lenc_epilogue:
ret
.size asm_AES_encrypt,.-asm_AES_encrypt
@@ -1186,6 +1186,7 @@ $code.=<<___;
.type asm_AES_decrypt,\@function,3
.hidden asm_AES_decrypt
asm_AES_decrypt:
mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -1194,7 +1195,6 @@ asm_AES_decrypt:
push %r15

# allocate frame "above" key schedule
mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
@@ -1204,7 +1204,7 @@ asm_AES_decrypt:
sub \$32,%rsp

mov %rsi,16(%rsp) # save out
mov %r10,24(%rsp) # save real stack pointer
mov %rax,24(%rsp) # save original stack pointer
.Ldec_prologue:

mov %rdx,$key
@@ -1238,13 +1238,13 @@ asm_AES_decrypt:
mov $s2,8($out)
mov $s3,12($out)

mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Ldec_epilogue:
ret
.size asm_AES_decrypt,.-asm_AES_decrypt
@@ -1660,10 +1660,9 @@ asm_AES_cbc_encrypt:
mov %r9d,%r9d # clear upper half of enc

lea .LAES_Te(%rip),$sbox
lea .LAES_Td(%rip),%r10
cmp \$0,%r9
jne .Lcbc_picked_te
lea .LAES_Td(%rip),$sbox
.Lcbc_picked_te:
cmoveq %r10,$sbox

mov OPENSSL_ia32cap_P(%rip),%r10d
cmp \$$speed_limit,%rdx
@@ -2565,7 +2564,6 @@ block_se_handler:
jae .Lin_block_prologue

mov 24(%rax),%rax # pull saved real stack pointer
lea 48(%rax),%rax # adjust...

mov -8(%rax),%rbx
mov -16(%rax),%rbp


+ 1144
- 130
crypto/aes/asm/aesni-x86_64.pl
File diff suppressed because it is too large
View File


+ 88
- 73
crypto/aes/asm/bsaes-x86_64.pl View File

@@ -1327,7 +1327,7 @@ $code.=<<___;
cmp %rax, %rbp
jb .Lecb_enc_bzero

lea (%rbp),%rsp # restore %rsp
lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1340,17 +1340,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
lea 0xa0(%rax), %rax
.Lecb_enc_tail:
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
mov -48(%rax), %r15
mov -40(%rax), %r14
mov -32(%rax), %r13
mov -24(%rax), %r12
mov -16(%rax), %rbx
mov -8(%rax), %rbp
lea (%rax), %rsp # restore %rsp
.Lecb_enc_epilogue:
ret
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
@@ -1529,7 +1529,7 @@ $code.=<<___;
cmp %rax, %rbp
jb .Lecb_dec_bzero

lea (%rbp),%rsp # restore %rsp
lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1542,17 +1542,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
lea 0xa0(%rax), %rax
.Lecb_dec_tail:
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
mov -48(%rax), %r15
mov -40(%rax), %r14
mov -32(%rax), %r13
mov -24(%rax), %r12
mov -16(%rax), %rbx
mov -8(%rax), %rbp
lea (%rax), %rsp # restore %rsp
.Lecb_dec_epilogue:
ret
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
@@ -1819,7 +1819,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lcbc_dec_bzero

lea (%rbp),%rsp # restore %rsp
lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1832,17 +1832,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
lea 0xa0(%rax), %rax
.Lcbc_dec_tail:
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
mov -48(%rax), %r15
mov -40(%rax), %r14
mov -32(%rax), %r13
mov -24(%rax), %r12
mov -16(%rax), %rbx
mov -8(%rax), %rbp
lea (%rax), %rsp # restore %rsp
.Lcbc_dec_epilogue:
ret
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
@@ -2051,7 +2051,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lctr_enc_bzero

lea (%rbp),%rsp # restore %rsp
lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2064,17 +2064,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
lea 0xa0(%rax), %rax
.Lctr_enc_tail:
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
mov -48(%rax), %r15
mov -40(%rax), %r14
mov -32(%rax), %r13
mov -24(%rax), %r12
mov -16(%rax), %rbx
mov -8(%rax), %rbp
lea (%rax), %rsp # restore %rsp
.Lctr_enc_epilogue:
ret
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
@@ -2441,7 +2441,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lxts_enc_bzero

lea (%rbp),%rsp # restore %rsp
lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2454,17 +2454,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
lea 0xa0(%rax), %rax
.Lxts_enc_tail:
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
mov -48(%rax), %r15
mov -40(%rax), %r14
mov -32(%rax), %r13
mov -24(%rax), %r12
mov -16(%rax), %rbx
mov -8(%rax), %rbp
lea (%rax), %rsp # restore %rsp
.Lxts_enc_epilogue:
ret
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
@@ -2848,7 +2848,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lxts_dec_bzero

lea (%rbp),%rsp # restore %rsp
lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2861,17 +2861,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
lea 0xa0(%rax), %rax
.Lxts_dec_tail:
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
mov -48(%rax), %r15
mov -40(%rax), %r14
mov -32(%rax), %r13
mov -24(%rax), %r12
mov -16(%rax), %rbx
mov -8(%rax), %rbp
lea (%rax), %rsp # restore %rsp
.Lxts_dec_epilogue:
ret
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
@@ -2967,31 +2967,34 @@ se_handler:

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue

mov 152($context),%rax # pull context->Rsp
cmp %r10,%rbx # context->Rip<=prologue label
jbe .Lin_prologue

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue

mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # tail label
cmp %r10,%rbx # context->Rip>=tail label
jae .Lin_tail

mov 160($context),%rax # pull context->Rbp

lea 0x40(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0xa0(%rax),%rax # adjust stack pointer
mov 0x70(%rax),%rbp
mov 0x68(%rax),%rbx
mov 0x60(%rax),%r12
mov 0x58(%rax),%r13
mov 0x50(%rax),%r14
mov 0x48(%rax),%r15
lea 0x78(%rax),%rax # adjust stack pointer
lea 0xa0+0x78(%rax),%rax # adjust stack pointer
.Lin_tail:
mov -48(%rax),%rbp
mov -40(%rax),%rbx
mov -32(%rax),%r12
mov -24(%rax),%r13
mov -16(%rax),%r14
mov -8(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
@@ -3072,28 +3075,40 @@ $code.=<<___ if ($ecb);
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
.rva .Lecb_enc_tail
.long 0
.Lecb_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
.rva .Lecb_dec_tail
.long 0
___
$code.=<<___;
.Lcbc_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
.rva .Lcbc_dec_tail
.long 0
.Lctr_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
.rva .Lctr_enc_tail
.long 0
.Lxts_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.rva .Lxts_enc_tail
.long 0
.Lxts_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
.rva .Lxts_dec_tail
.long 0
___
}



+ 1
- 1
crypto/bn/asm/rsaz-avx2.pl View File

@@ -1717,11 +1717,11 @@ $code.=<<___ if ($win64);
movaps -0x38(%r11),%xmm13
movaps -0x28(%r11),%xmm14
movaps -0x18(%r11),%xmm15
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
lea (%r11),%rsp
ret
.LSEH_end_rsaz_1024_gather5:
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}


+ 6
- 6
crypto/bn/asm/x86_64-mont.pl View File

@@ -675,10 +675,11 @@ ___
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
lea -4($num),$j
mov 0(%rsp),@ri[0] # tp[0]
pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$num # num/=4
shr \$2,$j # j=num/4-1
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!

@@ -686,7 +687,6 @@ $code.=<<___;
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
lea -1($num),$j # j=num/4-1
jmp .Lsub4x
.align 16
.Lsub4x:
@@ -720,8 +720,9 @@ $code.=<<___;
not @ri[0]
mov $rp,$np
and @ri[0],$np
lea -1($num),$j
lea -4($num),$j
or $np,$ap # ap=borrow?tp:rp
shr \$2,$j # j=num/4-1

movdqu ($ap),%xmm1
movdqa %xmm0,(%rsp)
@@ -739,7 +740,6 @@ $code.=<<___;
dec $j
jnz .Lcopy4x

shl \$2,$num
movdqu 16($ap,$i),%xmm2
movdqa %xmm0,16(%rsp,$i)
movdqu %xmm2,16($rp,$i)
@@ -1381,12 +1381,12 @@ sqr_handler:

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<.Lsqr_body
cmp %r10,%rbx # context->Rip<.Lsqr_prologue
jb .Lcommon_seh_tail

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # body label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
cmp %r10,%rbx # context->Rip<.Lsqr_body
jb .Lcommon_pop_regs

mov 152($context),%rax # pull context->Rsp


+ 2
- 2
crypto/bn/asm/x86_64-mont5.pl View File

@@ -3630,8 +3630,8 @@ mul_handler:
jb .Lcommon_seh_tail

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
lea (%rsi,%r10),%r10 # beginning of body label
cmp %r10,%rbx # context->Rip<body label
jb .Lcommon_pop_regs

mov 152($context),%rax # pull context->Rsp


+ 343
- 110
crypto/chacha/asm/chacha-x86_64.pl View File

@@ -245,6 +245,7 @@ $code.=<<___;
push %r14
push %r15
sub \$64+24,%rsp
.Lctr32_body:

#movdqa .Lsigma(%rip),%xmm0
movdqu ($key),%xmm1
@@ -383,13 +384,14 @@ $code.=<<___;
jnz .Loop_tail

.Ldone:
add \$64+24,%rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
lea 64+24+48(%rsp),%rsi
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lno_data:
ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
@@ -424,31 +426,26 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
&por ($b,$t);
}

my $xframe = $win64 ? 32+32+8 : 24;
my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type ChaCha20_ssse3,\@function,5
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
mov %rsp,%r9 # frame pointer
___
$code.=<<___;
cmp \$128,$len # we might throw away some data,
ja .LChaCha20_4x # but overall it won't be slower

.Ldo_sse3_after_all:
push %rbx # just to share SEH handler, no pops
push %rbp
push %r12
push %r13
push %r14
push %r15

sub \$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,64+32(%rsp)
movaps %xmm7,64+48(%rsp)
movaps %xmm6,-0x28(%r9)
movaps %xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
movdqa .Lsigma(%rip),$a
@@ -543,11 +540,12 @@ $code.=<<___;
.Ldone_ssse3:
___
$code.=<<___ if ($win64);
movaps 64+32(%rsp),%xmm6
movaps 64+48(%rsp),%xmm7
movaps -0x28(%r9),%xmm6
movaps -0x18(%r9),%xmm7
___
$code.=<<___;
add \$64+$xframe+48,%rsp
lea (%r9),%rsp
.Lssse3_epilogue:
ret
.size ChaCha20_ssse3,.-ChaCha20_ssse3
___
@@ -684,13 +682,14 @@ my @x=map("\"$_\"",@xx);
);
}

my $xframe = $win64 ? 0xa0 : 0;
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_4x,\@function,5
.align 32
ChaCha20_4x:
.LChaCha20_4x:
mov %rsp,%r9 # frame pointer
mov %r10,%r11
___
$code.=<<___ if ($avx>1);
@@ -707,8 +706,7 @@ $code.=<<___;
je .Ldo_sse3_after_all # to detect Atom

.Lproceed4x:
lea -0x78(%rsp),%r11
sub \$0x148+$xframe,%rsp
sub \$0x140+$xframe,%rsp
___
################ stack layout
# +0x00 SIMD equivalent of @x[8-12]
@@ -719,16 +717,17 @@ ___
# ...
# +0x140
$code.=<<___ if ($win64);
movaps %xmm6,-0x30(%r11)
movaps %xmm7,-0x20(%r11)
movaps %xmm8,-0x10(%r11)
movaps %xmm9,0x00(%r11)
movaps %xmm10,0x10(%r11)
movaps %xmm11,0x20(%r11)
movaps %xmm12,0x30(%r11)
movaps %xmm13,0x40(%r11)
movaps %xmm14,0x50(%r11)
movaps %xmm15,0x60(%r11)
movaps %xmm6,-0xa8(%r9)
movaps %xmm7,-0x98(%r9)
movaps %xmm8,-0x88(%r9)
movaps %xmm9,-0x78(%r9)
movaps %xmm10,-0x68(%r9)
movaps %xmm11,-0x58(%r9)
movaps %xmm12,-0x48(%r9)
movaps %xmm13,-0x38(%r9)
movaps %xmm14,-0x28(%r9)
movaps %xmm15,-0x18(%r9)
.L4x_body:
___
$code.=<<___;
movdqa .Lsigma(%rip),$xa3 # key[0]
@@ -1117,20 +1116,20 @@ $code.=<<___;
.Ldone4x:
___
$code.=<<___ if ($win64);
lea 0x140+0x30(%rsp),%r11
movaps -0x30(%r11),%xmm6
movaps -0x20(%r11),%xmm7
movaps -0x10(%r11),%xmm8
movaps 0x00(%r11),%xmm9
movaps 0x10(%r11),%xmm10
movaps 0x20(%r11),%xmm11
movaps 0x30(%r11),%xmm12
movaps 0x40(%r11),%xmm13
movaps 0x50(%r11),%xmm14
movaps 0x60(%r11),%xmm15
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
___
$code.=<<___;
add \$0x148+$xframe,%rsp
lea (%r9),%rsp
.L4x_epilogue:
ret
.size ChaCha20_4x,.-ChaCha20_4x
___
@@ -1258,33 +1257,32 @@ my @x=map("\"$_\"",@xx);
);
}

my $xframe = $win64 ? 0xb0 : 8;
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_8x,\@function,5
.align 32
ChaCha20_8x:
.LChaCha20_8x:
mov %rsp,%r10
mov %rsp,%r9 # frame register
sub \$0x280+$xframe,%rsp
and \$-32,%rsp
___
$code.=<<___ if ($win64);
lea 0x290+0x30(%rsp),%r11
movaps %xmm6,-0x30(%r11)
movaps %xmm7,-0x20(%r11)
movaps %xmm8,-0x10(%r11)
movaps %xmm9,0x00(%r11)
movaps %xmm10,0x10(%r11)
movaps %xmm11,0x20(%r11)
movaps %xmm12,0x30(%r11)
movaps %xmm13,0x40(%r11)
movaps %xmm14,0x50(%r11)
movaps %xmm15,0x60(%r11)
movaps %xmm6,-0xa8(%r9)
movaps %xmm7,-0x98(%r9)
movaps %xmm8,-0x88(%r9)
movaps %xmm9,-0x78(%r9)
movaps %xmm10,-0x68(%r9)
movaps %xmm11,-0x58(%r9)
movaps %xmm12,-0x48(%r9)
movaps %xmm13,-0x38(%r9)
movaps %xmm14,-0x28(%r9)
movaps %xmm15,-0x18(%r9)
.L8x_body:
___
$code.=<<___;
vzeroupper
mov %r10,0x280(%rsp)

################ stack layout
# +0x00 SIMD equivalent of @x[8-12]
@@ -1293,7 +1291,7 @@ $code.=<<___;
# ...
# +0x200 SIMD counters (with nonce smashed by lanes)
# ...
# +0x280 saved %rsp
# +0x280

vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
vbroadcasti128 ($key),$xb3 # key[1]
@@ -1759,20 +1757,20 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
lea 0x290+0x30(%rsp),%r11
movaps -0x30(%r11),%xmm6
movaps -0x20(%r11),%xmm7
movaps -0x10(%r11),%xmm8
movaps 0x00(%r11),%xmm9
movaps 0x10(%r11),%xmm10
movaps 0x20(%r11),%xmm11
movaps 0x30(%r11),%xmm12
movaps 0x40(%r11),%xmm13
movaps 0x50(%r11),%xmm14
movaps 0x60(%r11),%xmm15
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
___
$code.=<<___;
mov 0x280(%rsp),%rsp
lea (%r9),%rsp
.L8x_epilogue:
ret
.size ChaCha20_8x,.-ChaCha20_8x
___
@@ -1804,28 +1802,23 @@ sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
&vprold ($b,$b,7);
}

my $xframe = $win64 ? 32+32+8 : 24;
my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type ChaCha20_avx512,\@function,5
.align 32
ChaCha20_avx512:
.LChaCha20_avx512:
mov %rsp,%r9 # frame pointer
cmp \$512,$len
ja .LChaCha20_16x

push %rbx # just to share SEH handler, no pops
push %rbp
push %r12
push %r13
push %r14
push %r15

sub \$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,64+32(%rsp)
movaps %xmm7,64+48(%rsp)
movaps %xmm6,-0x28(%r9)
movaps %xmm7,-0x18(%r9)
.Lavx512_body:
___
$code.=<<___;
vbroadcasti32x4 .Lsigma(%rip),$a
@@ -1991,11 +1984,12 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
movaps 64+32(%rsp),%xmm6
movaps 64+48(%rsp),%xmm7
movaps -0x28(%r9),%xmm6
movaps -0x18(%r9),%xmm7
___
$code.=<<___;
add \$64+$xframe+48,%rsp
lea (%r9),%rsp
.Lavx512_epilogue:
ret
.size ChaCha20_avx512,.-ChaCha20_avx512
___
@@ -2072,29 +2066,29 @@ my @x=map("\"$_\"",@xx);
);
}

my $xframe = $win64 ? 0xb0 : 8;
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_16x,\@function,5
.align 32
ChaCha20_16x:
.LChaCha20_16x:
mov %rsp,%r11
mov %rsp,%r9 # frame register
sub \$64+$xframe,%rsp
and \$-64,%rsp
___
$code.=<<___ if ($win64);
lea 0x290+0x30(%rsp),%r11
movaps %xmm6,-0x30(%r11)
movaps %xmm7,-0x20(%r11)
movaps %xmm8,-0x10(%r11)
movaps %xmm9,0x00(%r11)
movaps %xmm10,0x10(%r11)
movaps %xmm11,0x20(%r11)
movaps %xmm12,0x30(%r11)
movaps %xmm13,0x40(%r11)
movaps %xmm14,0x50(%r11)
movaps %xmm15,0x60(%r11)
movaps %xmm6,-0xa8(%r9)
movaps %xmm7,-0x98(%r9)
movaps %xmm8,-0x88(%r9)
movaps %xmm9,-0x78(%r9)
movaps %xmm10,-0x68(%r9)
movaps %xmm11,-0x58(%r9)
movaps %xmm12,-0x48(%r9)
movaps %xmm13,-0x38(%r9)
movaps %xmm14,-0x28(%r9)
movaps %xmm15,-0x18(%r9)
.L16x_body:
___
$code.=<<___;
vzeroupper
@@ -2484,25 +2478,264 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
lea 0x290+0x30(%rsp),%r11
movaps -0x30(%r11),%xmm6
movaps -0x20(%r11),%xmm7
movaps -0x10(%r11),%xmm8
movaps 0x00(%r11),%xmm9
movaps 0x10(%r11),%xmm10
movaps 0x20(%r11),%xmm11
movaps 0x30(%r11),%xmm12
movaps 0x40(%r11),%xmm13
movaps 0x50(%r11),%xmm14
movaps 0x60(%r11),%xmm15
movaps -0xa8(%r9),%xmm6
movaps -0x98(%r9),%xmm7
movaps -0x88(%r9),%xmm8
movaps -0x78(%r9),%xmm9
movaps -0x68(%r9),%xmm10
movaps -0x58(%r9),%xmm11
movaps -0x48(%r9),%xmm12
movaps -0x38(%r9),%xmm13
movaps -0x28(%r9),%xmm14
movaps -0x18(%r9),%xmm15
___
$code.=<<___;
mov %r11,%rsp
lea (%r9),%rsp
.L16x_epilogue:
ret
.size ChaCha20_16x,.-ChaCha20_16x
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
# se_handler -- Win64 exception handler for ChaCha20_ctr32 (it is the
# handler named in .LSEH_info_ChaCha20_ctr32). It rewrites the CONTEXT
# record so that it describes the caller's frame, hands the result to
# RtlVirtualUnwind and returns ExceptionContinueSearch.
# Arguments, in prototype order: rec, frame, context, disp.
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
# Save the non-volatile registers and the flags; they are restored in
# reverse order right before the final ret.
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp # scratch area, 32..56(%rsp) carry args 5-8 of the call below

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData (not read by this handler)

# Fault before .Lctr32_body: the frame is not complete yet, so skip the
# register recovery and enter the common tail with rax = context->Rax.
# NOTE(review): this presumes Rax still holds the entry-time stack pointer
# at that point -- confirm against the generated Win64 prologue.
lea .Lctr32_body(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lctr32_body
jb .Lcommon_seh_tail

mov 152($context),%rax # pull context->Rsp

# Fault at or past .Lno_data: only the ret is left there, so context->Rsp
# is taken as is.
lea .Lno_data(%rip),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lno_data
jae .Lcommon_seh_tail

# Fault inside the body: step over the 64+24 bytes of locals and the six
# pushed registers (48 bytes) to reach the entry-time stack pointer.
lea 64+24+48(%rax),%rax

# Reload the six saved registers from just below that address and write
# them into the CONTEXT record.
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

# Shared tail, also entered from ssse3_handler and full_handler.
# On entry rax holds the value to store as context->Rsp.
.Lcommon_seh_tail:
# NOTE(review): 8(%rax) and 16(%rax) are presumably the slots where the
# generated Win64 prologue parks rdi and rsi -- confirm against the
# translator script.
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

# Copy the patched CONTEXT over disp->ContextRecord.
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT), as the quadword count for rep movsq
.long 0xa548f3fc # cld; rep movsq

# Set up the eight arguments of RtlVirtualUnwind: four in registers, four
# in the scratch area reserved above.
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler

# ssse3_handler -- Win64 exception handler used by ChaCha20_ssse3 and
# ChaCha20_avx512 (both .LSEH_info entries name it). HandlerData[] holds
# two RVAs: the body label and the epilogue label of the function.
# Those functions keep the entry-time stack pointer in r9 and store
# xmm6/xmm7 at -0x28 and -0x18 from it, which is what this handler relies
# on. It finishes in .Lcommon_seh_tail inside se_handler.
# Arguments, in prototype order: rec, frame, context, disp.
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
# Same save sequence and scratch area as se_handler, because the shared
# tail undoes exactly this layout.
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

# Fault before the body label: nothing to recover, enter the tail with
# rax = context->Rax.
# NOTE(review): this presumes Rax still holds the entry-time stack pointer
# at that point -- confirm against the generated Win64 prologue.
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail

mov 192($context),%rax # pull context->R9, the frame pointer of the faulting function

# Fault at or past the epilogue label: the xmm registers were already
# reloaded, so only the stack pointer (from R9) is needed.
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail

# Fault inside the body: copy the saved xmm6 and xmm7 (2 x 16 bytes, i.e.
# 4 quadwords) from the frame into the CONTEXT record.
lea -0x28(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$4,%ecx
.long 0xa548f3fc # cld; rep movsq

jmp .Lcommon_seh_tail
.size ssse3_handler,.-ssse3_handler

# full_handler -- Win64 exception handler used by ChaCha20_4x, ChaCha20_8x
# and ChaCha20_16x (their .LSEH_info entries name it). HandlerData[] holds
# two RVAs: the body label and the epilogue label of the function.
# Those functions keep the entry-time stack pointer in r9 and store xmm6
# through xmm15 in the 0xa0 bytes starting at -0xa8 from it. It finishes
# in .Lcommon_seh_tail inside se_handler.
# Arguments, in prototype order: rec, frame, context, disp.
.type full_handler,\@abi-omnipotent
.align 16
full_handler:
# Same save sequence and scratch area as se_handler, because the shared
# tail undoes exactly this layout.
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

# Fault before the body label: nothing to recover, enter the tail with
# rax = context->Rax.
# NOTE(review): this presumes Rax still holds the entry-time stack pointer
# at that point -- confirm against the generated Win64 prologue.
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail

mov 192($context),%rax # pull context->R9, the frame pointer of the faulting function

# Fault at or past the epilogue label: the xmm registers were already
# reloaded, so only the stack pointer (from R9) is needed.
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail

# Fault inside the body: copy the saved xmm6..xmm15 (10 x 16 bytes, i.e.
# 20 quadwords) from the frame into the CONTEXT record.
lea -0xa8(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq

jmp .Lcommon_seh_tail
.size full_handler,.-full_handler

.section .pdata
.align 4
.rva .LSEH_begin_ChaCha20_ctr32
.rva .LSEH_end_ChaCha20_ctr32
.rva .LSEH_info_ChaCha20_ctr32

.rva .LSEH_begin_ChaCha20_ssse3
.rva .LSEH_end_ChaCha20_ssse3
.rva .LSEH_info_ChaCha20_ssse3

.rva .LSEH_begin_ChaCha20_4x
.rva .LSEH_end_ChaCha20_4x
.rva .LSEH_info_ChaCha20_4x
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_ChaCha20_8x
.rva .LSEH_end_ChaCha20_8x
.rva .LSEH_info_ChaCha20_8x
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_ChaCha20_avx512
.rva .LSEH_end_ChaCha20_avx512
.rva .LSEH_info_ChaCha20_avx512

.rva .LSEH_begin_ChaCha20_16x
.rva .LSEH_end_ChaCha20_16x
.rva .LSEH_info_ChaCha20_16x
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_ChaCha20_ctr32:
.byte 9,0,0,0
.rva se_handler

.LSEH_info_ChaCha20_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lssse3_body,.Lssse3_epilogue

.LSEH_info_ChaCha20_4x:
.byte 9,0,0,0
.rva full_handler
.rva .L4x_body,.L4x_epilogue
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
.byte 9,0,0,0
.rva full_handler
.rva .L8x_body,.L8x_epilogue # HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]

.LSEH_info_ChaCha20_16x:
.byte 9,0,0,0
.rva full_handler
.rva .L16x_body,.L16x_epilogue # HandlerData[]
___
}

foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;



+ 23
- 12
crypto/modes/asm/ghash-x86_64.pl View File

@@ -219,8 +219,12 @@ $code=<<___;
.align 16
gcm_gmult_4bit:
push %rbx
push %rbp # %rbp and %r12 are pushed exclusively in
push %rbp # %rbp and others are pushed exclusively in
push %r12 # order to reuse Win64 exception handler...
push %r13
push %r14
push %r15
sub \$280,%rsp
.Lgmult_prologue:

movzb 15($Xi),$Zlo
@@ -231,8 +235,9 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)

mov 16(%rsp),%rbx
lea 24(%rsp),%rsp
lea 280+48(%rsp),%rsi
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lgmult_epilogue:
ret
.size gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -382,14 +387,14 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)

lea 280(%rsp),%rsi
mov 0(%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
lea 280+48(%rsp),%rsi
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea 0(%rsi),%rsp
.Lghash_epilogue:
ret
.size gcm_ghash_4bit,.-gcm_ghash_4bit
@@ -1630,14 +1635,20 @@ se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue

lea 24(%rax),%rax # adjust "rsp"
lea 48+280(%rax),%rax # adjust "rsp"

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

.Lin_prologue:
mov 8(%rax),%rdi


+ 60
- 67
crypto/sha/asm/sha1-x86_64.pl View File

@@ -447,7 +447,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r11";
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
@@ -468,7 +469,7 @@ $code.=<<___;
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
mov %rsp,%rax
mov %rsp,$fp # frame pointer
push %rbx
push %rbp
push %r12
@@ -477,16 +478,15 @@ _ssse3_shortcut:
lea `-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,-40-6*16(%rax)
movaps %xmm7,-40-5*16(%rax)
movaps %xmm8,-40-4*16(%rax)
movaps %xmm9,-40-3*16(%rax)
movaps %xmm10,-40-2*16(%rax)
movaps %xmm11,-40-1*16(%rax)
movaps %xmm6,-40-6*16($fp)
movaps %xmm7,-40-5*16($fp)
movaps %xmm8,-40-4*16($fp)
movaps %xmm9,-40-3*16($fp)
movaps %xmm10,-40-2*16($fp)
movaps %xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
@@ -893,21 +893,20 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
movaps -40-6*16(%r14),%xmm6
movaps -40-5*16(%r14),%xmm7
movaps -40-4*16(%r14),%xmm8
movaps -40-3*16(%r14),%xmm9
movaps -40-2*16(%r14),%xmm10
movaps -40-1*16(%r14),%xmm11
movaps -40-6*16($fp),%xmm6
movaps -40-5*16($fp),%xmm7
movaps -40-4*16($fp),%xmm8
movaps -40-3*16($fp),%xmm9
movaps -40-2*16($fp),%xmm10
movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
lea (%r14),%rsi
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
mov -40($fp),%r14
mov -32($fp),%r13
mov -24($fp),%r12
mov -16($fp),%rbp
mov -8($fp),%rbx
lea ($fp),%rsp
.Lepilogue_ssse3:
ret
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -930,7 +929,7 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
mov %rsp,%rax
mov %rsp,$fp
push %rbx
push %rbp
push %r12
@@ -940,16 +939,15 @@ _avx_shortcut:
vzeroupper
___
$code.=<<___ if ($win64);
vmovaps %xmm6,-40-6*16(%rax)
vmovaps %xmm7,-40-5*16(%rax)
vmovaps %xmm8,-40-4*16(%rax)
vmovaps %xmm9,-40-3*16(%rax)
vmovaps %xmm10,-40-2*16(%rax)
vmovaps %xmm11,-40-1*16(%rax)
vmovaps %xmm6,-40-6*16($fp)
vmovaps %xmm7,-40-5*16($fp)
vmovaps %xmm8,-40-4*16($fp)
vmovaps %xmm9,-40-3*16($fp)
vmovaps %xmm10,-40-2*16($fp)
vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
@@ -1257,21 +1255,20 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
movaps -40-6*16(%r14),%xmm6
movaps -40-5*16(%r14),%xmm7
movaps -40-4*16(%r14),%xmm8
movaps -40-3*16(%r14),%xmm9
movaps -40-2*16(%r14),%xmm10
movaps -40-1*16(%r14),%xmm11
movaps -40-6*16($fp),%xmm6
movaps -40-5*16($fp),%xmm7
movaps -40-4*16($fp),%xmm8
movaps -40-3*16($fp),%xmm9
movaps -40-2*16($fp),%xmm10
movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
lea (%r14),%rsi
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
mov -40($fp),%r14
mov -32($fp),%r13
mov -24($fp),%r12
mov -16($fp),%rbp
mov -8($fp),%rbx
lea ($fp),%rsp
.Lepilogue_avx:
ret
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
@@ -1297,7 +1294,7 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx2:
_avx2_shortcut:
mov %rsp,%rax
mov %rsp,$fp
push %rbx
push %rbp
push %r12
@@ -1307,16 +1304,15 @@ _avx2_shortcut:
___
$code.=<<___ if ($win64);
lea -6*16(%rsp),%rsp
vmovaps %xmm6,-40-6*16(%rax)
vmovaps %xmm7,-40-5*16(%rax)
vmovaps %xmm8,-40-4*16(%rax)
vmovaps %xmm9,-40-3*16(%rax)
vmovaps %xmm10,-40-2*16(%rax)
vmovaps %xmm11,-40-1*16(%rax)
vmovaps %xmm6,-40-6*16($fp)
vmovaps %xmm7,-40-5*16($fp)
vmovaps %xmm8,-40-4*16($fp)
vmovaps %xmm9,-40-3*16($fp)
vmovaps %xmm10,-40-2*16($fp)
vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
mov %rax,%r14 # original %rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
@@ -1736,21 +1732,20 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
movaps -40-6*16(%r14),%xmm6
movaps -40-5*16(%r14),%xmm7
movaps -40-4*16(%r14),%xmm8
movaps -40-3*16(%r14),%xmm9
movaps -40-2*16(%r14),%xmm10
movaps -40-1*16(%r14),%xmm11
movaps -40-6*16($fp),%xmm6
movaps -40-5*16($fp),%xmm7
movaps -40-4*16($fp),%xmm8
movaps -40-3*16($fp),%xmm9
movaps -40-2*16($fp),%xmm10
movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
lea (%r14),%rsi
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
mov -40($fp),%r14
mov -32($fp),%r13
mov -24($fp),%r12
mov -16($fp),%rbp
mov -8($fp),%rbx
lea ($fp),%rsp
.Lepilogue_avx2:
ret
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
@@ -1893,15 +1888,13 @@ ssse3_handler:
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail

mov 152($context),%rax # pull context->Rsp
mov 208($context),%rax # pull context->R11

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail

mov 232($context),%rax # pull context->R14

lea -40-6*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$12,%ecx


+ 45
- 46
crypto/sha/asm/sha512-x86_64.pl View File

@@ -286,13 +286,13 @@ $code.=<<___ if ($SZ==4);
jnz .Lssse3_shortcut
___
$code.=<<___;
mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -300,7 +300,7 @@ $code.=<<___;
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
mov %rax,$_rsp # save copy of %rsp
.Lprologue:

mov $SZ*0($ctx),$A
@@ -367,13 +367,13 @@ $code.=<<___;
jb .Lloop

mov $_rsp,%rsi
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lepilogue:
ret
.size $func,.-$func
@@ -746,13 +746,13 @@ $code.=<<___;
.align 64
${func}_ssse3:
.Lssse3_shortcut:
mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*4`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -760,7 +760,7 @@ ${func}_ssse3:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1067,13 +1067,13 @@ $code.=<<___ if ($win64);
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lepilogue_ssse3:
ret
.size ${func}_ssse3,.-${func}_ssse3
@@ -1090,13 +1090,13 @@ $code.=<<___;
.align 64
${func}_xop:
.Lxop_shortcut:
mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1104,7 +1104,7 @@ ${func}_xop:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1444,13 +1444,13 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lepilogue_xop:
ret
.size ${func}_xop,.-${func}_xop
@@ -1466,13 +1466,13 @@ $code.=<<___;
.align 64
${func}_avx:
.Lavx_shortcut:
mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1480,7 +1480,7 @@ ${func}_avx:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1752,13 +1752,13 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lepilogue_avx:
ret
.size ${func}_avx,.-${func}_avx
@@ -1817,13 +1817,13 @@ $code.=<<___;
.align 64
${func}_avx2:
.Lavx2_shortcut:
mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
@@ -1832,7 +1832,7 @@ ${func}_avx2:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -2126,13 +2126,13 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
mov -48(%rsi),%r15
mov -40(%rsi),%r14
mov -32(%rsi),%r13
mov -24(%rsi),%r12
mov -16(%rsi),%rbp
mov -8(%rsi),%rbx
lea (%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
@@ -2194,7 +2194,6 @@ ___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
lea 48(%rax),%rax

mov -8(%rax),%rbx
mov -16(%rax),%rbp


Loading…
Cancel
Save