(Imports upstream's 384e6de4c7. Changes to the P-256 assembly were
dropped because we're so different there.)
- harmonize handlers with guidelines and themselves;
- fix some bugs in handlers;
Change-Id: Ic0b6a37bed6baedc50448c72fab088327f12898d
Reviewed-on: https://boringssl-review.googlesource.com/13782
Commit-Queue: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
kris/onging/CECPQ3_patch15
@@ -590,6 +590,7 @@ $code.=<<___; | |||
.type asm_AES_encrypt,\@function,3 | |||
.hidden asm_AES_encrypt | |||
asm_AES_encrypt: | |||
mov %rsp,%rax | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
@@ -598,7 +599,6 @@ asm_AES_encrypt: | |||
push %r15 | |||
# allocate frame "above" key schedule | |||
mov %rsp,%r10 | |||
lea -63(%rdx),%rcx # %rdx is key argument | |||
and \$-64,%rsp | |||
sub %rsp,%rcx | |||
@@ -608,7 +608,7 @@ asm_AES_encrypt: | |||
sub \$32,%rsp | |||
mov %rsi,16(%rsp) # save out | |||
mov %r10,24(%rsp) # save real stack pointer | |||
mov %rax,24(%rsp) # save original stack pointer | |||
.Lenc_prologue: | |||
mov %rdx,$key | |||
@@ -640,13 +640,13 @@ asm_AES_encrypt: | |||
mov $s2,8($out) | |||
mov $s3,12($out) | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lenc_epilogue: | |||
ret | |||
.size asm_AES_encrypt,.-asm_AES_encrypt | |||
@@ -1186,6 +1186,7 @@ $code.=<<___; | |||
.type asm_AES_decrypt,\@function,3 | |||
.hidden asm_AES_decrypt | |||
asm_AES_decrypt: | |||
mov %rsp,%rax | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
@@ -1194,7 +1195,6 @@ asm_AES_decrypt: | |||
push %r15 | |||
# allocate frame "above" key schedule | |||
mov %rsp,%r10 | |||
lea -63(%rdx),%rcx # %rdx is key argument | |||
and \$-64,%rsp | |||
sub %rsp,%rcx | |||
@@ -1204,7 +1204,7 @@ asm_AES_decrypt: | |||
sub \$32,%rsp | |||
mov %rsi,16(%rsp) # save out | |||
mov %r10,24(%rsp) # save real stack pointer | |||
mov %rax,24(%rsp) # save original stack pointer | |||
.Ldec_prologue: | |||
mov %rdx,$key | |||
@@ -1238,13 +1238,13 @@ asm_AES_decrypt: | |||
mov $s2,8($out) | |||
mov $s3,12($out) | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Ldec_epilogue: | |||
ret | |||
.size asm_AES_decrypt,.-asm_AES_decrypt | |||
@@ -1660,10 +1660,9 @@ asm_AES_cbc_encrypt: | |||
mov %r9d,%r9d # clear upper half of enc | |||
lea .LAES_Te(%rip),$sbox | |||
lea .LAES_Td(%rip),%r10 | |||
cmp \$0,%r9 | |||
jne .Lcbc_picked_te | |||
lea .LAES_Td(%rip),$sbox | |||
.Lcbc_picked_te: | |||
cmoveq %r10,$sbox | |||
mov OPENSSL_ia32cap_P(%rip),%r10d | |||
cmp \$$speed_limit,%rdx | |||
@@ -2565,7 +2564,6 @@ block_se_handler: | |||
jae .Lin_block_prologue | |||
mov 24(%rax),%rax # pull saved real stack pointer | |||
lea 48(%rax),%rax # adjust... | |||
mov -8(%rax),%rbx | |||
mov -16(%rax),%rbp | |||
@@ -1327,7 +1327,7 @@ $code.=<<___; | |||
cmp %rax, %rbp | |||
jb .Lecb_enc_bzero | |||
lea (%rbp),%rsp # restore %rsp | |||
lea 0x78(%rbp),%rax | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 0x40(%rbp), %xmm6 | |||
@@ -1340,17 +1340,17 @@ $code.=<<___ if ($win64); | |||
movaps 0xb0(%rbp), %xmm13 | |||
movaps 0xc0(%rbp), %xmm14 | |||
movaps 0xd0(%rbp), %xmm15 | |||
lea 0xa0(%rbp), %rsp | |||
lea 0xa0(%rax), %rax | |||
.Lecb_enc_tail: | |||
___ | |||
$code.=<<___; | |||
mov 0x48(%rsp), %r15 | |||
mov 0x50(%rsp), %r14 | |||
mov 0x58(%rsp), %r13 | |||
mov 0x60(%rsp), %r12 | |||
mov 0x68(%rsp), %rbx | |||
mov 0x70(%rsp), %rax | |||
lea 0x78(%rsp), %rsp | |||
mov %rax, %rbp | |||
mov -48(%rax), %r15 | |||
mov -40(%rax), %r14 | |||
mov -32(%rax), %r13 | |||
mov -24(%rax), %r12 | |||
mov -16(%rax), %rbx | |||
mov -8(%rax), %rbp | |||
lea (%rax), %rsp # restore %rsp | |||
.Lecb_enc_epilogue: | |||
ret | |||
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks | |||
@@ -1529,7 +1529,7 @@ $code.=<<___; | |||
cmp %rax, %rbp | |||
jb .Lecb_dec_bzero | |||
lea (%rbp),%rsp # restore %rsp | |||
lea 0x78(%rbp),%rax | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 0x40(%rbp), %xmm6 | |||
@@ -1542,17 +1542,17 @@ $code.=<<___ if ($win64); | |||
movaps 0xb0(%rbp), %xmm13 | |||
movaps 0xc0(%rbp), %xmm14 | |||
movaps 0xd0(%rbp), %xmm15 | |||
lea 0xa0(%rbp), %rsp | |||
lea 0xa0(%rax), %rax | |||
.Lecb_dec_tail: | |||
___ | |||
$code.=<<___; | |||
mov 0x48(%rsp), %r15 | |||
mov 0x50(%rsp), %r14 | |||
mov 0x58(%rsp), %r13 | |||
mov 0x60(%rsp), %r12 | |||
mov 0x68(%rsp), %rbx | |||
mov 0x70(%rsp), %rax | |||
lea 0x78(%rsp), %rsp | |||
mov %rax, %rbp | |||
mov -48(%rax), %r15 | |||
mov -40(%rax), %r14 | |||
mov -32(%rax), %r13 | |||
mov -24(%rax), %r12 | |||
mov -16(%rax), %rbx | |||
mov -8(%rax), %rbp | |||
lea (%rax), %rsp # restore %rsp | |||
.Lecb_dec_epilogue: | |||
ret | |||
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks | |||
@@ -1819,7 +1819,7 @@ $code.=<<___; | |||
cmp %rax, %rbp | |||
ja .Lcbc_dec_bzero | |||
lea (%rbp),%rsp # restore %rsp | |||
lea 0x78(%rbp),%rax | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 0x40(%rbp), %xmm6 | |||
@@ -1832,17 +1832,17 @@ $code.=<<___ if ($win64); | |||
movaps 0xb0(%rbp), %xmm13 | |||
movaps 0xc0(%rbp), %xmm14 | |||
movaps 0xd0(%rbp), %xmm15 | |||
lea 0xa0(%rbp), %rsp | |||
lea 0xa0(%rax), %rax | |||
.Lcbc_dec_tail: | |||
___ | |||
$code.=<<___; | |||
mov 0x48(%rsp), %r15 | |||
mov 0x50(%rsp), %r14 | |||
mov 0x58(%rsp), %r13 | |||
mov 0x60(%rsp), %r12 | |||
mov 0x68(%rsp), %rbx | |||
mov 0x70(%rsp), %rax | |||
lea 0x78(%rsp), %rsp | |||
mov %rax, %rbp | |||
mov -48(%rax), %r15 | |||
mov -40(%rax), %r14 | |||
mov -32(%rax), %r13 | |||
mov -24(%rax), %r12 | |||
mov -16(%rax), %rbx | |||
mov -8(%rax), %rbp | |||
lea (%rax), %rsp # restore %rsp | |||
.Lcbc_dec_epilogue: | |||
ret | |||
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt | |||
@@ -2051,7 +2051,7 @@ $code.=<<___; | |||
cmp %rax, %rbp | |||
ja .Lctr_enc_bzero | |||
lea (%rbp),%rsp # restore %rsp | |||
lea 0x78(%rbp),%rax | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 0x40(%rbp), %xmm6 | |||
@@ -2064,17 +2064,17 @@ $code.=<<___ if ($win64); | |||
movaps 0xb0(%rbp), %xmm13 | |||
movaps 0xc0(%rbp), %xmm14 | |||
movaps 0xd0(%rbp), %xmm15 | |||
lea 0xa0(%rbp), %rsp | |||
lea 0xa0(%rax), %rax | |||
.Lctr_enc_tail: | |||
___ | |||
$code.=<<___; | |||
mov 0x48(%rsp), %r15 | |||
mov 0x50(%rsp), %r14 | |||
mov 0x58(%rsp), %r13 | |||
mov 0x60(%rsp), %r12 | |||
mov 0x68(%rsp), %rbx | |||
mov 0x70(%rsp), %rax | |||
lea 0x78(%rsp), %rsp | |||
mov %rax, %rbp | |||
mov -48(%rax), %r15 | |||
mov -40(%rax), %r14 | |||
mov -32(%rax), %r13 | |||
mov -24(%rax), %r12 | |||
mov -16(%rax), %rbx | |||
mov -8(%rax), %rbp | |||
lea (%rax), %rsp # restore %rsp | |||
.Lctr_enc_epilogue: | |||
ret | |||
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks | |||
@@ -2441,7 +2441,7 @@ $code.=<<___; | |||
cmp %rax, %rbp | |||
ja .Lxts_enc_bzero | |||
lea (%rbp),%rsp # restore %rsp | |||
lea 0x78(%rbp),%rax | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 0x40(%rbp), %xmm6 | |||
@@ -2454,17 +2454,17 @@ $code.=<<___ if ($win64); | |||
movaps 0xb0(%rbp), %xmm13 | |||
movaps 0xc0(%rbp), %xmm14 | |||
movaps 0xd0(%rbp), %xmm15 | |||
lea 0xa0(%rbp), %rsp | |||
lea 0xa0(%rax), %rax | |||
.Lxts_enc_tail: | |||
___ | |||
$code.=<<___; | |||
mov 0x48(%rsp), %r15 | |||
mov 0x50(%rsp), %r14 | |||
mov 0x58(%rsp), %r13 | |||
mov 0x60(%rsp), %r12 | |||
mov 0x68(%rsp), %rbx | |||
mov 0x70(%rsp), %rax | |||
lea 0x78(%rsp), %rsp | |||
mov %rax, %rbp | |||
mov -48(%rax), %r15 | |||
mov -40(%rax), %r14 | |||
mov -32(%rax), %r13 | |||
mov -24(%rax), %r12 | |||
mov -16(%rax), %rbx | |||
mov -8(%rax), %rbp | |||
lea (%rax), %rsp # restore %rsp | |||
.Lxts_enc_epilogue: | |||
ret | |||
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt | |||
@@ -2848,7 +2848,7 @@ $code.=<<___; | |||
cmp %rax, %rbp | |||
ja .Lxts_dec_bzero | |||
lea (%rbp),%rsp # restore %rsp | |||
lea 0x78(%rbp),%rax | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 0x40(%rbp), %xmm6 | |||
@@ -2861,17 +2861,17 @@ $code.=<<___ if ($win64); | |||
movaps 0xb0(%rbp), %xmm13 | |||
movaps 0xc0(%rbp), %xmm14 | |||
movaps 0xd0(%rbp), %xmm15 | |||
lea 0xa0(%rbp), %rsp | |||
lea 0xa0(%rax), %rax | |||
.Lxts_dec_tail: | |||
___ | |||
$code.=<<___; | |||
mov 0x48(%rsp), %r15 | |||
mov 0x50(%rsp), %r14 | |||
mov 0x58(%rsp), %r13 | |||
mov 0x60(%rsp), %r12 | |||
mov 0x68(%rsp), %rbx | |||
mov 0x70(%rsp), %rax | |||
lea 0x78(%rsp), %rsp | |||
mov %rax, %rbp | |||
mov -48(%rax), %r15 | |||
mov -40(%rax), %r14 | |||
mov -32(%rax), %r13 | |||
mov -24(%rax), %r12 | |||
mov -16(%rax), %rbx | |||
mov -8(%rax), %rbp | |||
lea (%rax), %rsp # restore %rsp | |||
.Lxts_dec_epilogue: | |||
ret | |||
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt | |||
@@ -2967,31 +2967,34 @@ se_handler: | |||
mov 0(%r11),%r10d # HandlerData[0] | |||
lea (%rsi,%r10),%r10 # prologue label | |||
cmp %r10,%rbx # context->Rip<prologue label | |||
jb .Lin_prologue | |||
mov 152($context),%rax # pull context->Rsp | |||
cmp %r10,%rbx # context->Rip<=prologue label | |||
jbe .Lin_prologue | |||
mov 4(%r11),%r10d # HandlerData[1] | |||
lea (%rsi,%r10),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=epilogue label | |||
jae .Lin_prologue | |||
mov 8(%r11),%r10d # HandlerData[2] | |||
lea (%rsi,%r10),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=tail label | |||
jae .Lin_tail | |||
mov 160($context),%rax # pull context->Rbp | |||
lea 0x40(%rax),%rsi # %xmm save area | |||
lea 512($context),%rdi # &context.Xmm6 | |||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) | |||
.long 0xa548f3fc # cld; rep movsq | |||
lea 0xa0(%rax),%rax # adjust stack pointer | |||
mov 0x70(%rax),%rbp | |||
mov 0x68(%rax),%rbx | |||
mov 0x60(%rax),%r12 | |||
mov 0x58(%rax),%r13 | |||
mov 0x50(%rax),%r14 | |||
mov 0x48(%rax),%r15 | |||
lea 0x78(%rax),%rax # adjust stack pointer | |||
lea 0xa0+0x78(%rax),%rax # adjust stack pointer | |||
.Lin_tail: | |||
mov -48(%rax),%rbp | |||
mov -40(%rax),%rbx | |||
mov -32(%rax),%r12 | |||
mov -24(%rax),%r13 | |||
mov -16(%rax),%r14 | |||
mov -8(%rax),%r15 | |||
mov %rbx,144($context) # restore context->Rbx | |||
mov %rbp,160($context) # restore context->Rbp | |||
mov %r12,216($context) # restore context->R12 | |||
@@ -3072,28 +3075,40 @@ $code.=<<___ if ($ecb); | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] | |||
.rva .Lecb_enc_tail | |||
.long 0 | |||
.Lecb_dec_info: | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] | |||
.rva .Lecb_dec_tail | |||
.long 0 | |||
___ | |||
$code.=<<___; | |||
.Lcbc_dec_info: | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] | |||
.rva .Lcbc_dec_tail | |||
.long 0 | |||
.Lctr_enc_info: | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] | |||
.rva .Lctr_enc_tail | |||
.long 0 | |||
.Lxts_enc_info: | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] | |||
.rva .Lxts_enc_tail | |||
.long 0 | |||
.Lxts_dec_info: | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] | |||
.rva .Lxts_dec_tail | |||
.long 0 | |||
___ | |||
} | |||
@@ -1717,11 +1717,11 @@ $code.=<<___ if ($win64); | |||
movaps -0x38(%r11),%xmm13 | |||
movaps -0x28(%r11),%xmm14 | |||
movaps -0x18(%r11),%xmm15 | |||
.LSEH_end_rsaz_1024_gather5: | |||
___ | |||
$code.=<<___; | |||
lea (%r11),%rsp | |||
ret | |||
.LSEH_end_rsaz_1024_gather5: | |||
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 | |||
___ | |||
} | |||
@@ -675,10 +675,11 @@ ___ | |||
my @ri=("%rax","%rdx",$m0,$m1); | |||
$code.=<<___; | |||
mov 16(%rsp,$num,8),$rp # restore $rp | |||
lea -4($num),$j | |||
mov 0(%rsp),@ri[0] # tp[0] | |||
pxor %xmm0,%xmm0 | |||
mov 8(%rsp),@ri[1] # tp[1] | |||
shr \$2,$num # num/=4 | |||
shr \$2,$j # j=num/4-1 | |||
lea (%rsp),$ap # borrow ap for tp | |||
xor $i,$i # i=0 and clear CF! | |||
@@ -686,7 +687,6 @@ $code.=<<___; | |||
mov 16($ap),@ri[2] # tp[2] | |||
mov 24($ap),@ri[3] # tp[3] | |||
sbb 8($np),@ri[1] | |||
lea -1($num),$j # j=num/4-1 | |||
jmp .Lsub4x | |||
.align 16 | |||
.Lsub4x: | |||
@@ -720,8 +720,9 @@ $code.=<<___; | |||
not @ri[0] | |||
mov $rp,$np | |||
and @ri[0],$np | |||
lea -1($num),$j | |||
lea -4($num),$j | |||
or $np,$ap # ap=borrow?tp:rp | |||
shr \$2,$j # j=num/4-1 | |||
movdqu ($ap),%xmm1 | |||
movdqa %xmm0,(%rsp) | |||
@@ -739,7 +740,6 @@ $code.=<<___; | |||
dec $j | |||
jnz .Lcopy4x | |||
shl \$2,$num | |||
movdqu 16($ap,$i),%xmm2 | |||
movdqa %xmm0,16(%rsp,$i) | |||
movdqu %xmm2,16($rp,$i) | |||
@@ -1381,12 +1381,12 @@ sqr_handler: | |||
mov 0(%r11),%r10d # HandlerData[0] | |||
lea (%rsi,%r10),%r10 # end of prologue label | |||
cmp %r10,%rbx # context->Rip<.Lsqr_body | |||
cmp %r10,%rbx # context->Rip<.Lsqr_prologue | |||
jb .Lcommon_seh_tail | |||
mov 4(%r11),%r10d # HandlerData[1] | |||
lea (%rsi,%r10),%r10 # body label | |||
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue | |||
cmp %r10,%rbx # context->Rip<.Lsqr_body | |||
jb .Lcommon_pop_regs | |||
mov 152($context),%rax # pull context->Rsp | |||
@@ -3630,8 +3630,8 @@ mul_handler: | |||
jb .Lcommon_seh_tail | |||
mov 4(%r11),%r10d # HandlerData[1] | |||
lea (%rsi,%r10),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=epilogue label | |||
lea (%rsi,%r10),%r10 # beginning of body label | |||
cmp %r10,%rbx # context->Rip<body label | |||
jb .Lcommon_pop_regs | |||
mov 152($context),%rax # pull context->Rsp | |||
@@ -245,6 +245,7 @@ $code.=<<___; | |||
push %r14 | |||
push %r15 | |||
sub \$64+24,%rsp | |||
.Lctr32_body: | |||
#movdqa .Lsigma(%rip),%xmm0 | |||
movdqu ($key),%xmm1 | |||
@@ -383,13 +384,14 @@ $code.=<<___; | |||
jnz .Loop_tail | |||
.Ldone: | |||
add \$64+24,%rsp | |||
pop %r15 | |||
pop %r14 | |||
pop %r13 | |||
pop %r12 | |||
pop %rbp | |||
pop %rbx | |||
lea 64+24+48(%rsp),%rsi | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lno_data: | |||
ret | |||
.size ChaCha20_ctr32,.-ChaCha20_ctr32 | |||
@@ -424,31 +426,26 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round | |||
&por ($b,$t); | |||
} | |||
my $xframe = $win64 ? 32+32+8 : 24; | |||
my $xframe = $win64 ? 32+8 : 8; | |||
$code.=<<___; | |||
.type ChaCha20_ssse3,\@function,5 | |||
.align 32 | |||
ChaCha20_ssse3: | |||
.LChaCha20_ssse3: | |||
mov %rsp,%r9 # frame pointer | |||
___ | |||
$code.=<<___; | |||
cmp \$128,$len # we might throw away some data, | |||
ja .LChaCha20_4x # but overall it won't be slower | |||
.Ldo_sse3_after_all: | |||
push %rbx # just to share SEH handler, no pops | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
sub \$64+$xframe,%rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,64+32(%rsp) | |||
movaps %xmm7,64+48(%rsp) | |||
movaps %xmm6,-0x28(%r9) | |||
movaps %xmm7,-0x18(%r9) | |||
.Lssse3_body: | |||
___ | |||
$code.=<<___; | |||
movdqa .Lsigma(%rip),$a | |||
@@ -543,11 +540,12 @@ $code.=<<___; | |||
.Ldone_ssse3: | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 64+32(%rsp),%xmm6 | |||
movaps 64+48(%rsp),%xmm7 | |||
movaps -0x28(%r9),%xmm6 | |||
movaps -0x18(%r9),%xmm7 | |||
___ | |||
$code.=<<___; | |||
add \$64+$xframe+48,%rsp | |||
lea (%r9),%rsp | |||
.Lssse3_epilogue: | |||
ret | |||
.size ChaCha20_ssse3,.-ChaCha20_ssse3 | |||
___ | |||
@@ -684,13 +682,14 @@ my @x=map("\"$_\"",@xx); | |||
); | |||
} | |||
my $xframe = $win64 ? 0xa0 : 0; | |||
my $xframe = $win64 ? 0xa8 : 8; | |||
$code.=<<___; | |||
.type ChaCha20_4x,\@function,5 | |||
.align 32 | |||
ChaCha20_4x: | |||
.LChaCha20_4x: | |||
mov %rsp,%r9 # frame pointer | |||
mov %r10,%r11 | |||
___ | |||
$code.=<<___ if ($avx>1); | |||
@@ -707,8 +706,7 @@ $code.=<<___; | |||
je .Ldo_sse3_after_all # to detect Atom | |||
.Lproceed4x: | |||
lea -0x78(%rsp),%r11 | |||
sub \$0x148+$xframe,%rsp | |||
sub \$0x140+$xframe,%rsp | |||
___ | |||
################ stack layout | |||
# +0x00 SIMD equivalent of @x[8-12] | |||
@@ -719,16 +717,17 @@ ___ | |||
# ... | |||
# +0x140 | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,-0x30(%r11) | |||
movaps %xmm7,-0x20(%r11) | |||
movaps %xmm8,-0x10(%r11) | |||
movaps %xmm9,0x00(%r11) | |||
movaps %xmm10,0x10(%r11) | |||
movaps %xmm11,0x20(%r11) | |||
movaps %xmm12,0x30(%r11) | |||
movaps %xmm13,0x40(%r11) | |||
movaps %xmm14,0x50(%r11) | |||
movaps %xmm15,0x60(%r11) | |||
movaps %xmm6,-0xa8(%r9) | |||
movaps %xmm7,-0x98(%r9) | |||
movaps %xmm8,-0x88(%r9) | |||
movaps %xmm9,-0x78(%r9) | |||
movaps %xmm10,-0x68(%r9) | |||
movaps %xmm11,-0x58(%r9) | |||
movaps %xmm12,-0x48(%r9) | |||
movaps %xmm13,-0x38(%r9) | |||
movaps %xmm14,-0x28(%r9) | |||
movaps %xmm15,-0x18(%r9) | |||
.L4x_body: | |||
___ | |||
$code.=<<___; | |||
movdqa .Lsigma(%rip),$xa3 # key[0] | |||
@@ -1117,20 +1116,20 @@ $code.=<<___; | |||
.Ldone4x: | |||
___ | |||
$code.=<<___ if ($win64); | |||
lea 0x140+0x30(%rsp),%r11 | |||
movaps -0x30(%r11),%xmm6 | |||
movaps -0x20(%r11),%xmm7 | |||
movaps -0x10(%r11),%xmm8 | |||
movaps 0x00(%r11),%xmm9 | |||
movaps 0x10(%r11),%xmm10 | |||
movaps 0x20(%r11),%xmm11 | |||
movaps 0x30(%r11),%xmm12 | |||
movaps 0x40(%r11),%xmm13 | |||
movaps 0x50(%r11),%xmm14 | |||
movaps 0x60(%r11),%xmm15 | |||
movaps -0xa8(%r9),%xmm6 | |||
movaps -0x98(%r9),%xmm7 | |||
movaps -0x88(%r9),%xmm8 | |||
movaps -0x78(%r9),%xmm9 | |||
movaps -0x68(%r9),%xmm10 | |||
movaps -0x58(%r9),%xmm11 | |||
movaps -0x48(%r9),%xmm12 | |||
movaps -0x38(%r9),%xmm13 | |||
movaps -0x28(%r9),%xmm14 | |||
movaps -0x18(%r9),%xmm15 | |||
___ | |||
$code.=<<___; | |||
add \$0x148+$xframe,%rsp | |||
lea (%r9),%rsp | |||
.L4x_epilogue: | |||
ret | |||
.size ChaCha20_4x,.-ChaCha20_4x | |||
___ | |||
@@ -1258,33 +1257,32 @@ my @x=map("\"$_\"",@xx); | |||
); | |||
} | |||
my $xframe = $win64 ? 0xb0 : 8; | |||
my $xframe = $win64 ? 0xa8 : 8; | |||
$code.=<<___; | |||
.type ChaCha20_8x,\@function,5 | |||
.align 32 | |||
ChaCha20_8x: | |||
.LChaCha20_8x: | |||
mov %rsp,%r10 | |||
mov %rsp,%r9 # frame register | |||
sub \$0x280+$xframe,%rsp | |||
and \$-32,%rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
lea 0x290+0x30(%rsp),%r11 | |||
movaps %xmm6,-0x30(%r11) | |||
movaps %xmm7,-0x20(%r11) | |||
movaps %xmm8,-0x10(%r11) | |||
movaps %xmm9,0x00(%r11) | |||
movaps %xmm10,0x10(%r11) | |||
movaps %xmm11,0x20(%r11) | |||
movaps %xmm12,0x30(%r11) | |||
movaps %xmm13,0x40(%r11) | |||
movaps %xmm14,0x50(%r11) | |||
movaps %xmm15,0x60(%r11) | |||
movaps %xmm6,-0xa8(%r9) | |||
movaps %xmm7,-0x98(%r9) | |||
movaps %xmm8,-0x88(%r9) | |||
movaps %xmm9,-0x78(%r9) | |||
movaps %xmm10,-0x68(%r9) | |||
movaps %xmm11,-0x58(%r9) | |||
movaps %xmm12,-0x48(%r9) | |||
movaps %xmm13,-0x38(%r9) | |||
movaps %xmm14,-0x28(%r9) | |||
movaps %xmm15,-0x18(%r9) | |||
.L8x_body: | |||
___ | |||
$code.=<<___; | |||
vzeroupper | |||
mov %r10,0x280(%rsp) | |||
################ stack layout | |||
# +0x00 SIMD equivalent of @x[8-12] | |||
@@ -1293,7 +1291,7 @@ $code.=<<___; | |||
# ... | |||
# +0x200 SIMD counters (with nonce smashed by lanes) | |||
# ... | |||
# +0x280 saved %rsp | |||
# +0x280 | |||
vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] | |||
vbroadcasti128 ($key),$xb3 # key[1] | |||
@@ -1759,20 +1757,20 @@ $code.=<<___; | |||
vzeroall | |||
___ | |||
$code.=<<___ if ($win64); | |||
lea 0x290+0x30(%rsp),%r11 | |||
movaps -0x30(%r11),%xmm6 | |||
movaps -0x20(%r11),%xmm7 | |||
movaps -0x10(%r11),%xmm8 | |||
movaps 0x00(%r11),%xmm9 | |||
movaps 0x10(%r11),%xmm10 | |||
movaps 0x20(%r11),%xmm11 | |||
movaps 0x30(%r11),%xmm12 | |||
movaps 0x40(%r11),%xmm13 | |||
movaps 0x50(%r11),%xmm14 | |||
movaps 0x60(%r11),%xmm15 | |||
movaps -0xa8(%r9),%xmm6 | |||
movaps -0x98(%r9),%xmm7 | |||
movaps -0x88(%r9),%xmm8 | |||
movaps -0x78(%r9),%xmm9 | |||
movaps -0x68(%r9),%xmm10 | |||
movaps -0x58(%r9),%xmm11 | |||
movaps -0x48(%r9),%xmm12 | |||
movaps -0x38(%r9),%xmm13 | |||
movaps -0x28(%r9),%xmm14 | |||
movaps -0x18(%r9),%xmm15 | |||
___ | |||
$code.=<<___; | |||
mov 0x280(%rsp),%rsp | |||
lea (%r9),%rsp | |||
.L8x_epilogue: | |||
ret | |||
.size ChaCha20_8x,.-ChaCha20_8x | |||
___ | |||
@@ -1804,28 +1802,23 @@ sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round | |||
&vprold ($b,$b,7); | |||
} | |||
my $xframe = $win64 ? 32+32+8 : 24; | |||
my $xframe = $win64 ? 32+8 : 8; | |||
$code.=<<___; | |||
.type ChaCha20_avx512,\@function,5 | |||
.align 32 | |||
ChaCha20_avx512: | |||
.LChaCha20_avx512: | |||
mov %rsp,%r9 # frame pointer | |||
cmp \$512,$len | |||
ja .LChaCha20_16x | |||
push %rbx # just to share SEH handler, no pops | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
sub \$64+$xframe,%rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,64+32(%rsp) | |||
movaps %xmm7,64+48(%rsp) | |||
movaps %xmm6,-0x28(%r9) | |||
movaps %xmm7,-0x18(%r9) | |||
.Lavx512_body: | |||
___ | |||
$code.=<<___; | |||
vbroadcasti32x4 .Lsigma(%rip),$a | |||
@@ -1991,11 +1984,12 @@ $code.=<<___; | |||
vzeroall | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps 64+32(%rsp),%xmm6 | |||
movaps 64+48(%rsp),%xmm7 | |||
movaps -0x28(%r9),%xmm6 | |||
movaps -0x18(%r9),%xmm7 | |||
___ | |||
$code.=<<___; | |||
add \$64+$xframe+48,%rsp | |||
lea (%r9),%rsp | |||
.Lavx512_epilogue: | |||
ret | |||
.size ChaCha20_avx512,.-ChaCha20_avx512 | |||
___ | |||
@@ -2072,29 +2066,29 @@ my @x=map("\"$_\"",@xx); | |||
); | |||
} | |||
my $xframe = $win64 ? 0xb0 : 8; | |||
my $xframe = $win64 ? 0xa8 : 8; | |||
$code.=<<___; | |||
.type ChaCha20_16x,\@function,5 | |||
.align 32 | |||
ChaCha20_16x: | |||
.LChaCha20_16x: | |||
mov %rsp,%r11 | |||
mov %rsp,%r9 # frame register | |||
sub \$64+$xframe,%rsp | |||
and \$-64,%rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
lea 0x290+0x30(%rsp),%r11 | |||
movaps %xmm6,-0x30(%r11) | |||
movaps %xmm7,-0x20(%r11) | |||
movaps %xmm8,-0x10(%r11) | |||
movaps %xmm9,0x00(%r11) | |||
movaps %xmm10,0x10(%r11) | |||
movaps %xmm11,0x20(%r11) | |||
movaps %xmm12,0x30(%r11) | |||
movaps %xmm13,0x40(%r11) | |||
movaps %xmm14,0x50(%r11) | |||
movaps %xmm15,0x60(%r11) | |||
movaps %xmm6,-0xa8(%r9) | |||
movaps %xmm7,-0x98(%r9) | |||
movaps %xmm8,-0x88(%r9) | |||
movaps %xmm9,-0x78(%r9) | |||
movaps %xmm10,-0x68(%r9) | |||
movaps %xmm11,-0x58(%r9) | |||
movaps %xmm12,-0x48(%r9) | |||
movaps %xmm13,-0x38(%r9) | |||
movaps %xmm14,-0x28(%r9) | |||
movaps %xmm15,-0x18(%r9) | |||
.L16x_body: | |||
___ | |||
$code.=<<___; | |||
vzeroupper | |||
@@ -2484,25 +2478,264 @@ $code.=<<___; | |||
vzeroall | |||
___ | |||
$code.=<<___ if ($win64); | |||
lea 0x290+0x30(%rsp),%r11 | |||
movaps -0x30(%r11),%xmm6 | |||
movaps -0x20(%r11),%xmm7 | |||
movaps -0x10(%r11),%xmm8 | |||
movaps 0x00(%r11),%xmm9 | |||
movaps 0x10(%r11),%xmm10 | |||
movaps 0x20(%r11),%xmm11 | |||
movaps 0x30(%r11),%xmm12 | |||
movaps 0x40(%r11),%xmm13 | |||
movaps 0x50(%r11),%xmm14 | |||
movaps 0x60(%r11),%xmm15 | |||
movaps -0xa8(%r9),%xmm6 | |||
movaps -0x98(%r9),%xmm7 | |||
movaps -0x88(%r9),%xmm8 | |||
movaps -0x78(%r9),%xmm9 | |||
movaps -0x68(%r9),%xmm10 | |||
movaps -0x58(%r9),%xmm11 | |||
movaps -0x48(%r9),%xmm12 | |||
movaps -0x38(%r9),%xmm13 | |||
movaps -0x28(%r9),%xmm14 | |||
movaps -0x18(%r9),%xmm15 | |||
___ | |||
$code.=<<___; | |||
mov %r11,%rsp | |||
lea (%r9),%rsp | |||
.L16x_epilogue: | |||
ret | |||
.size ChaCha20_16x,.-ChaCha20_16x | |||
___ | |||
} | |||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |||
# CONTEXT *context,DISPATCHER_CONTEXT *disp) | |||
if ($win64) { | |||
$rec="%rcx"; | |||
$frame="%rdx"; | |||
$context="%r8"; | |||
$disp="%r9"; | |||
$code.=<<___; | |||
.extern __imp_RtlVirtualUnwind | |||
# se_handler: Win64 language-specific exception handler referenced by
# .LSEH_info_ChaCha20_ctr32.  Arguments follow the prototype quoted above
# (rec, frame, context, disp).  If the faulting Rip lies inside
# [.Lctr32_body, .Lno_data) it recovers rbx, rbp and r12-r15 from the
# ChaCha20_ctr32 stack frame into the CONTEXT record; in every case it then
# falls into .Lcommon_seh_tail, which patches Rsp/Rsi/Rdi, copies the CONTEXT
# to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.  .Lcommon_seh_tail is also the exit path used by
# ssse3_handler and full_handler, which push the same register set.
.type se_handler,\@abi-omnipotent | |||
.align 16 | |||
se_handler: | |||
push %rsi | |||
push %rdi | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
pushfq | |||
# 64 bytes: outgoing argument area for the RtlVirtualUnwind call below
# (stack arguments are stored at 32..56 of it).
sub \$64,%rsp | |||
mov 120($context),%rax # pull context->Rax | |||
mov 248($context),%rbx # pull context->Rip | |||
mov 8($disp),%rsi # disp->ImageBase | |||
mov 56($disp),%r11 # disp->HandlerData | |||
lea .Lctr32_body(%rip),%r10 | |||
cmp %r10,%rbx # context->Rip<.Lctr32_body | |||
jb .Lcommon_seh_tail | |||
mov 152($context),%rax # pull context->Rsp | |||
lea .Lno_data(%rip),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=.Lno_data | |||
jae .Lcommon_seh_tail | |||
# Step over the 64+24 byte local frame and the six pushed registers (48
# bytes); the same 64+24+48 offset is used by the ChaCha20_ctr32 epilogue.
lea 64+24+48(%rax),%rax | |||
mov -8(%rax),%rbx | |||
mov -16(%rax),%rbp | |||
mov -24(%rax),%r12 | |||
mov -32(%rax),%r13 | |||
mov -40(%rax),%r14 | |||
mov -48(%rax),%r15 | |||
mov %rbx,144($context) # restore context->Rbx | |||
mov %rbp,160($context) # restore context->Rbp | |||
mov %r12,216($context) # restore context->R12 | |||
mov %r13,224($context) # restore context->R13 | |||
mov %r14,232($context) # restore context->R14 | |||
mov %r15,240($context) # restore context->R15 | |||
.Lcommon_seh_tail: | |||
# Here rax is the value to report as the interrupted function's Rsp.
# NOTE(review): 8(rax)/16(rax) are presumably the caller home slots where
# the Win64 entry stub saved rdi/rsi; that stub is not visible here - confirm.
mov 8(%rax),%rdi | |||
mov 16(%rax),%rsi | |||
mov %rax,152($context) # restore context->Rsp | |||
mov %rsi,168($context) # restore context->Rsi | |||
mov %rdi,176($context) # restore context->Rdi | |||
# Copy the patched CONTEXT into disp->ContextRecord; ecx is a qword count
# for the rep movsq encoded in the .long below.
mov 40($disp),%rdi # disp->ContextRecord | |||
mov $context,%rsi # context | |||
mov \$154,%ecx # sizeof(CONTEXT) | |||
.long 0xa548f3fc # cld; rep movsq | |||
# Marshal the eight RtlVirtualUnwind arguments: four in registers, four in
# the stack area reserved above.
mov $disp,%rsi | |||
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |||
mov 8(%rsi),%rdx # arg2, disp->ImageBase | |||
mov 0(%rsi),%r8 # arg3, disp->ControlPc | |||
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |||
mov 40(%rsi),%r10 # disp->ContextRecord | |||
lea 56(%rsi),%r11 # &disp->HandlerData | |||
lea 24(%rsi),%r12 # &disp->EstablisherFrame | |||
mov %r10,32(%rsp) # arg5 | |||
mov %r11,40(%rsp) # arg6 | |||
mov %r12,48(%rsp) # arg7 | |||
mov %rcx,56(%rsp) # arg8, (NULL) | |||
call *__imp_RtlVirtualUnwind(%rip) | |||
mov \$1,%eax # ExceptionContinueSearch | |||
add \$64,%rsp | |||
popfq | |||
pop %r15 | |||
pop %r14 | |||
pop %r13 | |||
pop %r12 | |||
pop %rbp | |||
pop %rbx | |||
pop %rdi | |||
pop %rsi | |||
ret | |||
.size se_handler,.-se_handler | |||
# ssse3_handler: Win64 exception handler for the routines whose unwind info
# lists it (ChaCha20_ssse3 and ChaCha20_avx512 in the .xdata table).
# HandlerData[0] and HandlerData[1] are the RVAs of the body and epilogue
# labels.  When Rip is inside [body, epilogue) the handler takes the frame
# pointer from context->R9 and copies the two saved XMM registers
# (xmm6, xmm7 at -0x28 from the frame pointer) back into the CONTEXT record.
# It pushes the same registers as se_handler and exits through
# .Lcommon_seh_tail, which lives inside se_handler and performs the pops.
.type ssse3_handler,\@abi-omnipotent | |||
.align 16 | |||
ssse3_handler: | |||
push %rsi | |||
push %rdi | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
pushfq | |||
sub \$64,%rsp | |||
mov 120($context),%rax # pull context->Rax | |||
mov 248($context),%rbx # pull context->Rip | |||
mov 8($disp),%rsi # disp->ImageBase | |||
mov 56($disp),%r11 # disp->HandlerData | |||
mov 0(%r11),%r10d # HandlerData[0] | |||
lea (%rsi,%r10),%r10 # prologue label | |||
cmp %r10,%rbx # context->Rip<prologue label | |||
jb .Lcommon_seh_tail | |||
# Past the body label: r9 holds the stack pointer captured at function
# entry, so it is what the common tail must see in rax.
mov 192($context),%rax # pull context->R9 | |||
mov 4(%r11),%r10d # HandlerData[1] | |||
lea (%rsi,%r10),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=epilogue label | |||
jae .Lcommon_seh_tail | |||
# Copy 4 qwords (two 16-byte XMM slots) from the save area to context.Xmm6.
lea -0x28(%rax),%rsi | |||
lea 512($context),%rdi # &context.Xmm6 | |||
mov \$4,%ecx | |||
.long 0xa548f3fc # cld; rep movsq | |||
jmp .Lcommon_seh_tail | |||
.size ssse3_handler,.-ssse3_handler | |||
# full_handler: Win64 exception handler for the routines that save all ten
# non-volatile XMM registers below the frame pointer kept in r9
# (ChaCha20_4x, ChaCha20_8x and ChaCha20_16x in the .xdata table).
# HandlerData[0] and HandlerData[1] are the RVAs of the body and epilogue
# labels.  When Rip is inside [body, epilogue) it copies xmm6-xmm15 from
# -0xa8 relative to context->R9 into the CONTEXT record, then exits through
# .Lcommon_seh_tail inside se_handler, which also pops the registers pushed
# here.
.type full_handler,\@abi-omnipotent | |||
.align 16 | |||
full_handler: | |||
push %rsi | |||
push %rdi | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
pushfq | |||
sub \$64,%rsp | |||
mov 120($context),%rax # pull context->Rax | |||
mov 248($context),%rbx # pull context->Rip | |||
mov 8($disp),%rsi # disp->ImageBase | |||
mov 56($disp),%r11 # disp->HandlerData | |||
mov 0(%r11),%r10d # HandlerData[0] | |||
lea (%rsi,%r10),%r10 # prologue label | |||
cmp %r10,%rbx # context->Rip<prologue label | |||
jb .Lcommon_seh_tail | |||
# Past the body label: r9 holds the stack pointer captured at function
# entry, so it is what the common tail must see in rax.
mov 192($context),%rax # pull context->R9 | |||
mov 4(%r11),%r10d # HandlerData[1] | |||
lea (%rsi,%r10),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=epilogue label | |||
jae .Lcommon_seh_tail | |||
# Copy 20 qwords (ten 16-byte XMM slots) from the save area to context.Xmm6.
lea -0xa8(%rax),%rsi | |||
lea 512($context),%rdi # &context.Xmm6 | |||
mov \$20,%ecx | |||
.long 0xa548f3fc # cld; rep movsq | |||
jmp .Lcommon_seh_tail | |||
.size full_handler,.-full_handler | |||
# .pdata: one record per function, made of three RVAs - function begin,
# function end, and the matching unwind-info entry in .xdata.
.section .pdata | |||
.align 4 | |||
.rva .LSEH_begin_ChaCha20_ctr32 | |||
.rva .LSEH_end_ChaCha20_ctr32 | |||
.rva .LSEH_info_ChaCha20_ctr32 | |||
.rva .LSEH_begin_ChaCha20_ssse3 | |||
.rva .LSEH_end_ChaCha20_ssse3 | |||
.rva .LSEH_info_ChaCha20_ssse3 | |||
.rva .LSEH_begin_ChaCha20_4x | |||
.rva .LSEH_end_ChaCha20_4x | |||
.rva .LSEH_info_ChaCha20_4x | |||
___ | |||
# The 8x record is emitted only when avx>1, the avx512/16x records only
# when avx>2, matching the conditions on the .xdata entries below.
$code.=<<___ if ($avx>1); | |||
.rva .LSEH_begin_ChaCha20_8x | |||
.rva .LSEH_end_ChaCha20_8x | |||
.rva .LSEH_info_ChaCha20_8x | |||
___ | |||
$code.=<<___ if ($avx>2); | |||
.rva .LSEH_begin_ChaCha20_avx512 | |||
.rva .LSEH_end_ChaCha20_avx512 | |||
.rva .LSEH_info_ChaCha20_avx512 | |||
.rva .LSEH_begin_ChaCha20_16x | |||
.rva .LSEH_end_ChaCha20_16x | |||
.rva .LSEH_info_ChaCha20_16x | |||
___ | |||
$code.=<<___; | |||
# .xdata: each entry is a 4-byte header, the handler RVA, then optional
# HandlerData[] words.  NOTE(review): the 9,0,0,0 header presumably encodes
# UNWIND_INFO version 1 with the exception-handler flag and no unwind codes -
# confirm against the Windows x64 unwind documentation.
# se_handler takes no HandlerData (its labels are hard-coded); ssse3_handler
# and full_handler read HandlerData[0] = body label, HandlerData[1] =
# epilogue label.
.section .xdata | |||
.align 8 | |||
.LSEH_info_ChaCha20_ctr32: | |||
.byte 9,0,0,0 | |||
.rva se_handler | |||
.LSEH_info_ChaCha20_ssse3: | |||
.byte 9,0,0,0 | |||
.rva ssse3_handler | |||
.rva .Lssse3_body,.Lssse3_epilogue | |||
.LSEH_info_ChaCha20_4x: | |||
.byte 9,0,0,0 | |||
.rva full_handler | |||
.rva .L4x_body,.L4x_epilogue | |||
___ | |||
$code.=<<___ if ($avx>1); | |||
.LSEH_info_ChaCha20_8x: | |||
.byte 9,0,0,0 | |||
.rva full_handler | |||
.rva .L8x_body,.L8x_epilogue # HandlerData[] | |||
___ | |||
$code.=<<___ if ($avx>2); | |||
.LSEH_info_ChaCha20_avx512: | |||
.byte 9,0,0,0 | |||
.rva ssse3_handler | |||
.rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] | |||
.LSEH_info_ChaCha20_16x: | |||
.byte 9,0,0,0 | |||
.rva full_handler | |||
.rva .L16x_body,.L16x_epilogue # HandlerData[] | |||
___ | |||
} | |||
foreach (split("\n",$code)) { | |||
s/\`([^\`]*)\`/eval $1/ge; | |||
@@ -219,8 +219,12 @@ $code=<<___; | |||
.align 16 | |||
gcm_gmult_4bit: | |||
push %rbx | |||
push %rbp # %rbp and %r12 are pushed exclusively in | |||
push %rbp # %rbp and others are pushed exclusively in | |||
push %r12 # order to reuse Win64 exception handler... | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
sub \$280,%rsp | |||
.Lgmult_prologue: | |||
movzb 15($Xi),$Zlo | |||
@@ -231,8 +235,9 @@ $code.=<<___; | |||
mov $Zlo,8($Xi) | |||
mov $Zhi,($Xi) | |||
mov 16(%rsp),%rbx | |||
lea 24(%rsp),%rsp | |||
lea 280+48(%rsp),%rsi | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lgmult_epilogue: | |||
ret | |||
.size gcm_gmult_4bit,.-gcm_gmult_4bit | |||
@@ -382,14 +387,14 @@ $code.=<<___; | |||
mov $Zlo,8($Xi) | |||
mov $Zhi,($Xi) | |||
lea 280(%rsp),%rsi | |||
mov 0(%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
lea 280+48(%rsp),%rsi | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea 0(%rsi),%rsp | |||
.Lghash_epilogue: | |||
ret | |||
.size gcm_ghash_4bit,.-gcm_ghash_4bit | |||
@@ -1630,14 +1635,20 @@ se_handler: | |||
cmp %r10,%rbx # context->Rip>=epilogue label | |||
jae .Lin_prologue | |||
lea 24(%rax),%rax # adjust "rsp" | |||
lea 48+280(%rax),%rax # adjust "rsp" | |||
mov -8(%rax),%rbx | |||
mov -16(%rax),%rbp | |||
mov -24(%rax),%r12 | |||
mov -32(%rax),%r13 | |||
mov -40(%rax),%r14 | |||
mov -48(%rax),%r15 | |||
mov %rbx,144($context) # restore context->Rbx | |||
mov %rbp,160($context) # restore context->Rbp | |||
mov %r12,216($context) # restore context->R12 | |||
mov %r13,224($context) # restore context->R13 | |||
mov %r14,232($context) # restore context->R14 | |||
mov %r15,240($context) # restore context->R15 | |||
.Lin_prologue: | |||
mov 8(%rax),%rdi | |||
@@ -447,7 +447,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization | |||
my @T=("%esi","%edi"); | |||
my $j=0; | |||
my $rx=0; | |||
my $K_XX_XX="%r11"; | |||
my $K_XX_XX="%r14"; | |||
my $fp="%r11"; | |||
my $_rol=sub { &rol(@_) }; | |||
my $_ror=sub { &ror(@_) }; | |||
@@ -468,7 +469,7 @@ $code.=<<___; | |||
.align 16 | |||
sha1_block_data_order_ssse3: | |||
_ssse3_shortcut: | |||
mov %rsp,%rax | |||
mov %rsp,$fp # frame pointer | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
@@ -477,16 +478,15 @@ _ssse3_shortcut: | |||
lea `-64-($win64?6*16:0)`(%rsp),%rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,-40-6*16(%rax) | |||
movaps %xmm7,-40-5*16(%rax) | |||
movaps %xmm8,-40-4*16(%rax) | |||
movaps %xmm9,-40-3*16(%rax) | |||
movaps %xmm10,-40-2*16(%rax) | |||
movaps %xmm11,-40-1*16(%rax) | |||
movaps %xmm6,-40-6*16($fp) | |||
movaps %xmm7,-40-5*16($fp) | |||
movaps %xmm8,-40-4*16($fp) | |||
movaps %xmm9,-40-3*16($fp) | |||
movaps %xmm10,-40-2*16($fp) | |||
movaps %xmm11,-40-1*16($fp) | |||
.Lprologue_ssse3: | |||
___ | |||
$code.=<<___; | |||
mov %rax,%r14 # original %rsp | |||
and \$-64,%rsp | |||
mov %rdi,$ctx # reassigned argument | |||
mov %rsi,$inp # reassigned argument | |||
@@ -893,21 +893,20 @@ $code.=<<___; | |||
mov $E,16($ctx) | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps -40-6*16(%r14),%xmm6 | |||
movaps -40-5*16(%r14),%xmm7 | |||
movaps -40-4*16(%r14),%xmm8 | |||
movaps -40-3*16(%r14),%xmm9 | |||
movaps -40-2*16(%r14),%xmm10 | |||
movaps -40-1*16(%r14),%xmm11 | |||
movaps -40-6*16($fp),%xmm6 | |||
movaps -40-5*16($fp),%xmm7 | |||
movaps -40-4*16($fp),%xmm8 | |||
movaps -40-3*16($fp),%xmm9 | |||
movaps -40-2*16($fp),%xmm10 | |||
movaps -40-1*16($fp),%xmm11 | |||
___ | |||
$code.=<<___; | |||
lea (%r14),%rsi | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
mov -40($fp),%r14 | |||
mov -32($fp),%r13 | |||
mov -24($fp),%r12 | |||
mov -16($fp),%rbp | |||
mov -8($fp),%rbx | |||
lea ($fp),%rsp | |||
.Lepilogue_ssse3: | |||
ret | |||
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 | |||
@@ -930,7 +929,7 @@ $code.=<<___; | |||
.align 16 | |||
sha1_block_data_order_avx: | |||
_avx_shortcut: | |||
mov %rsp,%rax | |||
mov %rsp,$fp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
@@ -940,16 +939,15 @@ _avx_shortcut: | |||
vzeroupper | |||
___ | |||
$code.=<<___ if ($win64); | |||
vmovaps %xmm6,-40-6*16(%rax) | |||
vmovaps %xmm7,-40-5*16(%rax) | |||
vmovaps %xmm8,-40-4*16(%rax) | |||
vmovaps %xmm9,-40-3*16(%rax) | |||
vmovaps %xmm10,-40-2*16(%rax) | |||
vmovaps %xmm11,-40-1*16(%rax) | |||
vmovaps %xmm6,-40-6*16($fp) | |||
vmovaps %xmm7,-40-5*16($fp) | |||
vmovaps %xmm8,-40-4*16($fp) | |||
vmovaps %xmm9,-40-3*16($fp) | |||
vmovaps %xmm10,-40-2*16($fp) | |||
vmovaps %xmm11,-40-1*16($fp) | |||
.Lprologue_avx: | |||
___ | |||
$code.=<<___; | |||
mov %rax,%r14 # original %rsp | |||
and \$-64,%rsp | |||
mov %rdi,$ctx # reassigned argument | |||
mov %rsi,$inp # reassigned argument | |||
@@ -1257,21 +1255,20 @@ $code.=<<___; | |||
mov $E,16($ctx) | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps -40-6*16(%r14),%xmm6 | |||
movaps -40-5*16(%r14),%xmm7 | |||
movaps -40-4*16(%r14),%xmm8 | |||
movaps -40-3*16(%r14),%xmm9 | |||
movaps -40-2*16(%r14),%xmm10 | |||
movaps -40-1*16(%r14),%xmm11 | |||
movaps -40-6*16($fp),%xmm6 | |||
movaps -40-5*16($fp),%xmm7 | |||
movaps -40-4*16($fp),%xmm8 | |||
movaps -40-3*16($fp),%xmm9 | |||
movaps -40-2*16($fp),%xmm10 | |||
movaps -40-1*16($fp),%xmm11 | |||
___ | |||
$code.=<<___; | |||
lea (%r14),%rsi | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
mov -40($fp),%r14 | |||
mov -32($fp),%r13 | |||
mov -24($fp),%r12 | |||
mov -16($fp),%rbp | |||
mov -8($fp),%rbx | |||
lea ($fp),%rsp | |||
.Lepilogue_avx: | |||
ret | |||
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx | |||
@@ -1297,7 +1294,7 @@ $code.=<<___; | |||
.align 16 | |||
sha1_block_data_order_avx2: | |||
_avx2_shortcut: | |||
mov %rsp,%rax | |||
mov %rsp,$fp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
@@ -1307,16 +1304,15 @@ _avx2_shortcut: | |||
___ | |||
$code.=<<___ if ($win64); | |||
lea -6*16(%rsp),%rsp | |||
vmovaps %xmm6,-40-6*16(%rax) | |||
vmovaps %xmm7,-40-5*16(%rax) | |||
vmovaps %xmm8,-40-4*16(%rax) | |||
vmovaps %xmm9,-40-3*16(%rax) | |||
vmovaps %xmm10,-40-2*16(%rax) | |||
vmovaps %xmm11,-40-1*16(%rax) | |||
vmovaps %xmm6,-40-6*16($fp) | |||
vmovaps %xmm7,-40-5*16($fp) | |||
vmovaps %xmm8,-40-4*16($fp) | |||
vmovaps %xmm9,-40-3*16($fp) | |||
vmovaps %xmm10,-40-2*16($fp) | |||
vmovaps %xmm11,-40-1*16($fp) | |||
.Lprologue_avx2: | |||
___ | |||
$code.=<<___; | |||
mov %rax,%r14 # original %rsp | |||
mov %rdi,$ctx # reassigned argument | |||
mov %rsi,$inp # reassigned argument | |||
mov %rdx,$num # reassigned argument | |||
@@ -1736,21 +1732,20 @@ $code.=<<___; | |||
vzeroupper | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps -40-6*16(%r14),%xmm6 | |||
movaps -40-5*16(%r14),%xmm7 | |||
movaps -40-4*16(%r14),%xmm8 | |||
movaps -40-3*16(%r14),%xmm9 | |||
movaps -40-2*16(%r14),%xmm10 | |||
movaps -40-1*16(%r14),%xmm11 | |||
movaps -40-6*16($fp),%xmm6 | |||
movaps -40-5*16($fp),%xmm7 | |||
movaps -40-4*16($fp),%xmm8 | |||
movaps -40-3*16($fp),%xmm9 | |||
movaps -40-2*16($fp),%xmm10 | |||
movaps -40-1*16($fp),%xmm11 | |||
___ | |||
$code.=<<___; | |||
lea (%r14),%rsi | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
mov -40($fp),%r14 | |||
mov -32($fp),%r13 | |||
mov -24($fp),%r12 | |||
mov -16($fp),%rbp | |||
mov -8($fp),%rbx | |||
lea ($fp),%rsp | |||
.Lepilogue_avx2: | |||
ret | |||
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 | |||
@@ -1893,15 +1888,13 @@ ssse3_handler: | |||
cmp %r10,%rbx # context->Rip<prologue label | |||
jb .Lcommon_seh_tail | |||
mov 152($context),%rax # pull context->Rsp | |||
mov 208($context),%rax # pull context->R11 | |||
mov 4(%r11),%r10d # HandlerData[1] | |||
lea (%rsi,%r10),%r10 # epilogue label | |||
cmp %r10,%rbx # context->Rip>=epilogue label | |||
jae .Lcommon_seh_tail | |||
mov 232($context),%rax # pull context->R14 | |||
lea -40-6*16(%rax),%rsi | |||
lea 512($context),%rdi # &context.Xmm6 | |||
mov \$12,%ecx | |||
@@ -286,13 +286,13 @@ $code.=<<___ if ($SZ==4); | |||
jnz .Lssse3_shortcut | |||
___ | |||
$code.=<<___; | |||
mov %rsp,%rax # copy %rsp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
mov %rsp,%r11 # copy %rsp | |||
shl \$4,%rdx # num*16 | |||
sub \$$framesz,%rsp | |||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |||
@@ -300,7 +300,7 @@ $code.=<<___; | |||
mov $ctx,$_ctx # save ctx, 1st arg | |||
mov $inp,$_inp # save inp, 2nd arg | |||
mov %rdx,$_end # save end pointer, "3rd" arg | |||
mov %r11,$_rsp # save copy of %rsp | |||
mov %rax,$_rsp # save copy of %rsp | |||
.Lprologue: | |||
mov $SZ*0($ctx),$A | |||
@@ -367,13 +367,13 @@ $code.=<<___; | |||
jb .Lloop | |||
mov $_rsp,%rsi | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lepilogue: | |||
ret | |||
.size $func,.-$func | |||
@@ -746,13 +746,13 @@ $code.=<<___; | |||
.align 64 | |||
${func}_ssse3: | |||
.Lssse3_shortcut: | |||
mov %rsp,%rax # copy %rsp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
mov %rsp,%r11 # copy %rsp | |||
shl \$4,%rdx # num*16 | |||
sub \$`$framesz+$win64*16*4`,%rsp | |||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |||
@@ -760,7 +760,7 @@ ${func}_ssse3: | |||
mov $ctx,$_ctx # save ctx, 1st arg | |||
mov $inp,$_inp # save inp, 2nd arg | |||
mov %rdx,$_end # save end pointer, "3rd" arg | |||
mov %r11,$_rsp # save copy of %rsp | |||
mov %rax,$_rsp # save copy of %rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,16*$SZ+32(%rsp) | |||
@@ -1067,13 +1067,13 @@ $code.=<<___ if ($win64); | |||
movaps 16*$SZ+80(%rsp),%xmm9 | |||
___ | |||
$code.=<<___; | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lepilogue_ssse3: | |||
ret | |||
.size ${func}_ssse3,.-${func}_ssse3 | |||
@@ -1090,13 +1090,13 @@ $code.=<<___; | |||
.align 64 | |||
${func}_xop: | |||
.Lxop_shortcut: | |||
mov %rsp,%rax # copy %rsp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
mov %rsp,%r11 # copy %rsp | |||
shl \$4,%rdx # num*16 | |||
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp | |||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |||
@@ -1104,7 +1104,7 @@ ${func}_xop: | |||
mov $ctx,$_ctx # save ctx, 1st arg | |||
mov $inp,$_inp # save inp, 2nd arg | |||
mov %rdx,$_end # save end pointer, "3rd" arg | |||
mov %r11,$_rsp # save copy of %rsp | |||
mov %rax,$_rsp # save copy of %rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,16*$SZ+32(%rsp) | |||
@@ -1444,13 +1444,13 @@ $code.=<<___ if ($win64 && $SZ>4); | |||
movaps 16*$SZ+112(%rsp),%xmm11 | |||
___ | |||
$code.=<<___; | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lepilogue_xop: | |||
ret | |||
.size ${func}_xop,.-${func}_xop | |||
@@ -1466,13 +1466,13 @@ $code.=<<___; | |||
.align 64 | |||
${func}_avx: | |||
.Lavx_shortcut: | |||
mov %rsp,%rax # copy %rsp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
mov %rsp,%r11 # copy %rsp | |||
shl \$4,%rdx # num*16 | |||
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp | |||
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ | |||
@@ -1480,7 +1480,7 @@ ${func}_avx: | |||
mov $ctx,$_ctx # save ctx, 1st arg | |||
mov $inp,$_inp # save inp, 2nd arg | |||
mov %rdx,$_end # save end pointer, "3rd" arg | |||
mov %r11,$_rsp # save copy of %rsp | |||
mov %rax,$_rsp # save copy of %rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,16*$SZ+32(%rsp) | |||
@@ -1752,13 +1752,13 @@ $code.=<<___ if ($win64 && $SZ>4); | |||
movaps 16*$SZ+112(%rsp),%xmm11 | |||
___ | |||
$code.=<<___; | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lepilogue_avx: | |||
ret | |||
.size ${func}_avx,.-${func}_avx | |||
@@ -1817,13 +1817,13 @@ $code.=<<___; | |||
.align 64 | |||
${func}_avx2: | |||
.Lavx2_shortcut: | |||
mov %rsp,%rax # copy %rsp | |||
push %rbx | |||
push %rbp | |||
push %r12 | |||
push %r13 | |||
push %r14 | |||
push %r15 | |||
mov %rsp,%r11 # copy %rsp | |||
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp | |||
shl \$4,%rdx # num*16 | |||
and \$-256*$SZ,%rsp # align stack frame | |||
@@ -1832,7 +1832,7 @@ ${func}_avx2: | |||
mov $ctx,$_ctx # save ctx, 1st arg | |||
mov $inp,$_inp # save inp, 2nd arg | |||
mov %rdx,$_end # save end pointer, "3rd" arg | |||
mov %r11,$_rsp # save copy of %rsp | |||
mov %rax,$_rsp # save copy of %rsp | |||
___ | |||
$code.=<<___ if ($win64); | |||
movaps %xmm6,16*$SZ+32(%rsp) | |||
@@ -2126,13 +2126,13 @@ $code.=<<___ if ($win64 && $SZ>4); | |||
movaps 16*$SZ+112(%rsp),%xmm11 | |||
___ | |||
$code.=<<___; | |||
mov (%rsi),%r15 | |||
mov 8(%rsi),%r14 | |||
mov 16(%rsi),%r13 | |||
mov 24(%rsi),%r12 | |||
mov 32(%rsi),%rbp | |||
mov 40(%rsi),%rbx | |||
lea 48(%rsi),%rsp | |||
mov -48(%rsi),%r15 | |||
mov -40(%rsi),%r14 | |||
mov -32(%rsi),%r13 | |||
mov -24(%rsi),%r12 | |||
mov -16(%rsi),%rbp | |||
mov -8(%rsi),%rbx | |||
lea (%rsi),%rsp | |||
.Lepilogue_avx2: | |||
ret | |||
.size ${func}_avx2,.-${func}_avx2 | |||
@@ -2194,7 +2194,6 @@ ___ | |||
$code.=<<___; | |||
mov %rax,%rsi # put aside Rsp | |||
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp | |||
lea 48(%rax),%rax | |||
mov -8(%rax),%rbx | |||
mov -16(%rax),%rbp | |||