boringssl/crypto/chacha/asm/chacha-x86.pl
David Benjamin fdd8e9c8c7 Switch perlasm calling convention.
Depending on architecture, perlasm scripts differed on which of the following invocations they supported (one or both):

  perl foo.pl flavor output.S
  perl foo.pl flavor > output.S

Upstream has now unified on the first form after making a number of
changes to their files (the second does not even work for their x86
files anymore). Sync those portions of our perlasm scripts with upstream
and update CMakeLists.txt and generate_build_files.py per the new
convention.

This imports various commits like this one:
184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a
diff, so I don't have the full list)

Confirmed that generate_build_files.py sees no change.

BUG=14

Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2
Reviewed-on: https://boringssl-review.googlesource.com/8518
Reviewed-by: Adam Langley <agl@google.com>
2016-06-27 21:59:26 +00:00

759 lines
21 KiB
Raku
Executable File

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#               1xIALU/gcc      4xSSSE3
# Pentium       17.5/+80%
# PIII          14.2/+60%
# P4            18.6/+84%
# Core2         9.56/+89%       4.83
# Westmere      9.50/+45%       3.35
# Sandy Bridge  10.5/+47%       3.20
# Haswell       8.15/+50%       2.83
# Silvermont    17.4/+36%       8.35
# Sledgehammer  10.2/+54%
# Bulldozer     13.4/+50%       4.38(*)
#
# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55;
#
# Modified from upstream OpenSSL to remove the XOP code.
# Script preamble.
#
# Invocation (see the register/flag handling below):
#   perl chacha-x86.pl flavor [flags...] output.S
# The last argument is the output file; the first is passed to asm_init as
# the assembler flavor.
#
# Find the directory this script lives in so that x86asm.pl can be loaded
# from either the same directory or ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# All generated assembly is printed to STDOUT, so STDOUT is redirected to the
# output file.  Use three-argument open (the file name is never interpreted
# as a mode) and fail loudly: an unchecked open would leave STDOUT closed and
# the script would "succeed" while producing no output at all.
$output=pop;
defined($output) or die "$0: missing output file argument\n";
open STDOUT,">",$output or die "$0: can't open $output: $!";
# The third argument is true when the last remaining command-line argument
# is the literal "386" (presumably restricting x86asm.pl to i386-only
# instructions -- confirm against x86asm.pl).
&asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
# $xmm enables the SSSE3 code path; it is switched on by passing
# -DOPENSSL_IA32_SSE2 among the arguments.  $ymm simply mirrors $xmm.
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
$ymm=$xmm;
# Register names used by the scalar code path.  The "_" variants are the
# alternate registers that QUARTERROUND swaps with the primary ones.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");
# Emit one scalar (integer register) ChaCha quarter-round.
#
# Arguments:
#   $ai,$bi,$ci,$di - indices (0..15) of the state words used as a, b, c, d;
#                     word N of the working state lives at 4*N(esp)
#   $i              - position of this call inside the unrolled double round
#                     (0..7, see the call sites)
#
# The file-level variables $a, $b/$b_, $c/$c_, $d/$d_ hold register names.
# While the arithmetic for the current quarter-round is emitted, stores of
# values left over from the previous call and loads of the words needed by
# the next call are interleaved with it, using the "_" registers.  At the end
# the primary and "_" names are swapped, so the next call finds its operands
# under $b, $c and $d.  The very first "add a,b" of each quarter-round is not
# emitted here: it is issued by the previous call (or before the loop).
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
# Default neighbours: the word one position later/earlier within the same
# group of four (index & ~3 selects the group, & 3 wraps inside it).
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
# a b c d
#
# 0 4 8 12 < even round
# 1 5 9 13
# 2 6 10 14
# 3 7 11 15
# 0 5 10 15 < odd round
# 1 6 11 12
# 2 7 8 13
# 3 4 9 14
# At the seams between the even and the odd round (first and last call of
# each) the neighbouring call belongs to the other index pattern in the table
# above, so the default +/-1 neighbours are shifted by a per-column amount
# ($j counts through the a, b, c, d columns).
if ($i==0) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==3) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
} elsif ($i==4) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==7) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
}
# d = rol(d ^ a, 16); c += d; b = rol(b ^ c, 12)
#&add ($a,$b); # see elsewhere
&xor ($d,$a);
# write back the previous call's c (only needed for the two middle columns)
&mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3);
&rol ($d,16);
# write back the previous call's b
&mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0);
&add ($c,$d);
# preload c, d and b for the next call into the alternate registers
&mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3);
&xor ($b,$c);
&mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn);
&rol ($b,12);
&mov ($b_,&DWP(4*$bn,"esp")) if ($i<7);
# after the last quarter-round the alternate b register receives the loop
# counter that the caller saved at 128(esp)
&mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter
# a += b; d = rol(d ^ a, 8); c += d; b = rol(b ^ c, 7)
&add ($a,$b);
&xor ($d,$a);
# a is finished: store it and fetch the next call's a into the same register
&mov (&DWP(4*$ai,"esp"),$a);
&rol ($d,8);
&mov ($a,&DWP(4*$an,"esp"));
&add ($c,$d);
# d is either stored to the stack, or (when the next call reuses the same
# word, $di==$dn) handed over directly in the alternate register
&mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn);
&mov ($d_,$d) if ($di==$dn);
&xor ($b,$c);
# first "add a,b" of the NEXT quarter-round, issued early
&add ($a,$b_) if ($i<7); # elsewhere
&rol ($b,7);
# swap primary and alternate register names for the next call
($b,$b_)=($b_,$b);
($c,$c_)=($c_,$c);
($d,$d_)=($d_,$d);
}
# Labels referenced from both generated functions (presumably static_label
# makes them file-scoped instead of per-function -- confirm in x86asm.pl).
&static_label("ssse3_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");
# ChaCha20_ctr32: scalar implementation and run-time dispatcher.
# Arguments, by wparam index as used below:
#   0 = output pointer, 1 = input pointer, 2 = length in bytes,
#   3 = key, 4 = counter and nonce.
&function_begin("ChaCha20_ctr32");
# nothing to do for a zero length
&xor ("eax","eax");
&cmp ("eax",&wparam(2)); # len==0?
&je (&label("no_data"));
if ($xmm) {
# Obtain the address of pic_point in eax (call/pop), use it to locate
# OPENSSL_ia32cap_P, and jump into ChaCha20_ssse3 when both tested bits are
# set.  Otherwise fall through to the scalar code at label "x86".
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("eax");
&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
&test (&DWP(0,"ebp"),1<<24); # test FXSR bit
&jz (&label("x86"));
&test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit
&jz (&label("x86"));
&jmp (&label("ssse3_shortcut"));
&set_label("x86");
}
&mov ("esi",&wparam(3)); # key
&mov ("edi",&wparam(4)); # counter and nonce
# Stack frame (stack_push presumably counts dwords; the offsets used below
# fit in 33 of them):
#   4*0   .. 4*15(esp)     working state for the rounds
#   64+4*4 .. 64+4*15(esp) copy of key, counter and nonce (input state)
#   128(esp)               saved round-loop counter
&stack_push(33);
&mov ("eax",&DWP(4*0,"esi")); # copy key
&mov ("ebx",&DWP(4*1,"esi"));
&mov ("ecx",&DWP(4*2,"esi"));
&mov ("edx",&DWP(4*3,"esi"));
&mov (&DWP(64+4*4,"esp"),"eax");
&mov (&DWP(64+4*5,"esp"),"ebx");
&mov (&DWP(64+4*6,"esp"),"ecx");
&mov (&DWP(64+4*7,"esp"),"edx");
&mov ("eax",&DWP(4*4,"esi"));
&mov ("ebx",&DWP(4*5,"esi"));
&mov ("ecx",&DWP(4*6,"esi"));
&mov ("edx",&DWP(4*7,"esi"));
&mov (&DWP(64+4*8,"esp"),"eax");
&mov (&DWP(64+4*9,"esp"),"ebx");
&mov (&DWP(64+4*10,"esp"),"ecx");
&mov (&DWP(64+4*11,"esp"),"edx");
&mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce
&mov ("ebx",&DWP(4*1,"edi"));
&mov ("ecx",&DWP(4*2,"edi"));
&mov ("edx",&DWP(4*3,"edi"));
# the counter is stored minus one because the block setup below increments
# it before every block, including the first
&sub ("eax",1);
&mov (&DWP(64+4*12,"esp"),"eax");
&mov (&DWP(64+4*13,"esp"),"ebx");
&mov (&DWP(64+4*14,"esp"),"ecx");
&mov (&DWP(64+4*15,"esp"),"edx");
&jmp (&label("entry"));
# Per-block loop.  On re-entry $b, $a and $c hold the advanced input pointer,
# the advanced output pointer and the remaining length; they are written
# back to the argument slots because all registers are needed for the rounds.
&set_label("outer_loop",16);
&mov (&wparam(1),$b); # save input
&mov (&wparam(0),$a); # save output
&mov (&wparam(2),$c); # save len
&set_label("entry");
# Build the working state: words 0..3 are the four constants (word 0 stays
# in $a), the rest is copied from the saved input state.
&mov ($a,0x61707865);
&mov (&DWP(4*1,"esp"),0x3320646e);
&mov (&DWP(4*2,"esp"),0x79622d32);
&mov (&DWP(4*3,"esp"),0x6b206574);
&mov ($b, &DWP(64+4*5,"esp")); # copy key material
&mov ($b_,&DWP(64+4*6,"esp"));
&mov ($c, &DWP(64+4*10,"esp"));
&mov ($c_,&DWP(64+4*11,"esp"));
&mov ($d, &DWP(64+4*13,"esp"));
&mov ($d_,&DWP(64+4*14,"esp"));
&mov (&DWP(4*5,"esp"),$b);
&mov (&DWP(4*6,"esp"),$b_);
&mov (&DWP(4*10,"esp"),$c);
&mov (&DWP(4*11,"esp"),$c_);
&mov (&DWP(4*13,"esp"),$d);
&mov (&DWP(4*14,"esp"),$d_);
# words 4, 8, 9 and 12 stay in registers ($b_, $c, $c_, $d) for the first
# quarter-rounds; words 7 and 15 go to the stack
&mov ($b, &DWP(64+4*7,"esp"));
&mov ($d_,&DWP(64+4*15,"esp"));
&mov ($d, &DWP(64+4*12,"esp"));
&mov ($b_,&DWP(64+4*4,"esp"));
&mov ($c, &DWP(64+4*8,"esp"));
&mov ($c_,&DWP(64+4*9,"esp"));
&add ($d,1); # counter value
&mov (&DWP(4*7,"esp"),$b);
&mov (&DWP(4*15,"esp"),$d_);
&mov (&DWP(64+4*12,"esp"),$d); # save counter value
# ten iterations of eight quarter-rounds each
&mov ($b,10); # loop counter
&jmp (&label("loop"));
&set_label("loop",16);
# The first "add a,b" of quarter-round 0 is issued here; then the loop
# counter is parked at 128(esp) so that $b can take word 4.
&add ($a,$b_); # elsewhere
&mov (&DWP(128,"esp"),$b); # save loop counter
&mov ($b,$b_);
&QUARTERROUND(0, 4, 8, 12, 0);
&QUARTERROUND(1, 5, 9, 13, 1);
&QUARTERROUND(2, 6,10, 14, 2);
&QUARTERROUND(3, 7,11, 15, 3);
&QUARTERROUND(0, 5,10, 15, 4);
&QUARTERROUND(1, 6,11, 12, 5);
&QUARTERROUND(2, 7, 8, 13, 6);
&QUARTERROUND(3, 4, 9, 14, 7);
# the last QUARTERROUND reloaded the loop counter into $b
&dec ($b);
&jnz (&label("loop"));
# Rounds done.  Words 0, 4, 8, 9, 12, 14 are in $a, $b_, $c, $c_, $d, $d_;
# the others are on the stack.  Add the input state to get the key stream.
&mov ($b,&wparam(2)); # load len
&add ($a,0x61707865); # accumulate key material
&add ($b_,&DWP(64+4*4,"esp"));
&add ($c, &DWP(64+4*8,"esp"));
&add ($c_,&DWP(64+4*9,"esp"));
# fewer than 64 bytes left: handle byte by byte
&cmp ($b,64);
&jb (&label("tail"));
# Full 64-byte block: xor key stream with input and write to output, six or
# five words at a time.
&mov ($b,&wparam(1)); # load input pointer
&add ($d, &DWP(64+4*12,"esp"));
&add ($d_,&DWP(64+4*14,"esp"));
&xor ($a, &DWP(4*0,$b)); # xor with input
&xor ($b_,&DWP(4*4,$b));
# word 0 is parked on the stack because $a is needed for the output pointer
&mov (&DWP(4*0,"esp"),$a);
&mov ($a,&wparam(0)); # load output pointer
&xor ($c, &DWP(4*8,$b));
&xor ($c_,&DWP(4*9,$b));
&xor ($d, &DWP(4*12,$b));
&xor ($d_,&DWP(4*14,$b));
&mov (&DWP(4*4,$a),$b_); # write output
&mov (&DWP(4*8,$a),$c);
&mov (&DWP(4*9,$a),$c_);
&mov (&DWP(4*12,$a),$d);
&mov (&DWP(4*14,$a),$d_);
# words 1, 2, 3, 5, 6
&mov ($b_,&DWP(4*1,"esp"));
&mov ($c, &DWP(4*2,"esp"));
&mov ($c_,&DWP(4*3,"esp"));
&mov ($d, &DWP(4*5,"esp"));
&mov ($d_,&DWP(4*6,"esp"));
&add ($b_,0x3320646e); # accumulate key material
&add ($c, 0x79622d32);
&add ($c_,0x6b206574);
&add ($d, &DWP(64+4*5,"esp"));
&add ($d_,&DWP(64+4*6,"esp"));
&xor ($b_,&DWP(4*1,$b));
&xor ($c, &DWP(4*2,$b));
&xor ($c_,&DWP(4*3,$b));
&xor ($d, &DWP(4*5,$b));
&xor ($d_,&DWP(4*6,$b));
&mov (&DWP(4*1,$a),$b_);
&mov (&DWP(4*2,$a),$c);
&mov (&DWP(4*3,$a),$c_);
&mov (&DWP(4*5,$a),$d);
&mov (&DWP(4*6,$a),$d_);
# words 7, 10, 11, 13, 15
&mov ($b_,&DWP(4*7,"esp"));
&mov ($c, &DWP(4*10,"esp"));
&mov ($c_,&DWP(4*11,"esp"));
&mov ($d, &DWP(4*13,"esp"));
&mov ($d_,&DWP(4*15,"esp"));
&add ($b_,&DWP(64+4*7,"esp"));
&add ($c, &DWP(64+4*10,"esp"));
&add ($c_,&DWP(64+4*11,"esp"));
&add ($d, &DWP(64+4*13,"esp"));
&add ($d_,&DWP(64+4*15,"esp"));
&xor ($b_,&DWP(4*7,$b));
&xor ($c, &DWP(4*10,$b));
&xor ($c_,&DWP(4*11,$b));
&xor ($d, &DWP(4*13,$b));
&xor ($d_,&DWP(4*15,$b));
# advance the input pointer by one block
&lea ($b,&DWP(4*16,$b));
&mov (&DWP(4*7,$a),$b_);
# fetch the parked word 0 and write it last
&mov ($b_,&DWP(4*0,"esp"));
&mov (&DWP(4*10,$a),$c);
&mov ($c,&wparam(2)); # len
&mov (&DWP(4*11,$a),$c_);
&mov (&DWP(4*13,$a),$d);
&mov (&DWP(4*15,$a),$d_);
&mov (&DWP(4*0,$a),$b_);
# advance the output pointer by one block
&lea ($a,&DWP(4*16,$a));
&sub ($c,64);
&jnz (&label("outer_loop"));
&jmp (&label("done"));
# Tail (fewer than 64 bytes): finish the key stream block in place in the
# working state at 0..63(esp), then xor it into the data one byte at a time.
&set_label("tail");
&add ($d, &DWP(64+4*12,"esp"));
&add ($d_,&DWP(64+4*14,"esp"));
&mov (&DWP(4*0,"esp"),$a);
&mov (&DWP(4*4,"esp"),$b_);
&mov (&DWP(4*8,"esp"),$c);
&mov (&DWP(4*9,"esp"),$c_);
&mov (&DWP(4*12,"esp"),$d);
&mov (&DWP(4*14,"esp"),$d_);
&mov ($b_,&DWP(4*1,"esp"));
&mov ($c, &DWP(4*2,"esp"));
&mov ($c_,&DWP(4*3,"esp"));
&mov ($d, &DWP(4*5,"esp"));
&mov ($d_,&DWP(4*6,"esp"));
&add ($b_,0x3320646e); # accumulate key material
&add ($c, 0x79622d32);
&add ($c_,0x6b206574);
&add ($d, &DWP(64+4*5,"esp"));
&add ($d_,&DWP(64+4*6,"esp"));
&mov (&DWP(4*1,"esp"),$b_);
&mov (&DWP(4*2,"esp"),$c);
&mov (&DWP(4*3,"esp"),$c_);
&mov (&DWP(4*5,"esp"),$d);
&mov (&DWP(4*6,"esp"),$d_);
&mov ($b_,&DWP(4*7,"esp"));
&mov ($c, &DWP(4*10,"esp"));
&mov ($c_,&DWP(4*11,"esp"));
&mov ($d, &DWP(4*13,"esp"));
&mov ($d_,&DWP(4*15,"esp"));
&add ($b_,&DWP(64+4*7,"esp"));
&add ($c, &DWP(64+4*10,"esp"));
&add ($c_,&DWP(64+4*11,"esp"));
&add ($d, &DWP(64+4*13,"esp"));
&add ($d_,&DWP(64+4*15,"esp"));
&mov (&DWP(4*7,"esp"),$b_);
&mov ($b_,&wparam(1)); # load input
&mov (&DWP(4*10,"esp"),$c);
&mov ($c,&wparam(0)); # load output
&mov (&DWP(4*11,"esp"),$c_);
# $c_ becomes the byte index; $b still holds the remaining length
&xor ($c_,$c_);
&mov (&DWP(4*13,"esp"),$d);
&mov (&DWP(4*15,"esp"),$d_);
&xor ("eax","eax");
&xor ("edx","edx");
# out[idx] = in[idx] ^ keystream[idx], until the length reaches zero
&set_label("tail_loop");
&movb ("al",&BP(0,$c_,$b_));
&movb ("dl",&BP(0,"esp",$c_));
&lea ($c_,&DWP(1,$c_));
&xor ("al","dl");
&mov (&BP(-1,$c,$c_),"al");
&dec ($b);
&jnz (&label("tail_loop"));
&set_label("done");
&stack_pop(33);
# the zero-length exit skips the stack_pop because nothing was pushed yet
&set_label("no_data");
&function_end("ChaCha20_ctr32");
if ($xmm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");
# Emit one SSE quarter-round for the four-blocks-at-a-time code path.
#
# Arguments:
#   $ai,$bi,$ci,$di - indices (0..15) of the state words used as a, b, c, d;
#                     vector N of the working state lives at 16*N-128(ebx)
#   $i              - position of this call inside the unrolled double round
#                     (0..7, see the call sites)
#
# Uses the register-name lexicals $xa/$xa_, $xb/$xb_, $xc/$xc_, $xd/$xd_
# declared in the enclosing block, and expects eax to point at ssse3_data
# (the pshufb masks for the 16- and 8-bit rotations are read from 0(eax) and
# 16(eax)).  As in the scalar QUARTERROUND, stores for the previous call and
# loads for the next call are interleaved with the arithmetic, and the
# primary/alternate register names are swapped at the end.  The leading
# "paddd a,b" and "pxor d,a" of each quarter-round are emitted by the previous
# call (or before the loop).
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
# Default neighbours: the word one position later/earlier within the same
# group of four.
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
# a b c d
#
# 0 4 8 12 < even round
# 1 5 9 13
# 2 6 10 14
# 3 7 11 15
# 0 5 10 15 < odd round
# 1 6 11 12
# 2 7 8 13
# 3 4 9 14
# At the seams between the even and the odd round the neighbouring call
# follows the other index pattern, so the defaults are shifted per column.
if ($i==0) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==3) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
} elsif ($i==4) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==7) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
}
#&paddd ($xa,$xb); # see elsewhere
#&pxor ($xd,$xa); # see elsewhere
# write back the previous call's c (only for the two middle columns)
&movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
&pshufb ($xd,&QWP(0,"eax")); # rot16
# write back the previous call's b
&movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0);
&paddd ($xc,$xd);
# preload the next call's c and b
&movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
&pxor ($xb,$xc);
&movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7);
# rotate b left by 12 using a shift pair and the alternate a register
&movdqa ($xa_,$xb); # borrow as temporary
&pslld ($xb,12);
&psrld ($xa_,20);
&por ($xb,$xa_);
# preload the next call's a and d
&movdqa($xa_,&QWP(16*$an-128,"ebx"));
&paddd ($xa,$xb);
&movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn);
&pxor ($xd,$xa);
# a is finished: store it (its register is reused as a temporary below)
&movdqa (&QWP(16*$ai-128,"ebx"),$xa);
&pshufb ($xd,&QWP(16,"eax")); # rot8
&paddd ($xc,$xd);
# d is either stored, or handed over in the alternate register when the
# next call uses the same word
&movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn);
&movdqa ($xd_,$xd) if ($di==$dn);
&pxor ($xb,$xc);
# leading "paddd a,b" of the NEXT quarter-round, issued early
&paddd ($xa_,$xb_) if ($i<7); # elsewhere
# rotate b left by 7
&movdqa ($xa,$xb); # borrow as temporary
&pslld ($xb,7);
&psrld ($xa,25);
# leading "pxor d,a" of the NEXT quarter-round, issued early
&pxor ($xd_,$xa_) if ($i<7); # elsewhere
&por ($xb,$xa);
# swap primary and alternate register names for the next call
($xa,$xa_)=($xa_,$xa);
($xb,$xb_)=($xb_,$xb);
($xc,$xc_)=($xc_,$xc);
($xd,$xd_)=($xd_,$xd);
}
# ChaCha20_ssse3: SSSE3 implementation.  It takes the same five arguments as
# ChaCha20_ctr32 and is reached from that function's dispatcher through the
# ssse3_shortcut label placed right after the prologue.
# NOTE(review): the code below uses eax as the address of pic_point, which is
# only set up by the dispatcher in ChaCha20_ctr32 -- it presumably must not be
# entered any other way; confirm.
&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
&mov ($out,&wparam(0));
&mov ($inp,&wparam(1));
&mov ($len,&wparam(2));
&mov ("edx",&wparam(3)); # key
&mov ("ebx",&wparam(4)); # counter and nonce
# Allocate a 64-byte aligned frame and remember the original esp at
# 512(esp).  Frame use below:
#   0..255(esp)    working state (addressed as N-128(ebx) in the 4x path)
#   256..511(esp)  input state (addressed as N-128(ebp) in the 4x path)
#   512(esp)       saved esp, 512+4 key pointer, 512+8 counter pointer
&mov ("ebp","esp");
&stack_push (131);
&and ("esp",-64);
&mov (&DWP(512,"esp"),"ebp");
# eax = address of the ssse3_data table, computed relative to pic_point
&lea ("eax",&DWP(&label("ssse3_data")."-".
&label("pic_point"),"eax"));
&movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce
# less than four blocks of data: use the one-block-at-a-time path
&cmp ($len,64*4);
&jb (&label("1x"));
# ---- 4x path: four blocks per iteration ----
# Every state word is broadcast to all four lanes of a vector, so that lane
# k of each vector belongs to block k.
&mov (&DWP(512+4,"esp"),"edx"); # offload pointers
&mov (&DWP(512+8,"esp"),"ebx");
&sub ($len,64*4); # bias len
&lea ("ebp",&DWP(256+128,"esp")); # size optimization
&movdqu ("xmm7",&QWP(0,"edx")); # key
&pshufd ("xmm0","xmm3",0x00);
&pshufd ("xmm1","xmm3",0x55);
&pshufd ("xmm2","xmm3",0xaa);
&pshufd ("xmm3","xmm3",0xff);
# lanes get counter+0..3; 4 is subtracted again because the outer loop adds
# 4 before every batch, including the first
&paddd ("xmm0",&QWP(16*3,"eax")); # fix counters
&pshufd ("xmm4","xmm7",0x00);
&pshufd ("xmm5","xmm7",0x55);
&psubd ("xmm0",&QWP(16*4,"eax"));
&pshufd ("xmm6","xmm7",0xaa);
&pshufd ("xmm7","xmm7",0xff);
&movdqa (&QWP(16*12-128,"ebp"),"xmm0");
&movdqa (&QWP(16*13-128,"ebp"),"xmm1");
&movdqa (&QWP(16*14-128,"ebp"),"xmm2");
&movdqa (&QWP(16*15-128,"ebp"),"xmm3");
&movdqu ("xmm3",&QWP(16,"edx")); # key
&movdqa (&QWP(16*4-128,"ebp"),"xmm4");
&movdqa (&QWP(16*5-128,"ebp"),"xmm5");
&movdqa (&QWP(16*6-128,"ebp"),"xmm6");
&movdqa (&QWP(16*7-128,"ebp"),"xmm7");
&movdqa ("xmm7",&QWP(16*2,"eax")); # sigma
&lea ("ebx",&DWP(128,"esp")); # size optimization
&pshufd ("xmm0","xmm3",0x00);
&pshufd ("xmm1","xmm3",0x55);
&pshufd ("xmm2","xmm3",0xaa);
&pshufd ("xmm3","xmm3",0xff);
&pshufd ("xmm4","xmm7",0x00);
&pshufd ("xmm5","xmm7",0x55);
&pshufd ("xmm6","xmm7",0xaa);
&pshufd ("xmm7","xmm7",0xff);
&movdqa (&QWP(16*8-128,"ebp"),"xmm0");
&movdqa (&QWP(16*9-128,"ebp"),"xmm1");
&movdqa (&QWP(16*10-128,"ebp"),"xmm2");
&movdqa (&QWP(16*11-128,"ebp"),"xmm3");
&movdqa (&QWP(16*0-128,"ebp"),"xmm4");
&movdqa (&QWP(16*1-128,"ebp"),"xmm5");
&movdqa (&QWP(16*2-128,"ebp"),"xmm6");
&movdqa (&QWP(16*3-128,"ebp"),"xmm7");
# bias the data pointers by 128 so the accesses below can use offsets that
# start at -128
&lea ($inp,&DWP(128,$inp)); # size optimization
&lea ($out,&DWP(128,$out)); # size optimization
&jmp (&label("outer_loop"));
# Per-batch loop: copy the input state (ebp area) into the working state
# (ebx area).  The vectors that are loaded into registers right before the
# round loop are not copied; their lines are left commented out.
&set_label("outer_loop",16);
#&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
&movdqa ("xmm1",&QWP(16*1-128,"ebp"));
&movdqa ("xmm2",&QWP(16*2-128,"ebp"));
&movdqa ("xmm3",&QWP(16*3-128,"ebp"));
#&movdqa ("xmm4",&QWP(16*4-128,"ebp"));
&movdqa ("xmm5",&QWP(16*5-128,"ebp"));
&movdqa ("xmm6",&QWP(16*6-128,"ebp"));
&movdqa ("xmm7",&QWP(16*7-128,"ebp"));
#&movdqa (&QWP(16*0-128,"ebx"),"xmm0");
&movdqa (&QWP(16*1-128,"ebx"),"xmm1");
&movdqa (&QWP(16*2-128,"ebx"),"xmm2");
&movdqa (&QWP(16*3-128,"ebx"),"xmm3");
#&movdqa (&QWP(16*4-128,"ebx"),"xmm4");
&movdqa (&QWP(16*5-128,"ebx"),"xmm5");
&movdqa (&QWP(16*6-128,"ebx"),"xmm6");
&movdqa (&QWP(16*7-128,"ebx"),"xmm7");
#&movdqa ("xmm0",&QWP(16*8-128,"ebp"));
#&movdqa ("xmm1",&QWP(16*9-128,"ebp"));
&movdqa ("xmm2",&QWP(16*10-128,"ebp"));
&movdqa ("xmm3",&QWP(16*11-128,"ebp"));
&movdqa ("xmm4",&QWP(16*12-128,"ebp"));
&movdqa ("xmm5",&QWP(16*13-128,"ebp"));
&movdqa ("xmm6",&QWP(16*14-128,"ebp"));
&movdqa ("xmm7",&QWP(16*15-128,"ebp"));
# advance all four lane counters by 4
&paddd ("xmm4",&QWP(16*4,"eax")); # counter value
#&movdqa (&QWP(16*8-128,"ebx"),"xmm0");
#&movdqa (&QWP(16*9-128,"ebx"),"xmm1");
&movdqa (&QWP(16*10-128,"ebx"),"xmm2");
&movdqa (&QWP(16*11-128,"ebx"),"xmm3");
&movdqa (&QWP(16*12-128,"ebx"),"xmm4");
&movdqa (&QWP(16*13-128,"ebx"),"xmm5");
&movdqa (&QWP(16*14-128,"ebx"),"xmm6");
&movdqa (&QWP(16*15-128,"ebx"),"xmm7");
&movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value
# operands of the first quarter-rounds: words 0, 12, 4, 8 and 9
&movdqa ($xa, &QWP(16*0-128,"ebp"));
&movdqa ($xd, "xmm4");
&movdqa ($xb_,&QWP(16*4-128,"ebp"));
&movdqa ($xc, &QWP(16*8-128,"ebp"));
&movdqa ($xc_,&QWP(16*9-128,"ebp"));
# ten iterations of eight quarter-rounds each
&mov ("edx",10); # loop counter
&nop ();
&set_label("loop",16);
# leading operations of quarter-round 0, which QUARTERROUND_SSSE3 expects to
# have been issued already
&paddd ($xa,$xb_); # elsewhere
&movdqa ($xb,$xb_);
&pxor ($xd,$xa); # elsewhere
&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
&dec ("edx");
&jnz (&label("loop"));
# flush the words still held in registers (4, 8, 9, 12, 14) to the working
# state; word 0 stays in xmm0
&movdqa (&QWP(16*4-128,"ebx"),$xb_);
&movdqa (&QWP(16*8-128,"ebx"),$xc);
&movdqa (&QWP(16*9-128,"ebx"),$xc_);
&movdqa (&QWP(16*12-128,"ebx"),$xd);
&movdqa (&QWP(16*14-128,"ebx"),$xd_);
# new names for xmm0..7 for the output stage
my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
#&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
&movdqa ($xa1,&QWP(16*1-128,"ebx"));
&movdqa ($xa2,&QWP(16*2-128,"ebx"));
&movdqa ($xa3,&QWP(16*3-128,"ebx"));
# Output stage, unrolled four times at generation time: $i is the byte
# offset of a group of four state vectors (words 0-3, 4-7, 8-11, 12-15).
# Each pass adds the input state, transposes the 4x4 dwords so that every
# register holds four consecutive words of one block, and xors them with
# 16 bytes from each of the four 64-byte blocks.
for($i=0;$i<256;$i+=64) {
&paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
&paddd ($xa1,&QWP($i+16*1-128,"ebp"));
&paddd ($xa2,&QWP($i+16*2-128,"ebp"));
&paddd ($xa3,&QWP($i+16*3-128,"ebp"));
&movdqa ($xt2,$xa0); # "de-interlace" data
&punpckldq ($xa0,$xa1);
&movdqa ($xt3,$xa2);
&punpckldq ($xa2,$xa3);
&punpckhdq ($xt2,$xa1);
&punpckhdq ($xt3,$xa3);
&movdqa ($xa1,$xa0);
&punpcklqdq ($xa0,$xa2); # "a0"
&movdqa ($xa3,$xt2);
&punpcklqdq ($xt2,$xt3); # "a2"
&punpckhqdq ($xa1,$xa2); # "a1"
&punpckhqdq ($xa3,$xt3); # "a3"
#($xa2,$xt2)=($xt2,$xa2);
&movdqu ($xt0,&QWP(64*0-128,$inp)); # load input
&movdqu ($xt1,&QWP(64*1-128,$inp));
&movdqu ($xa2,&QWP(64*2-128,$inp));
&movdqu ($xt3,&QWP(64*3-128,$inp));
# step 16 bytes within the blocks; on the last pass skip to the next batch
# (16*3 + 64*4-16*3 = 256 bytes in total)
&lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
# xor with input while preloading the next group of four vectors
&pxor ($xt0,$xa0);
&movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192);
&pxor ($xt1,$xa1);
&movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192);
&pxor ($xt2,$xa2);
&movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192);
&pxor ($xt3,$xa3);
&movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192);
&movdqu (&QWP(64*0-128,$out),$xt0); # store output
&movdqu (&QWP(64*1-128,$out),$xt1);
&movdqu (&QWP(64*2-128,$out),$xt2);
&movdqu (&QWP(64*3-128,$out),$xt3);
&lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
}
# len is biased by -256: no borrow means at least four more blocks remain
&sub ($len,64*4);
&jnc (&label("outer_loop"));
# remove the bias; finished if nothing is left
&add ($len,64*4);
&jz (&label("done"));
# Fewer than four blocks remain: set up for the 1x path.
&mov ("ebx",&DWP(512+8,"esp")); # restore pointers
&lea ($inp,&DWP(-128,$inp));
&mov ("edx",&DWP(512+4,"esp"));
&lea ($out,&DWP(-128,$out));
# Rebuild the counter/nonce vector in xmm3: take lane 0 of the saved 4x
# counter, add 4, and merge it with the nonce words of the caller's block
# (the counter word is masked out with the 0,-1,-1,-1 constant).
&movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value
&movdqu ("xmm3",&QWP(0,"ebx"));
&paddd ("xmm2",&QWP(16*6,"eax")); # +four
&pand ("xmm3",&QWP(16*7,"eax"));
&por ("xmm3","xmm2"); # counter value
{
# ---- 1x path: one block per iteration ----
# Each of $a..$d holds one whole row (four words) of a single block's state;
# $t/$t1 are temporaries and $rot16/$rot24 hold the pshufb rotation masks.
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
# Emit one round: four quarter-rounds in parallel across the lanes of
# $a,$b,$c,$d.  The 16- and 8-bit rotations use pshufb, the 12- and 7-bit
# rotations use a shift pair combined with por.
sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
&paddd ($a,$b);
&pxor ($d,$a);
&pshufb ($d,$rot16);
&paddd ($c,$d);
&pxor ($b,$c);
&movdqa ($t,$b);
&psrld ($b,20);
&pslld ($t,12);
&por ($b,$t);
&paddd ($a,$b);
&pxor ($d,$a);
&pshufb ($d,$rot24);
&paddd ($c,$d);
&pxor ($b,$c);
&movdqa ($t,$b);
&psrld ($b,25);
&pslld ($t,7);
&por ($b,$t);
}
&set_label("1x");
&movdqa ($a,&QWP(16*2,"eax")); # sigma
&movdqu ($b,&QWP(0,"edx"));
&movdqu ($c,&QWP(16,"edx"));
#&movdqu ($d,&QWP(0,"ebx")); # already loaded
&movdqa ($rot16,&QWP(0,"eax"));
&movdqa ($rot24,&QWP(16,"eax"));
# NOTE(review): this dword store targets the same slot that the movdqa of $d
# four lines below overwrites, so it looks like a dead store -- confirm.
&mov (&DWP(16*3,"esp"),"ebp");
# keep the input state at 0..63(esp) for the final addition
&movdqa (&QWP(16*0,"esp"),$a);
&movdqa (&QWP(16*1,"esp"),$b);
&movdqa (&QWP(16*2,"esp"),$c);
&movdqa (&QWP(16*3,"esp"),$d);
&mov ("edx",10);
&jmp (&label("loop1x"));
# next block: reload the input state and increment the counter by one
&set_label("outer1x",16);
&movdqa ($d,&QWP(16*5,"eax")); # one
&movdqa ($a,&QWP(16*0,"esp"));
&movdqa ($b,&QWP(16*1,"esp"));
&movdqa ($c,&QWP(16*2,"esp"));
&paddd ($d,&QWP(16*3,"esp"));
&mov ("edx",10);
&movdqa (&QWP(16*3,"esp"),$d);
&jmp (&label("loop1x"));
# Ten double rounds.  Between the two rounds the lanes of b, c and d are
# rotated by pshufd so that the second SSSE3ROUND works on the other set of
# word combinations; the second group of pshufd rotates them back.
&set_label("loop1x",16);
&SSSE3ROUND();
&pshufd ($c,$c,0b01001110);
&pshufd ($b,$b,0b00111001);
&pshufd ($d,$d,0b10010011);
&nop ();
&SSSE3ROUND();
&pshufd ($c,$c,0b01001110);
&pshufd ($b,$b,0b10010011);
&pshufd ($d,$d,0b00111001);
&dec ("edx");
&jnz (&label("loop1x"));
# add the input state to obtain the key stream block
&paddd ($a,&QWP(16*0,"esp"));
&paddd ($b,&QWP(16*1,"esp"));
&paddd ($c,&QWP(16*2,"esp"));
&paddd ($d,&QWP(16*3,"esp"));
# fewer than 64 bytes left: handle byte by byte
&cmp ($len,64);
&jb (&label("tail"));
&movdqu ($t,&QWP(16*0,$inp));
&movdqu ($t1,&QWP(16*1,$inp));
&pxor ($a,$t); # xor with input
&movdqu ($t,&QWP(16*2,$inp));
&pxor ($b,$t1);
&movdqu ($t1,&QWP(16*3,$inp));
&pxor ($c,$t);
&pxor ($d,$t1);
&lea ($inp,&DWP(16*4,$inp)); # inp+=64
&movdqu (&QWP(16*0,$out),$a); # write output
&movdqu (&QWP(16*1,$out),$b);
&movdqu (&QWP(16*2,$out),$c);
&movdqu (&QWP(16*3,$out),$d);
&lea ($out,&DWP(16*4,$out)); # out+=64
&sub ($len,64);
&jnz (&label("outer1x"));
&jmp (&label("done"));
# Tail: spill the key stream block to 0..63(esp) and xor it into the data
# one byte at a time, with ebp as the byte index.
&set_label("tail");
&movdqa (&QWP(16*0,"esp"),$a);
&movdqa (&QWP(16*1,"esp"),$b);
&movdqa (&QWP(16*2,"esp"),$c);
&movdqa (&QWP(16*3,"esp"),$d);
&xor ("eax","eax");
&xor ("edx","edx");
&xor ("ebp","ebp");
&set_label("tail_loop");
&movb ("al",&BP(0,"esp","ebp"));
&movb ("dl",&BP(0,$inp,"ebp"));
&lea ("ebp",&DWP(1,"ebp"));
&xor ("al","dl");
&movb (&BP(-1,$out,"ebp"),"al");
&dec ($len);
&jnz (&label("tail_loop"));
}
# restore the stack pointer saved before the frame was aligned
&set_label("done");
&mov ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_ssse3");
# Constant table used by the SSSE3 code, addressed as 16*N(eax).
&align (64);
&set_label("ssse3_data");
# 16*0: pshufb mask rotating every dword left by 16 bits
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
# 16*1: pshufb mask rotating every dword left by 8 bits
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
# 16*2: the four constant state words ("sigma")
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
# 16*3: per-lane counter offsets for the 4x path
&data_word(0,1,2,3);
# 16*4: counter step for the 4x path (four blocks per batch)
&data_word(4,4,4,4);
# 16*5: counter increment of one for the 1x path
&data_word(1,0,0,0);
# 16*6: counter increment of four, applied when leaving the 4x path
&data_word(4,0,0,0);
# 16*7: mask that clears the counter word and keeps the other three words
&data_word(0,-1,-1,-1);
&align (64);
}
# Identification string placed in the generated assembly.
&asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
# Let x86asm.pl finish and print the generated assembly.
&asm_finish();
# STDOUT is buffered, so a write failure (full disk, broken pipe, ...) may
# only be reported when the handle is closed.  Fail the build in that case
# instead of silently leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";