fdd8e9c8c7
Depending on architecture, perlasm differed on which one or both of: perl foo.pl flavor output.S perl foo.pl flavor > output.S Upstream has now unified on the first form after making a number of changes to their files (the second does not even work for their x86 files anymore). Sync those portions of our perlasm scripts with upstream and update CMakeLists.txt and generate_build_files.py per the new convention. This imports various commits like this one: 184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a diff, so I don't have the full list) Confirmed that generate_build_files.py sees no change. BUG=14 Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2 Reviewed-on: https://boringssl-review.googlesource.com/8518 Reviewed-by: Adam Langley <agl@google.com>
1152 lines
27 KiB
Raku
Executable File
1152 lines
27 KiB
Raku
Executable File
#!/usr/bin/env perl
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
# December 2014
|
|
#
|
|
# ChaCha20 for ARMv4.
|
|
#
|
|
# Performance in cycles per byte out of large buffer.
|
|
#
|
|
# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
|
|
#
|
|
# Cortex-A5 19.3(*)/+95% 21.8 14.1
|
|
# Cortex-A8 10.5(*)/+160% 13.9 6.35
|
|
# Cortex-A9 12.9(**)/+110% 14.3 6.50
|
|
# Cortex-A15 11.0/+40% 16.0 5.00
|
|
# Snapdragon S4 11.5/+125% 13.6 4.90
|
|
#
|
|
# (*) most "favourable" result for aligned data on little-endian
|
|
# processor, result for misaligned data is 10-15% lower;
|
|
# (**) this result is a trade-off: it can be improved by 20%,
|
|
# but then Snapdragon S4 and Cortex-A8 results get
|
|
# 20-25% worse;
|
|
|
|
# Command line: perl chacha-armv4.pl <flavour> <output>
#
# The first argument is taken as the flavour unless it already looks like a
# file name (word characters followed by an extension), in which case it is
# the output file and there is no flavour.  Otherwise further arguments are
# consumed until one looks like a file name; that one becomes $output.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl: first next to this script, then in ../../perlasm.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on is piped through the translator.
    # A pipe open only reports failure to start the child here; a failing
    # child is reported when STDOUT is closed.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && $output ne "") {
    # No translation requested: write the raw perlasm output to the file.
    open STDOUT,'>',$output
        or die "can't open $output: $!";
}
# With neither a flavour nor an output file, STDOUT is left untouched so
# the generated code goes wherever the caller redirected it.
|
|
|
|
# Thunk providing a [simplified] x86-style perlasm interface: a call to any
# undefined sub is turned into one line of assembler appended to $code.
# The sub name is the mnemonic (its first '_' becomes '.') and the arguments
# are the operands; a purely numeric last operand is emitted as an immediate.
# For example, &vshr_u32("q1","q12",20) appends "\tvshr.u32\tq1,q12,#20\n".
sub AUTOLOAD()
{
    # Strip the package qualifier, then map e.g. vadd_i32 to vadd.i32.
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;
    $mnemonic =~ s/_/\./;

    # Only the last operand is checked for being a number.
    my $last = pop;
    if ($last*1 eq $last) {
        $last = "#$last";
    }

    $code .= "\t" . $mnemonic . "\t" . join(',', @_, $last) . "\n";
}
|
|
|
|
# Register assignment for the integer-only code path.  @x is indexed by
# state word number; an "rx" entry marks a word that has no register of its
# own (words 8-11, 13 and 15).  @t holds the four scratch registers.
my @x = map { 'r' . $_ } (0..7, ("x") x 4, 12, "x", 14, "x");
my @t = map { 'r' . $_ } (8..11);

# Returns the list of perlasm call strings (to be eval'ed one by one) for a
# full round of the integer-only code.  The arguments are the state word
# indices (a,b,c,d) of the first of four index sets; the remaining three
# sets are derived from it.  An odd $d0 selects the odd-round variant.
sub ROUND {
    my ($a0,$b0,$c0,$d0) = @_;

    # Step every index to the next word inside its group of four,
    # wrapping around at the end of the group.
    my $next = sub { map { ($_ & ~3) + (($_ + 1) & 3) } @_ };
    my ($a1,$b1,$c1,$d1) = $next->($a0,$b0,$c0,$d0);
    my ($a2,$b2,$c2,$d2) = $next->($a1,$b1,$c1,$d1);
    my ($a3,$b3,$c3,$d3) = $next->($a2,$b2,$c2,$d2);

    my $odd = $d0 & 1;
    my ($xc,$xc_) = @t[0,1];
    my ($xd,$xd_) = $odd ? ($t[2],$x[$d1]) : ($x[$d0],$t[2]);
    my @ret;

    # Consider order in which variables are addressed by their
    # index:
    #
    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14
    #
    # 'a', 'b' are permanently allocated in registers, @x[0..7],
    # while 'c's and pair of 'd's are maintained in memory. If
    # you observe 'c' column, you'll notice that pair of 'c's is
    # invariant between rounds. This means that we have to reload
    # them once per round, in the middle. This is why you'll see
    # bunch of 'c' stores and loads in the middle, but none in
    # the beginning or end. If you observe 'd' column, you'll
    # notice that 15 and 13 are reused in next pair of rounds.
    # This is why these two are chosen for offloading to memory,
    # to make loads count more.

    # Six instructions updating the 'a' and 'd' words of two index sets
    # with rotate amount $n.  $xd/$xd_ are read at call time, so the
    # closure follows their re-assignment half-way through the round.
    my $step_ad = sub {
        my ($ai,$aj,$bi,$bj,$n) = @_;
        ( "&add (@x[$ai],@x[$ai],@x[$bi])",
          "&mov ($xd,$xd,'ror#${n}')",
          "&add (@x[$aj],@x[$aj],@x[$bj])",
          "&mov ($xd_,$xd_,'ror#${n}')",
          "&eor ($xd,$xd,@x[$ai],'ror#${n}')",
          "&eor ($xd_,$xd_,@x[$aj],'ror#${n}')" );
    };

    # Six instructions updating the 'c' and 'b' words of two index sets
    # with rotate amount $n.
    my $step_cb = sub {
        my ($bi,$bj,$n) = @_;
        ( "&add ($xc,$xc,$xd)",
          "&mov (@x[$bi],@x[$bi],'ror#${n}')",
          "&add ($xc_,$xc_,$xd_)",
          "&mov (@x[$bj],@x[$bj],'ror#${n}')",
          "&eor (@x[$bi],@x[$bi],$xc,'ror#${n}')",
          "&eor (@x[$bj],@x[$bj],$xc_,'ror#${n}')" );
    };

    # First half: index sets 0 and 1.
    push @ret, $step_ad->($a0,$a1,$b0,$b1,16);
    push @ret, $step_cb->($b0,$b1,20);
    push @ret, $step_ad->($a0,$a1,$b0,$b1,24);

    # The last 'c'/'b' step of this half is spelled out because the swap
    # of the memory-resident 'd' word is slotted into it; which of the two
    # 'd' registers gets swapped depends on the round's parity.
    push @ret, "&add ($xc,$xc,$xd)",
               "&mov (@x[$b0],@x[$b0],'ror#25')";
    if ($odd) {
        push @ret, "&str ($xd,'[sp,#4*(16+$d0)]')",
                   "&ldr ($xd,'[sp,#4*(16+$d2)]')";
    }
    push @ret, "&add ($xc_,$xc_,$xd_)",
               "&mov (@x[$b1],@x[$b1],'ror#25')";
    if (!$odd) {
        push @ret, "&str ($xd_,'[sp,#4*(16+$d1)]')",
                   "&ldr ($xd_,'[sp,#4*(16+$d3)]')";
    }
    push @ret, "&eor (@x[$b0],@x[$b0],$xc,'ror#25')",
               "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')";

    # Second half: index sets 2 and 3.  The 'd' word that was not swapped
    # through memory above now comes from its permanent register.
    if ($odd) { $xd_ = $x[$d3]; }
    else      { $xd  = $x[$d2]; }

    # First step of the second half, with the store/reload of both 'c'
    # words interleaved.
    push @ret, "&str ($xc,'[sp,#4*(16+$c0)]')",
               "&ldr ($xc,'[sp,#4*(16+$c2)]')",
               "&add (@x[$a2],@x[$a2],@x[$b2])",
               "&mov ($xd,$xd,'ror#16')",
               "&str ($xc_,'[sp,#4*(16+$c1)]')",
               "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
               "&add (@x[$a3],@x[$a3],@x[$b3])",
               "&mov ($xd_,$xd_,'ror#16')",
               "&eor ($xd,$xd,@x[$a2],'ror#16')",
               "&eor ($xd_,$xd_,@x[$a3],'ror#16')";

    push @ret, $step_cb->($b2,$b3,20);
    push @ret, $step_ad->($a2,$a3,$b2,$b3,24);
    push @ret, $step_cb->($b2,$b3,25);

    return @ret;
}
|
|
|
|
$code.=<<___;
|
|
#include <openssl/arm_arch.h>
|
|
|
|
.text
|
|
#if defined(__thumb2__)
|
|
.syntax unified
|
|
.thumb
|
|
#else
|
|
.code 32
|
|
#endif
|
|
|
|
#if defined(__thumb2__) || defined(__clang__)
|
|
#define ldrhsb ldrbhs
|
|
#endif
|
|
|
|
.align 5
|
|
.Lsigma:
|
|
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
|
|
.Lone:
|
|
.long 1,0,0,0
|
|
#if __ARM_MAX_ARCH__>=7
|
|
.LOPENSSL_armcap:
|
|
.word OPENSSL_armcap_P-.LChaCha20_ctr32
|
|
#else
|
|
.word -1
|
|
#endif
|
|
|
|
.globl ChaCha20_ctr32
|
|
.type ChaCha20_ctr32,%function
|
|
.align 5
|
|
ChaCha20_ctr32:
|
|
.LChaCha20_ctr32:
|
|
ldr r12,[sp,#0] @ pull pointer to counter and nonce
|
|
stmdb sp!,{r0-r2,r4-r11,lr}
|
|
#if __ARM_ARCH__<7 && !defined(__thumb2__)
|
|
sub r14,pc,#16 @ ChaCha20_ctr32
|
|
#else
|
|
adr r14,.LChaCha20_ctr32
|
|
#endif
|
|
cmp r2,#0 @ len==0?
|
|
#ifdef __thumb2__
|
|
itt eq
|
|
#endif
|
|
addeq sp,sp,#4*3
|
|
beq .Lno_data
|
|
#if __ARM_MAX_ARCH__>=7
|
|
cmp r2,#192 @ test len
|
|
bls .Lshort
|
|
ldr r4,[r14,#-32]
|
|
ldr r4,[r14,r4]
|
|
# ifdef __APPLE__
|
|
ldr r4,[r4]
|
|
# endif
|
|
tst r4,#ARMV7_NEON
|
|
bne .LChaCha20_neon
|
|
.Lshort:
|
|
#endif
|
|
ldmia r12,{r4-r7} @ load counter and nonce
|
|
sub sp,sp,#4*(16) @ off-load area
|
|
sub r14,r14,#64 @ .Lsigma
|
|
stmdb sp!,{r4-r7} @ copy counter and nonce
|
|
ldmia r3,{r4-r11} @ load key
|
|
ldmia r14,{r0-r3} @ load sigma
|
|
stmdb sp!,{r4-r11} @ copy key
|
|
stmdb sp!,{r0-r3} @ copy sigma
|
|
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
|
|
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
|
|
b .Loop_outer_enter
|
|
|
|
.align 4
|
|
.Loop_outer:
|
|
ldmia sp,{r0-r9} @ load key material
|
|
str @t[3],[sp,#4*(32+2)] @ save len
|
|
str r12, [sp,#4*(32+1)] @ save inp
|
|
str r14, [sp,#4*(32+0)] @ save out
|
|
.Loop_outer_enter:
|
|
ldr @t[3], [sp,#4*(15)]
|
|
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
|
|
ldr @t[2], [sp,#4*(13)]
|
|
ldr @x[14],[sp,#4*(14)]
|
|
str @t[3], [sp,#4*(16+15)]
|
|
mov @t[3],#10
|
|
b .Loop
|
|
|
|
.align 4
|
|
.Loop:
|
|
subs @t[3],@t[3],#1
|
|
___
|
|
# Body of .Loop: one even and one odd round.  Every string returned by ROUND
# is a perlasm call that is evaluated here and, via AUTOLOAD, appended to
# $code.  A string that fails to evaluate would otherwise just disappear
# from the generated assembly, so any eval error is fatal.
foreach my $insn (&ROUND(0, 4, 8,12), &ROUND(0, 5,10,15)) {
    eval $insn;
    die "failed to emit \"$insn\": $@" if $@;
}
|
|
$code.=<<___;
|
|
bne .Loop
|
|
|
|
ldr @t[3],[sp,#4*(32+2)] @ load len
|
|
|
|
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
|
|
str @t[1], [sp,#4*(16+9)]
|
|
str @x[12],[sp,#4*(16+12)]
|
|
str @t[2], [sp,#4*(16+13)]
|
|
str @x[14],[sp,#4*(16+14)]
|
|
|
|
@ at this point we have first half of 512-bit result in
|
|
@ @x[0-7] and second half at sp+4*(16+8)
|
|
|
|
cmp @t[3],#64 @ done yet?
|
|
#ifdef __thumb2__
|
|
itete lo
|
|
#endif
|
|
addlo r12,sp,#4*(0) @ shortcut or ...
|
|
ldrhs r12,[sp,#4*(32+1)] @ ... load inp
|
|
addlo r14,sp,#4*(0) @ shortcut or ...
|
|
ldrhs r14,[sp,#4*(32+0)] @ ... load out
|
|
|
|
ldr @t[0],[sp,#4*(0)] @ load key material
|
|
ldr @t[1],[sp,#4*(1)]
|
|
|
|
#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
|
|
# if __ARM_ARCH__<7
|
|
orr @t[2],r12,r14
|
|
tst @t[2],#3 @ are input and output aligned?
|
|
ldr @t[2],[sp,#4*(2)]
|
|
bne .Lunaligned
|
|
cmp @t[3],#64 @ restore flags
|
|
# else
|
|
ldr @t[2],[sp,#4*(2)]
|
|
# endif
|
|
ldr @t[3],[sp,#4*(3)]
|
|
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
add @x[1],@x[1],@t[1]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[0],[r12],#16 @ load input
|
|
ldrhs @t[1],[r12,#-12]
|
|
|
|
add @x[2],@x[2],@t[2]
|
|
add @x[3],@x[3],@t[3]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[2],[r12,#-8]
|
|
ldrhs @t[3],[r12,#-4]
|
|
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
# endif
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[0],@x[0],@t[0] @ xor with input
|
|
eorhs @x[1],@x[1],@t[1]
|
|
add @t[0],sp,#4*(4)
|
|
str @x[0],[r14],#16 @ store output
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[2],@x[2],@t[2]
|
|
eorhs @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[1],[r14,#-12]
|
|
str @x[2],[r14,#-8]
|
|
str @x[3],[r14,#-4]
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @x[5],@x[5],@t[1]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[0],[r12],#16 @ load input
|
|
ldrhs @t[1],[r12,#-12]
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[7],@x[7],@t[3]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[2],[r12,#-8]
|
|
ldrhs @t[3],[r12,#-4]
|
|
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[4],@x[4],@t[0]
|
|
eorhs @x[5],@x[5],@t[1]
|
|
add @t[0],sp,#4*(8)
|
|
str @x[4],[r14],#16 @ store output
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[6],@x[6],@t[2]
|
|
eorhs @x[7],@x[7],@t[3]
|
|
str @x[5],[r14,#-12]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[6],[r14,#-8]
|
|
add @x[0],sp,#4*(16+8)
|
|
str @x[7],[r14,#-4]
|
|
|
|
ldmia @x[0],{@x[0]-@x[7]} @ load second half
|
|
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
add @x[1],@x[1],@t[1]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[0],[r12],#16 @ load input
|
|
ldrhs @t[1],[r12,#-12]
|
|
# ifdef __thumb2__
|
|
itt hi
|
|
# endif
|
|
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
|
|
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
|
|
add @x[2],@x[2],@t[2]
|
|
add @x[3],@x[3],@t[3]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[2],[r12,#-8]
|
|
ldrhs @t[3],[r12,#-4]
|
|
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
# endif
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[0],@x[0],@t[0]
|
|
eorhs @x[1],@x[1],@t[1]
|
|
add @t[0],sp,#4*(12)
|
|
str @x[0],[r14],#16 @ store output
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[2],@x[2],@t[2]
|
|
eorhs @x[3],@x[3],@t[3]
|
|
str @x[1],[r14,#-12]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[2],[r14,#-8]
|
|
str @x[3],[r14,#-4]
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @x[5],@x[5],@t[1]
|
|
# ifdef __thumb2__
|
|
itt hi
|
|
# endif
|
|
addhi @t[0],@t[0],#1 @ next counter value
|
|
strhi @t[0],[sp,#4*(12)] @ save next counter value
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[0],[r12],#16 @ load input
|
|
ldrhs @t[1],[r12,#-12]
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[7],@x[7],@t[3]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhs @t[2],[r12,#-8]
|
|
ldrhs @t[3],[r12,#-4]
|
|
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[4],@x[4],@t[0]
|
|
eorhs @x[5],@x[5],@t[1]
|
|
# ifdef __thumb2__
|
|
it ne
|
|
# endif
|
|
ldrne @t[0],[sp,#4*(32+2)] @ re-load len
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
eorhs @x[6],@x[6],@t[2]
|
|
eorhs @x[7],@x[7],@t[3]
|
|
str @x[4],[r14],#16 @ store output
|
|
str @x[5],[r14,#-12]
|
|
# ifdef __thumb2__
|
|
it hs
|
|
# endif
|
|
subhs @t[3],@t[0],#64 @ len-=64
|
|
str @x[6],[r14,#-8]
|
|
str @x[7],[r14,#-4]
|
|
bhi .Loop_outer
|
|
|
|
beq .Ldone
|
|
# if __ARM_ARCH__<7
|
|
b .Ltail
|
|
|
|
.align 4
|
|
.Lunaligned: @ unaligned endian-neutral path
|
|
cmp @t[3],#64 @ restore flags
|
|
# endif
|
|
#endif
|
|
#if __ARM_ARCH__<7
|
|
ldr @t[3],[sp,#4*(3)]
|
|
___
|
|
for ($i=0;$i<16;$i+=4) {
|
|
my $j=$i&0x7;
|
|
|
|
$code.=<<___ if ($i==4);
|
|
add @x[0],sp,#4*(16+8)
|
|
___
|
|
$code.=<<___ if ($i==8);
|
|
ldmia @x[0],{@x[0]-@x[7]} @ load second half
|
|
# ifdef __thumb2__
|
|
itt hi
|
|
# endif
|
|
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
|
|
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
|
|
___
|
|
$code.=<<___;
|
|
add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material
|
|
___
|
|
$code.=<<___ if ($i==12);
|
|
# ifdef __thumb2__
|
|
itt hi
|
|
# endif
|
|
addhi @t[0],@t[0],#1 @ next counter value
|
|
strhi @t[0],[sp,#4*(12)] @ save next counter value
|
|
___
|
|
$code.=<<___;
|
|
add @x[$j+1],@x[$j+1],@t[1]
|
|
add @x[$j+2],@x[$j+2],@t[2]
|
|
# ifdef __thumb2__
|
|
itete lo
|
|
# endif
|
|
eorlo @t[0],@t[0],@t[0] @ zero or ...
|
|
ldrhsb @t[0],[r12],#16 @ ... load input
|
|
eorlo @t[1],@t[1],@t[1]
|
|
ldrhsb @t[1],[r12,#-12]
|
|
|
|
add @x[$j+3],@x[$j+3],@t[3]
|
|
# ifdef __thumb2__
|
|
itete lo
|
|
# endif
|
|
eorlo @t[2],@t[2],@t[2]
|
|
ldrhsb @t[2],[r12,#-8]
|
|
eorlo @t[3],@t[3],@t[3]
|
|
ldrhsb @t[3],[r12,#-4]
|
|
|
|
eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
|
|
eor @x[$j+1],@t[1],@x[$j+1]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhsb @t[0],[r12,#-15] @ load more input
|
|
ldrhsb @t[1],[r12,#-11]
|
|
eor @x[$j+2],@t[2],@x[$j+2]
|
|
strb @x[$j+0],[r14],#16 @ store output
|
|
eor @x[$j+3],@t[3],@x[$j+3]
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhsb @t[2],[r12,#-7]
|
|
ldrhsb @t[3],[r12,#-3]
|
|
strb @x[$j+1],[r14,#-12]
|
|
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
|
|
strb @x[$j+2],[r14,#-8]
|
|
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhsb @t[0],[r12,#-14] @ load more input
|
|
ldrhsb @t[1],[r12,#-10]
|
|
strb @x[$j+3],[r14,#-4]
|
|
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
|
|
strb @x[$j+0],[r14,#-15]
|
|
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhsb @t[2],[r12,#-6]
|
|
ldrhsb @t[3],[r12,#-2]
|
|
strb @x[$j+1],[r14,#-11]
|
|
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
|
|
strb @x[$j+2],[r14,#-7]
|
|
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhsb @t[0],[r12,#-13] @ load more input
|
|
ldrhsb @t[1],[r12,#-9]
|
|
strb @x[$j+3],[r14,#-3]
|
|
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
|
|
strb @x[$j+0],[r14,#-14]
|
|
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
|
|
# ifdef __thumb2__
|
|
itt hs
|
|
# endif
|
|
ldrhsb @t[2],[r12,#-5]
|
|
ldrhsb @t[3],[r12,#-1]
|
|
strb @x[$j+1],[r14,#-10]
|
|
strb @x[$j+2],[r14,#-6]
|
|
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
|
|
strb @x[$j+3],[r14,#-2]
|
|
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
|
|
strb @x[$j+0],[r14,#-13]
|
|
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
|
|
strb @x[$j+1],[r14,#-9]
|
|
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
|
|
strb @x[$j+2],[r14,#-5]
|
|
strb @x[$j+3],[r14,#-1]
|
|
___
|
|
$code.=<<___ if ($i<12);
|
|
add @t[0],sp,#4*(4+$i)
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
# ifdef __thumb2__
|
|
it ne
|
|
# endif
|
|
ldrne @t[0],[sp,#4*(32+2)] @ re-load len
|
|
# ifdef __thumb2__
|
|
it hs
|
|
# endif
|
|
subhs @t[3],@t[0],#64 @ len-=64
|
|
bhi .Loop_outer
|
|
|
|
beq .Ldone
|
|
#endif
|
|
|
|
.Ltail:
|
|
ldr r12,[sp,#4*(32+1)] @ load inp
|
|
add @t[1],sp,#4*(0)
|
|
ldr r14,[sp,#4*(32+0)] @ load out
|
|
|
|
.Loop_tail:
|
|
ldrb @t[2],[@t[1]],#1 @ read buffer on stack
|
|
ldrb @t[3],[r12],#1 @ read input
|
|
subs @t[0],@t[0],#1
|
|
eor @t[3],@t[3],@t[2]
|
|
strb @t[3],[r14],#1 @ store output
|
|
bne .Loop_tail
|
|
|
|
.Ldone:
|
|
add sp,sp,#4*(32+3)
|
|
.Lno_data:
|
|
ldmia sp!,{r4-r11,pc}
|
|
.size ChaCha20_ctr32,.-ChaCha20_ctr32
|
|
___
|
|
|
|
{{{
|
|
# NEON register names: three sets of (a,b,c,d) rows, one per NEON "thread",
# followed by four temporaries; they map to q0..q15 in that order.
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map { 'q' . $_ } (0..15);

# Returns the list of perlasm call strings (to be eval'ed one by one) for one
# round on the rows held in ($a,$b,$c,$d), using $t as temporary.  The last
# argument is the parity flag; it is pasted into the two final vext_8 calls,
# where it picks the byte offsets once the string is eval'ed.
sub NEONROUND {
    my $odd = pop;
    my ($qa,$qb,$qc,$qd,$qt) = @_;

    # $dst = $src with each 32-bit lane rotated left by $n, done as a
    # right shift by 32-$n followed by shift-left-and-insert by $n.
    my $rotl = sub {
        my ($dst,$src,$n) = @_;
        ( "&vshr_u32 ($dst,$src," . (32 - $n) . ")",
          "&vsli_32 ($dst,$src,$n)" );
    };

    return (
        "&vadd_i32 ($qa,$qa,$qb)",
        "&veor ($qd,$qd,$qa)",
        "&vrev32_16 ($qd,$qd)",        # vrot ($d,16)

        "&vadd_i32 ($qc,$qc,$qd)",
        "&veor ($qt,$qb,$qc)",
        $rotl->($qb,$qt,12),

        "&vadd_i32 ($qa,$qa,$qb)",
        "&veor ($qt,$qd,$qa)",
        $rotl->($qd,$qt,8),

        "&vadd_i32 ($qc,$qc,$qd)",
        "&veor ($qt,$qb,$qc)",
        $rotl->($qb,$qt,7),

        # re-align the rows for the next round
        "&vext_8 ($qc,$qc,$qc,8)",
        "&vext_8 ($qb,$qb,$qb,$odd?12:4)",
        "&vext_8 ($qd,$qd,$qd,$odd?4:12)"
    );
}
|
|
|
|
$code.=<<___;
|
|
#if __ARM_MAX_ARCH__>=7
|
|
.arch armv7-a
|
|
.fpu neon
|
|
|
|
.type ChaCha20_neon,%function
|
|
.align 5
|
|
ChaCha20_neon:
|
|
ldr r12,[sp,#0] @ pull pointer to counter and nonce
|
|
stmdb sp!,{r0-r2,r4-r11,lr}
|
|
.LChaCha20_neon:
|
|
adr r14,.Lsigma
|
|
vstmdb sp!,{d8-d15} @ ABI spec says so
|
|
stmdb sp!,{r0-r3}
|
|
|
|
vld1.32 {$b0-$c0},[r3] @ load key
|
|
ldmia r3,{r4-r11} @ load key
|
|
|
|
sub sp,sp,#4*(16+16)
|
|
vld1.32 {$d0},[r12] @ load counter and nonce
|
|
add r12,sp,#4*8
|
|
ldmia r14,{r0-r3} @ load sigma
|
|
vld1.32 {$a0},[r14]! @ load sigma
|
|
vld1.32 {$t0},[r14] @ one
|
|
vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
|
|
vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
|
|
|
|
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
|
|
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
|
|
vshl.i32 $t1#lo,$t0#lo,#1 @ two
|
|
vstr $t0#lo,[sp,#4*(16+0)]
|
|
vshl.i32 $t2#lo,$t0#lo,#2 @ four
|
|
vstr $t1#lo,[sp,#4*(16+2)]
|
|
vmov $a1,$a0
|
|
vstr $t2#lo,[sp,#4*(16+4)]
|
|
vmov $a2,$a0
|
|
vmov $b1,$b0
|
|
vmov $b2,$b0
|
|
b .Loop_neon_enter
|
|
|
|
.align 4
|
|
.Loop_neon_outer:
|
|
ldmia sp,{r0-r9} @ load key material
|
|
cmp @t[3],#64*2 @ if len<=64*2
|
|
bls .Lbreak_neon @ switch to integer-only
|
|
vmov $a1,$a0
|
|
str @t[3],[sp,#4*(32+2)] @ save len
|
|
vmov $a2,$a0
|
|
str r12, [sp,#4*(32+1)] @ save inp
|
|
vmov $b1,$b0
|
|
str r14, [sp,#4*(32+0)] @ save out
|
|
vmov $b2,$b0
|
|
.Loop_neon_enter:
|
|
ldr @t[3], [sp,#4*(15)]
|
|
vadd.i32 $d1,$d0,$t0 @ counter+1
|
|
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
|
|
vmov $c1,$c0
|
|
ldr @t[2], [sp,#4*(13)]
|
|
vmov $c2,$c0
|
|
ldr @x[14],[sp,#4*(14)]
|
|
vadd.i32 $d2,$d1,$t0 @ counter+2
|
|
str @t[3], [sp,#4*(16+15)]
|
|
mov @t[3],#10
|
|
add @x[12],@x[12],#3 @ counter+3
|
|
b .Loop_neon
|
|
|
|
.align 4
|
|
.Loop_neon:
|
|
subs @t[3],@t[3],#1
|
|
___
|
|
# Body of .Loop_neon: an even round followed by an odd round, each computed
# by three NEON "threads" and one integer thread in parallel.  NEONROUND
# yields 18 call strings and ROUND yields 54, so the streams are interleaved
# as NEON0, int, NEON1, int, NEON2, int until all four are used up.
#
# Each string is evaluated (dispatching through AUTOLOAD, which appends the
# instruction to $code).  A string that fails to evaluate would otherwise
# silently drop an instruction from the generated code, so it is fatal.
foreach my $odd (0, 1) {
    my @neon0 = &NEONROUND($a0,$b0,$c0,$d0,$t0,$odd);
    my @neon1 = &NEONROUND($a1,$b1,$c1,$d1,$t1,$odd);
    my @neon2 = &NEONROUND($a2,$b2,$c2,$d2,$t2,$odd);
    my @ialu  = $odd ? &ROUND(0,5,10,15) : &ROUND(0,4,8,12);

    while (@neon0) {
        # list elements are evaluated left to right, which fixes the
        # order in which the integer stream is consumed
        foreach my $insn (shift(@neon0), shift(@ialu),
                          shift(@neon1), shift(@ialu),
                          shift(@neon2), shift(@ialu)) {
            eval $insn;
            die "failed to emit \"$insn\": $@" if $@;
        }
    }
}
|
|
$code.=<<___;
|
|
bne .Loop_neon
|
|
|
|
add @t[3],sp,#32
|
|
vld1.32 {$t0-$t1},[sp] @ load key material
|
|
vld1.32 {$t2-$t3},[@t[3]]
|
|
|
|
ldr @t[3],[sp,#4*(32+2)] @ load len
|
|
|
|
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
|
|
str @t[1], [sp,#4*(16+9)]
|
|
str @x[12],[sp,#4*(16+12)]
|
|
str @t[2], [sp,#4*(16+13)]
|
|
str @x[14],[sp,#4*(16+14)]
|
|
|
|
@ at this point we have first half of 512-bit result in
|
|
@ @x[0-7] and second half at sp+4*(16+8)
|
|
|
|
ldr r12,[sp,#4*(32+1)] @ load inp
|
|
ldr r14,[sp,#4*(32+0)] @ load out
|
|
|
|
vadd.i32 $a0,$a0,$t0 @ accumulate key material
|
|
vadd.i32 $a1,$a1,$t0
|
|
vadd.i32 $a2,$a2,$t0
|
|
vldr $t0#lo,[sp,#4*(16+0)] @ one
|
|
|
|
vadd.i32 $b0,$b0,$t1
|
|
vadd.i32 $b1,$b1,$t1
|
|
vadd.i32 $b2,$b2,$t1
|
|
vldr $t1#lo,[sp,#4*(16+2)] @ two
|
|
|
|
vadd.i32 $c0,$c0,$t2
|
|
vadd.i32 $c1,$c1,$t2
|
|
vadd.i32 $c2,$c2,$t2
|
|
vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
|
|
vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
|
|
|
|
vadd.i32 $d0,$d0,$t3
|
|
vadd.i32 $d1,$d1,$t3
|
|
vadd.i32 $d2,$d2,$t3
|
|
|
|
cmp @t[3],#64*4
|
|
blo .Ltail_neon
|
|
|
|
vld1.8 {$t0-$t1},[r12]! @ load input
|
|
mov @t[3],sp
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0 @ xor with input
|
|
veor $b0,$b0,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a1,$a1,$t0
|
|
vst1.8 {$a0-$b0},[r14]! @ store output
|
|
veor $b1,$b1,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c1,$c1,$t2
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
veor $d1,$d1,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a2,$a2,$t0
|
|
vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
|
|
veor $t0#hi,$t0#hi,$t0#hi
|
|
vldr $t0#lo,[sp,#4*(16+4)] @ four
|
|
veor $b2,$b2,$t1
|
|
vld1.32 {$c0-$d0},[@t[3]]
|
|
veor $c2,$c2,$t2
|
|
vst1.8 {$a1-$b1},[r14]!
|
|
veor $d2,$d2,$t3
|
|
vst1.8 {$c1-$d1},[r14]!
|
|
|
|
vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
|
|
vldr $t0#lo,[sp,#4*(16+0)] @ one
|
|
|
|
ldmia sp,{@t[0]-@t[3]} @ load key material
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
ldr @t[0],[r12],#16 @ load input
|
|
vst1.8 {$a2-$b2},[r14]!
|
|
add @x[1],@x[1],@t[1]
|
|
ldr @t[1],[r12,#-12]
|
|
vst1.8 {$c2-$d2},[r14]!
|
|
add @x[2],@x[2],@t[2]
|
|
ldr @t[2],[r12,#-8]
|
|
add @x[3],@x[3],@t[3]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
# endif
|
|
eor @x[0],@x[0],@t[0] @ xor with input
|
|
add @t[0],sp,#4*(4)
|
|
eor @x[1],@x[1],@t[1]
|
|
str @x[0],[r14],#16 @ store output
|
|
eor @x[2],@x[2],@t[2]
|
|
str @x[1],[r14,#-12]
|
|
eor @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[2],[r14,#-8]
|
|
str @x[3],[r14,#-4]
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
ldr @t[0],[r12],#16 @ load input
|
|
add @x[5],@x[5],@t[1]
|
|
ldr @t[1],[r12,#-12]
|
|
add @x[6],@x[6],@t[2]
|
|
ldr @t[2],[r12,#-8]
|
|
add @x[7],@x[7],@t[3]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
eor @x[4],@x[4],@t[0]
|
|
add @t[0],sp,#4*(8)
|
|
eor @x[5],@x[5],@t[1]
|
|
str @x[4],[r14],#16 @ store output
|
|
eor @x[6],@x[6],@t[2]
|
|
str @x[5],[r14,#-12]
|
|
eor @x[7],@x[7],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[6],[r14,#-8]
|
|
add @x[0],sp,#4*(16+8)
|
|
str @x[7],[r14,#-4]
|
|
|
|
ldmia @x[0],{@x[0]-@x[7]} @ load second half
|
|
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
ldr @t[0],[r12],#16 @ load input
|
|
add @x[1],@x[1],@t[1]
|
|
ldr @t[1],[r12,#-12]
|
|
# ifdef __thumb2__
|
|
it hi
|
|
# endif
|
|
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
|
|
add @x[2],@x[2],@t[2]
|
|
ldr @t[2],[r12,#-8]
|
|
# ifdef __thumb2__
|
|
it hi
|
|
# endif
|
|
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
|
|
add @x[3],@x[3],@t[3]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
# endif
|
|
eor @x[0],@x[0],@t[0]
|
|
add @t[0],sp,#4*(12)
|
|
eor @x[1],@x[1],@t[1]
|
|
str @x[0],[r14],#16 @ store output
|
|
eor @x[2],@x[2],@t[2]
|
|
str @x[1],[r14,#-12]
|
|
eor @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[2],[r14,#-8]
|
|
str @x[3],[r14,#-4]
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @t[0],@t[0],#4 @ next counter value
|
|
add @x[5],@x[5],@t[1]
|
|
str @t[0],[sp,#4*(12)] @ save next counter value
|
|
ldr @t[0],[r12],#16 @ load input
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[4],@x[4],#3 @ counter+3
|
|
ldr @t[1],[r12,#-12]
|
|
add @x[7],@x[7],@t[3]
|
|
ldr @t[2],[r12,#-8]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
eor @x[4],@x[4],@t[0]
|
|
# ifdef __thumb2__
|
|
it hi
|
|
# endif
|
|
ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
|
|
eor @x[5],@x[5],@t[1]
|
|
eor @x[6],@x[6],@t[2]
|
|
str @x[4],[r14],#16 @ store output
|
|
eor @x[7],@x[7],@t[3]
|
|
str @x[5],[r14,#-12]
|
|
sub @t[3],@t[0],#64*4 @ len-=64*4
|
|
str @x[6],[r14,#-8]
|
|
str @x[7],[r14,#-4]
|
|
bhi .Loop_neon_outer
|
|
|
|
b .Ldone_neon
|
|
|
|
.align 4
|
|
.Lbreak_neon:
|
|
@ harmonize NEON and integer-only stack frames: load data
|
|
@ from NEON frame, but save to integer-only one; distance
|
|
@ between the two is 4*(32+4+16-32)=4*(20).
|
|
|
|
str @t[3], [sp,#4*(20+32+2)] @ save len
|
|
add @t[3],sp,#4*(32+4)
|
|
str r12, [sp,#4*(20+32+1)] @ save inp
|
|
str r14, [sp,#4*(20+32+0)] @ save out
|
|
|
|
ldr @x[12],[sp,#4*(16+10)]
|
|
ldr @x[14],[sp,#4*(16+11)]
|
|
vldmia @t[3],{d8-d15} @ fulfill ABI requirement
|
|
str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
|
|
str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
|
|
|
|
ldr @t[3], [sp,#4*(15)]
|
|
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
|
|
ldr @t[2], [sp,#4*(13)]
|
|
ldr @x[14],[sp,#4*(14)]
|
|
str @t[3], [sp,#4*(20+16+15)]
|
|
add @t[3],sp,#4*(20)
|
|
vst1.32 {$a0-$b0},[@t[3]]! @ copy key
|
|
add sp,sp,#4*(20) @ switch frame
|
|
vst1.32 {$c0-$d0},[@t[3]]
|
|
mov @t[3],#10
|
|
b .Loop @ go integer-only
|
|
|
|
.align 4
|
|
.Ltail_neon:
|
|
cmp @t[3],#64*3
|
|
bhs .L192_or_more_neon
|
|
cmp @t[3],#64*2
|
|
bhs .L128_or_more_neon
|
|
cmp @t[3],#64*1
|
|
bhs .L64_or_more_neon
|
|
|
|
add @t[0],sp,#4*(8)
|
|
vst1.8 {$a0-$b0},[sp]
|
|
add @t[2],sp,#4*(0)
|
|
vst1.8 {$c0-$d0},[@t[0]]
|
|
b .Loop_tail_neon
|
|
|
|
.align 4
|
|
.L64_or_more_neon:
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0
|
|
veor $b0,$b0,$t1
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vst1.8 {$a0-$b0},[r14]!
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
|
|
beq .Ldone_neon
|
|
|
|
add @t[0],sp,#4*(8)
|
|
vst1.8 {$a1-$b1},[sp]
|
|
add @t[2],sp,#4*(0)
|
|
vst1.8 {$c1-$d1},[@t[0]]
|
|
sub @t[3],@t[3],#64*1 @ len-=64*1
|
|
b .Loop_tail_neon
|
|
|
|
.align 4
|
|
.L128_or_more_neon:
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0
|
|
veor $b0,$b0,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a1,$a1,$t0
|
|
veor $b1,$b1,$t1
|
|
vst1.8 {$a0-$b0},[r14]!
|
|
veor $c1,$c1,$t2
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
veor $d1,$d1,$t3
|
|
vst1.8 {$a1-$b1},[r14]!
|
|
vst1.8 {$c1-$d1},[r14]!
|
|
|
|
beq .Ldone_neon
|
|
|
|
add @t[0],sp,#4*(8)
|
|
vst1.8 {$a2-$b2},[sp]
|
|
add @t[2],sp,#4*(0)
|
|
vst1.8 {$c2-$d2},[@t[0]]
|
|
sub @t[3],@t[3],#64*2 @ len-=64*2
|
|
b .Loop_tail_neon
|
|
|
|
.align 4
|
|
.L192_or_more_neon:
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0
|
|
veor $b0,$b0,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a1,$a1,$t0
|
|
veor $b1,$b1,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c1,$c1,$t2
|
|
vst1.8 {$a0-$b0},[r14]!
|
|
veor $d1,$d1,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a2,$a2,$t0
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
veor $b2,$b2,$t1
|
|
vst1.8 {$a1-$b1},[r14]!
|
|
veor $c2,$c2,$t2
|
|
vst1.8 {$c1-$d1},[r14]!
|
|
veor $d2,$d2,$t3
|
|
vst1.8 {$a2-$b2},[r14]!
|
|
vst1.8 {$c2-$d2},[r14]!
|
|
|
|
beq .Ldone_neon
|
|
|
|
ldmia sp,{@t[0]-@t[3]} @ load key material
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(4)
|
|
add @x[1],@x[1],@t[1]
|
|
add @x[2],@x[2],@t[2]
|
|
add @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(8)
|
|
add @x[5],@x[5],@t[1]
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[7],@x[7],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
stmia sp,{@x[0]-@x[7]}
|
|
add @x[0],sp,#4*(16+8)
|
|
|
|
ldmia @x[0],{@x[0]-@x[7]} @ load second half
|
|
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(12)
|
|
add @x[1],@x[1],@t[1]
|
|
add @x[2],@x[2],@t[2]
|
|
add @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(8)
|
|
add @x[5],@x[5],@t[1]
|
|
add @x[4],@x[4],#3 @ counter+3
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[7],@x[7],@t[3]
|
|
ldr @t[3],[sp,#4*(32+2)] @ re-load len
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
stmia @t[0],{@x[0]-@x[7]}
|
|
add @t[2],sp,#4*(0)
|
|
sub @t[3],@t[3],#64*3 @ len-=64*3
|
|
|
|
.Loop_tail_neon:
|
|
ldrb @t[0],[@t[2]],#1 @ read buffer on stack
|
|
ldrb @t[1],[r12],#1 @ read input
|
|
subs @t[3],@t[3],#1
|
|
eor @t[0],@t[0],@t[1]
|
|
strb @t[0],[r14],#1 @ store ouput
|
|
bne .Loop_tail_neon
|
|
|
|
.Ldone_neon:
|
|
add sp,sp,#4*(32+4)
|
|
vldmia sp,{d8-d15}
|
|
add sp,sp,#4*(16+3)
|
|
ldmia sp!,{r4-r11,pc}
|
|
.size ChaCha20_neon,.-ChaCha20_neon
|
|
.comm OPENSSL_armcap_P,4,4
|
|
#endif
|
|
___
|
|
}}}
|
|
|
|
# Final pass: post-process the accumulated $code line by line and print it
# to STDOUT (the output file, or the pipe into arm-xlate.pl):
#   - text between backticks is replaced by the result of evaluating it as
#     a Perl expression;
#   - qN#lo / qN#hi (halves of a NEON q register) become d(2N) / d(2N+1).
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

    print $_,"\n";
}

# Buffered write errors, and a failing translator at the other end of the
# pipe, are only reported when the handle is closed; do not let either pass
# as a successful run that leaves a truncated or empty output file behind.
close STDOUT or die "error closing STDOUT: $!";
|