35be688078
This removes chacha_vec_arm.S and chacha_vec.c in favor of unifying on upstream's code. Upstream's is faster and this cuts down on the number of distinct codepaths. Our old scheme also didn't give vectorized code on Windows or aarch64. BoringSSL-specific modifications made to the assembly: - As usual, the shelling out to $CC is replaced with hardcoding $avx. I've tested up to the AVX2 codepath, so enable it all. - I've removed the AMD XOP code as I have not tested it. - As usual, the ARM file need the arm_arch.h include tweaked. Speed numbers follow. We can hope for further wins on these benchmarks after importing the Poly1305 assembly. x86 --- Old: Did 1422000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000433us (1421384.5 ops/sec): 22.7 MB/s Did 123000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1003803us (122534.0 ops/sec): 165.4 MB/s Did 22000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1000282us (21993.8 ops/sec): 180.2 MB/s Did 1428000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000214us (1427694.5 ops/sec): 22.8 MB/s Did 124000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1006332us (123219.8 ops/sec): 166.3 MB/s Did 22000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1020771us (21552.3 ops/sec): 176.6 MB/s New: Did 1520000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000567us (1519138.6 ops/sec): 24.3 MB/s Did 152000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1004216us (151361.9 ops/sec): 204.3 MB/s Did 31000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1009085us (30720.9 ops/sec): 251.7 MB/s Did 1797000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000141us (1796746.7 ops/sec): 28.7 MB/s Did 171000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1003204us (170453.9 ops/sec): 230.1 MB/s Did 31000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1005349us (30835.1 ops/sec): 252.6 MB/s x86_64, no AVX2 --- Old: Did 1782000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000204us 
(1781636.5 ops/sec): 28.5 MB/s Did 317000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001579us (316500.2 ops/sec): 427.3 MB/s Did 62000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1012146us (61256.0 ops/sec): 501.8 MB/s Did 1778000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000220us (1777608.9 ops/sec): 28.4 MB/s Did 315000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1002886us (314093.5 ops/sec): 424.0 MB/s Did 71000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1014606us (69977.9 ops/sec): 573.3 MB/s New: Did 1866000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000019us (1865964.5 ops/sec): 29.9 MB/s Did 399000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001017us (398594.6 ops/sec): 538.1 MB/s Did 84000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1005645us (83528.5 ops/sec): 684.3 MB/s Did 1881000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000325us (1880388.9 ops/sec): 30.1 MB/s Did 404000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1000004us (403998.4 ops/sec): 545.4 MB/s Did 85000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1010048us (84154.4 ops/sec): 689.4 MB/s x86_64, AVX2 --- Old: Did 2375000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000282us (2374330.4 ops/sec): 38.0 MB/s Did 448000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001865us (447166.0 ops/sec): 603.7 MB/s Did 88000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1005217us (87543.3 ops/sec): 717.2 MB/s Did 2409000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000188us (2408547.2 ops/sec): 38.5 MB/s Did 446000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1001003us (445553.1 ops/sec): 601.5 MB/s Did 90000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1006722us (89399.1 ops/sec): 732.4 MB/s New: Did 2622000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000266us (2621302.7 ops/sec): 41.9 MB/s Did 794000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1000783us (793378.8 
ops/sec): 1071.1 MB/s Did 173000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1000176us (172969.6 ops/sec): 1417.0 MB/s Did 2623000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000330us (2622134.7 ops/sec): 42.0 MB/s Did 783000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1000531us (782584.4 ops/sec): 1056.5 MB/s Did 174000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1000840us (173854.0 ops/sec): 1424.2 MB/s arm, Nexus 4 --- Old: Did 388550 ChaCha20-Poly1305 (16 bytes) seal operations in 1000580us (388324.8 ops/sec): 6.2 MB/s Did 90000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1003816us (89657.9 ops/sec): 121.0 MB/s Did 19000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1045750us (18168.8 ops/sec): 148.8 MB/s Did 398500 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000305us (398378.5 ops/sec): 6.4 MB/s Did 90500 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1000305us (90472.4 ops/sec): 122.1 MB/s Did 19000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1043278us (18211.8 ops/sec): 149.2 MB/s New: Did 424788 ChaCha20-Poly1305 (16 bytes) seal operations in 1000641us (424515.9 ops/sec): 6.8 MB/s Did 115000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001526us (114824.8 ops/sec): 155.0 MB/s Did 27000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1033023us (26136.9 ops/sec): 214.1 MB/s Did 447750 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000549us (447504.3 ops/sec): 7.2 MB/s Did 117500 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1001923us (117274.5 ops/sec): 158.3 MB/s Did 27000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1025118us (26338.4 ops/sec): 215.8 MB/s aarch64, Nexus 6p (Note we didn't have aarch64 assembly before at all, and still don't have it for Poly1305. Hopefully once that's added this will be faster than the arm numbers...) 
--- Old: Did 145040 ChaCha20-Poly1305 (16 bytes) seal operations in 1003065us (144596.8 ops/sec): 2.3 MB/s Did 14000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1042605us (13427.9 ops/sec): 18.1 MB/s Did 2618 ChaCha20-Poly1305 (8192 bytes) seal operations in 1093241us (2394.7 ops/sec): 19.6 MB/s Did 148000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000709us (147895.1 ops/sec): 2.4 MB/s Did 14000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1047294us (13367.8 ops/sec): 18.0 MB/s Did 2607 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1090745us (2390.1 ops/sec): 19.6 MB/s New: Did 358000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000769us (357724.9 ops/sec): 5.7 MB/s Did 45000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1021267us (44062.9 ops/sec): 59.5 MB/s Did 8591 ChaCha20-Poly1305 (8192 bytes) seal operations in 1047136us (8204.3 ops/sec): 67.2 MB/s Did 343000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000489us (342832.4 ops/sec): 5.5 MB/s Did 44000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1008326us (43636.7 ops/sec): 58.9 MB/s Did 8866 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1083341us (8183.9 ops/sec): 67.0 MB/s Change-Id: I629fe195d072f2c99e8f947578fad6d70823c4c8 Reviewed-on: https://boringssl-review.googlesource.com/7202 Reviewed-by: Adam Langley <agl@google.com>
1152 lines
27 KiB
Raku
Executable File
1152 lines
27 KiB
Raku
Executable File
#!/usr/bin/env perl
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
# December 2014
|
|
#
|
|
# ChaCha20 for ARMv4.
|
|
#
|
|
# Performance in cycles per byte out of large buffer.
|
|
#
|
|
# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
|
|
#
|
|
# Cortex-A5 19.3(*)/+95% 21.8 14.1
|
|
# Cortex-A8 10.5(*)/+160% 13.9 6.35
|
|
# Cortex-A9 12.9(**)/+110% 14.3 6.50
|
|
# Cortex-A15 11.0/+40% 16.0 5.00
|
|
# Snapdragon S4 11.5/+125% 13.6 4.90
|
|
#
|
|
# (*) most "favourable" result for aligned data on little-endian
|
|
# processor, result for misaligned data is 10-15% lower;
|
|
# (**) this result is a trade-off: it can be improved by 20%,
|
|
# but then Snapdragon S4 and Cortex-A8 results get
|
|
# 20-25% worse;
|
|
|
|
# Command line: [flavour] output-file
#
# If the first argument already looks like a file name (word characters,
# optional dashes, a dot and an extension) it is taken as the output file and
# no flavour is set.  Otherwise the remaining arguments are scanned for the
# first one that looks like a file name.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then in ../../perlasm/,
    # and pipe everything written to STDOUT through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failed open would otherwise leave STDOUT closed and the generated
    # code would be lost without any diagnostic.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation requested: write straight to the output file.
    open STDOUT,">$output"
        or die "can't open $output: $!";
}
# With no output file name at all, STDOUT is left untouched.
|
|
|
|
# Catch-all for undefined subs, so that &add(...), &vadd_i32(...) etc. each
# append one instruction line to $code.
#  - the sub name (package prefix stripped) is the mnemonic; its first
#    underscore becomes a dot, e.g. vadd_i32 -> vadd.i32;
#  - if the last argument is a plain number it is prefixed with '#';
#  - operands are joined with commas.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;
    $mnemonic =~ s/_/\./;

    my $last = pop;
    if ($last*1 eq $last) { $last = "#$last"; }

    my $operands = join(',',@_,$last);
    $code .= "\t$mnemonic\t".$operands."\n";
}
|
|
|
|
# Register name for each of the 16 state words as used by the integer code.
# Indices 0..7, 12 and 14 yield r0..r7, r12 and r14; every other index yields
# the placeholder "rx".
my @x = map { "r$_" } (0..7, ("x") x 4, 12, "x", 14, "x");
# Four scratch register names, r8..r11.
my @t = map { "r$_" } (8..11);
|
|
|
|
# ROUND(a0,b0,c0,d0) returns a list of perlasm statements (strings to be
# eval'ed) for one full integer round, i.e. four quarter-rounds processed as
# two interleaved pairs.  The arguments are the state-word indices of the
# first quarter-round; the remaining three are obtained by stepping each
# index to the next position inside its group of four:
#
#	 a   b   c   d
#
#	 0   4   8  12	< even round
#	 1   5   9  13
#	 2   6  10  14
#	 3   7  11  15
#	 0   5  10  15	< odd round
#	 1   6  11  12
#	 2   7   8  13
#	 3   4   9  14
#
# The 'a' and 'b' words always live in registers (@x[0..7]).  The 'c' words
# and one pair of 'd' words live in memory at sp+4*(16+index) and are cycled
# through scratch registers.  The pairing of 'c' words does not change
# between rounds, so they are stored and reloaded once, in the middle of the
# round.  Words 13 and 15 are the 'd' words kept in memory because they are
# reused by the next pair of rounds.
sub ROUND {
    # Index tables: element 0 is the caller's quarter-round, 1..3 follow.
    my (@ai,@bi,@ci,@di);
    ($ai[0],$bi[0],$ci[0],$di[0]) = @_;
    for my $k (1..3) {
	($ai[$k],$bi[$k],$ci[$k],$di[$k]) =
	    map { ($_ & ~3) + (($_ + 1) & 3) }
		($ai[$k-1],$bi[$k-1],$ci[$k-1],$di[$k-1]);
    }
    my $odd = $di[0] & 1;

    my @ra = @x[@ai];			# registers holding the 'a' words
    my @rb = @x[@bi];			# registers holding the 'b' words
    my ($cA,$cB) = @t[0,1];		# scratch registers for the 'c' pair
    my ($dA,$dB) = $odd ? ($t[2],$x[$di[1]]) : ($x[$di[0]],$t[2]);

    my @insns = (
	# first pair of quarter-rounds
	"&add ($ra[0],$ra[0],$rb[0])",
	"&mov ($dA,$dA,'ror#16')",
	"&add ($ra[1],$ra[1],$rb[1])",
	"&mov ($dB,$dB,'ror#16')",
	"&eor ($dA,$dA,$ra[0],'ror#16')",
	"&eor ($dB,$dB,$ra[1],'ror#16')",

	"&add ($cA,$cA,$dA)",
	"&mov ($rb[0],$rb[0],'ror#20')",
	"&add ($cB,$cB,$dB)",
	"&mov ($rb[1],$rb[1],'ror#20')",
	"&eor ($rb[0],$rb[0],$cA,'ror#20')",
	"&eor ($rb[1],$rb[1],$cB,'ror#20')",

	"&add ($ra[0],$ra[0],$rb[0])",
	"&mov ($dA,$dA,'ror#24')",
	"&add ($ra[1],$ra[1],$rb[1])",
	"&mov ($dB,$dB,'ror#24')",
	"&eor ($dA,$dA,$ra[0],'ror#24')",
	"&eor ($dB,$dB,$ra[1],'ror#24')",

	"&add ($cA,$cA,$dA)",
	"&mov ($rb[0],$rb[0],'ror#25')",
    );

    # Swap the memory-resident 'd' word held in the scratch register:
    # in an odd round it is the first of the pair ...
    if ($odd) {
	push @insns,
	    "&str ($dA,'[sp,#4*(16+$di[0])]')",
	    "&ldr ($dA,'[sp,#4*(16+$di[2])]')";
    }
    push @insns,
	"&add ($cB,$cB,$dB)",
	"&mov ($rb[1],$rb[1],'ror#25')";
    # ... and in an even round it is the second.
    unless ($odd) {
	push @insns,
	    "&str ($dB,'[sp,#4*(16+$di[1])]')",
	    "&ldr ($dB,'[sp,#4*(16+$di[3])]')";
    }
    push @insns,
	"&eor ($rb[0],$rb[0],$cA,'ror#25')",
	"&eor ($rb[1],$rb[1],$cB,'ror#25')";

    # The other 'd' word of the second pair is register-resident.
    if ($odd)	{ $dB = $x[$di[3]]; }
    else	{ $dA = $x[$di[2]]; }

    push @insns, (
	# second pair of quarter-rounds; the 'c' pair is exchanged first
	"&str ($cA,'[sp,#4*(16+$ci[0])]')",
	"&ldr ($cA,'[sp,#4*(16+$ci[2])]')",
	"&add ($ra[2],$ra[2],$rb[2])",
	"&mov ($dA,$dA,'ror#16')",
	"&str ($cB,'[sp,#4*(16+$ci[1])]')",
	"&ldr ($cB,'[sp,#4*(16+$ci[3])]')",
	"&add ($ra[3],$ra[3],$rb[3])",
	"&mov ($dB,$dB,'ror#16')",
	"&eor ($dA,$dA,$ra[2],'ror#16')",
	"&eor ($dB,$dB,$ra[3],'ror#16')",

	"&add ($cA,$cA,$dA)",
	"&mov ($rb[2],$rb[2],'ror#20')",
	"&add ($cB,$cB,$dB)",
	"&mov ($rb[3],$rb[3],'ror#20')",
	"&eor ($rb[2],$rb[2],$cA,'ror#20')",
	"&eor ($rb[3],$rb[3],$cB,'ror#20')",

	"&add ($ra[2],$ra[2],$rb[2])",
	"&mov ($dA,$dA,'ror#24')",
	"&add ($ra[3],$ra[3],$rb[3])",
	"&mov ($dB,$dB,'ror#24')",
	"&eor ($dA,$dA,$ra[2],'ror#24')",
	"&eor ($dB,$dB,$ra[3],'ror#24')",

	"&add ($cA,$cA,$dA)",
	"&mov ($rb[2],$rb[2],'ror#25')",
	"&add ($cB,$cB,$dB)",
	"&mov ($rb[3],$rb[3],'ror#25')",
	"&eor ($rb[2],$rb[2],$cA,'ror#25')",
	"&eor ($rb[3],$rb[3],$cB,'ror#25')",
    );

    @insns;
}
|
|
|
|
# Emit the constant pool and the entry sequence of the integer-only
# ChaCha20_ctr32, up to and including the head of the round loop (.Loop).
#
# As used by the emitted code: r2 is compared against 0 and 192 as the
# length, r3 is the base the key is loaded from, and the first stack word
# is a pointer to counter and nonce.  r0-r2 are pushed on entry and later
# referred to as out, inp and len.
#
# Stack frame once set up (offsets from sp):
#	4*(0..15)	sigma | key | counter | nonce
#	4*(16..31)	off-load area for state words not kept in registers
#	4*(32+0..2)	saved out, inp, len
#
# .Loop_outer is re-entered once per 64-byte block; .Loop runs 10 times.
$code.=<<___;
#include <openssl/arm_arch.h>

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb ldrbhs
#endif

.align 5
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
.Lone:
.long 1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word -1
#endif

.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
.LChaCha20_ctr32:
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r14,pc,#16 @ ChaCha20_ctr32
#else
adr r14,.LChaCha20_ctr32
#endif
cmp r2,#0 @ len==0?
#ifdef __thumb2__
itt eq
#endif
addeq sp,sp,#4*3
beq .Lno_data
#if __ARM_MAX_ARCH__>=7
cmp r2,#192 @ test len
bls .Lshort
ldr r4,[r14,#-32]
ldr r4,[r14,r4]
# ifdef __APPLE__
ldr r4,[r4]
# endif
tst r4,#ARMV7_NEON
bne .LChaCha20_neon
.Lshort:
#endif
ldmia r12,{r4-r7} @ load counter and nonce
sub sp,sp,#4*(16) @ off-load area
sub r14,r14,#64 @ .Lsigma
stmdb sp!,{r4-r7} @ copy counter and nonce
ldmia r3,{r4-r11} @ load key
ldmia r14,{r0-r3} @ load sigma
stmdb sp!,{r4-r11} @ copy key
stmdb sp!,{r0-r3} @ copy sigma
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
b .Loop_outer_enter

.align 4
.Loop_outer:
ldmia sp,{r0-r9} @ load key material
str @t[3],[sp,#4*(32+2)] @ save len
str r12, [sp,#4*(32+1)] @ save inp
str r14, [sp,#4*(32+0)] @ save out
.Loop_outer_enter:
ldr @t[3], [sp,#4*(15)]
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
ldr @t[2], [sp,#4*(13)]
ldr @x[14],[sp,#4*(14)]
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
b .Loop

.align 4
.Loop:
subs @t[3],@t[3],#1
___
|
|
# Body of .Loop: one even round followed by one odd round.  Each string
# returned by ROUND is a perlasm call that appends one instruction to $code.
for my $insn (&ROUND(0, 4, 8,12)) { eval $insn; }
for my $insn (&ROUND(0, 5,10,15)) { eval $insn; }
|
|
# Emit the code that follows the round loop: spill the register-resident
# half of the second state half, then add the key material to the state, xor
# with the input and store the output one 16-byte quarter at a time using
# word loads and stores.
#
# Flags from "cmp len,#64" steer the whole sequence: when len is below 64
# both inp and out are pointed at the stack buffer (sp+0) and no input is
# xor-ed in, so the raw key stream lands on the stack for .Ltail to consume;
# the counter and the saved "@x[10]"/"@x[11]" copies are only updated when
# len is above 64, i.e. when another block follows.
#
# For __ARM_ARCH__<7 the pointers are first tested for 4-byte alignment and
# .Lunaligned is taken otherwise.  The heredoc ends inside the
# "#if __ARM_ARCH__<7" region that opens the byte-wise path.
$code.=<<___;
bne .Loop

ldr @t[3],[sp,#4*(32+2)] @ load len

str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
str @t[1], [sp,#4*(16+9)]
str @x[12],[sp,#4*(16+12)]
str @t[2], [sp,#4*(16+13)]
str @x[14],[sp,#4*(16+14)]

@ at this point we have first half of 512-bit result in
@ @x[0-7] and second half at sp+4*(16+8)

cmp @t[3],#64 @ done yet?
#ifdef __thumb2__
itete lo
#endif
addlo r12,sp,#4*(0) @ shortcut or ...
ldrhs r12,[sp,#4*(32+1)] @ ... load inp
addlo r14,sp,#4*(0) @ shortcut or ...
ldrhs r14,[sp,#4*(32+0)] @ ... load out

ldr @t[0],[sp,#4*(0)] @ load key material
ldr @t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
orr @t[2],r12,r14
tst @t[2],#3 @ are input and output aligned?
ldr @t[2],[sp,#4*(2)]
bne .Lunaligned
cmp @t[3],#64 @ restore flags
# else
ldr @t[2],[sp,#4*(2)]
# endif
ldr @t[3],[sp,#4*(3)]

add @x[0],@x[0],@t[0] @ accumulate key material
add @x[1],@x[1],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]

add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[0],@x[0],@t[0] @ xor with input
eorhs @x[1],@x[1],@t[1]
add @t[0],sp,#4*(4)
str @x[0],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[2],@x[2],@t[2]
eorhs @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[1],[r14,#-12]
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]

add @x[4],@x[4],@t[0] @ accumulate key material
add @x[5],@x[5],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
add @t[0],sp,#4*(8)
str @x[4],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[6],@x[6],@t[2]
eorhs @x[7],@x[7],@t[3]
str @x[5],[r14,#-12]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[6],[r14,#-8]
add @x[0],sp,#4*(16+8)
str @x[7],[r14,#-4]

ldmia @x[0],{@x[0]-@x[7]} @ load second half

add @x[0],@x[0],@t[0] @ accumulate key material
add @x[1],@x[1],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
# ifdef __thumb2__
itt hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[0],@x[0],@t[0]
eorhs @x[1],@x[1],@t[1]
add @t[0],sp,#4*(12)
str @x[0],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[2],@x[2],@t[2]
eorhs @x[3],@x[3],@t[3]
str @x[1],[r14,#-12]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]

add @x[4],@x[4],@t[0] @ accumulate key material
add @x[5],@x[5],@t[1]
# ifdef __thumb2__
itt hi
# endif
addhi @t[0],@t[0],#1 @ next counter value
strhi @t[0],[sp,#4*(12)] @ save next counter value
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
# ifdef __thumb2__
it ne
# endif
ldrne @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
itt hs
# endif
eorhs @x[6],@x[6],@t[2]
eorhs @x[7],@x[7],@t[3]
str @x[4],[r14],#16 @ store output
str @x[5],[r14,#-12]
# ifdef __thumb2__
it hs
# endif
subhs @t[3],@t[0],#64 @ len-=64
str @x[6],[r14,#-8]
str @x[7],[r14,#-4]
bhi .Loop_outer

beq .Ldone
# if __ARM_ARCH__<7
b .Ltail

.align 4
.Lunaligned: @ unaligned endian-neutral path
cmp @t[3],#64 @ restore flags
# endif
#endif
#if __ARM_ARCH__<7
ldr @t[3],[sp,#4*(3)]
___
|
|
# Byte-wise ("endian-neutral") output path, emitted inside the
# "#if __ARM_ARCH__<7" region.  Each pass handles four state words (16 bytes):
# add key material, xor with input bytes (or with zero when len<64) and store
# the result a byte at a time.
#
# $i is the index of the first state word of the pass; $j is that index
# within the eight register-resident words, since the second half of the
# state is reloaded into @x[0..7] before the third pass.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

# Second pass: prepare the address of the spilled second half.
if ($i==4) {
$code.=<<___;
add @x[0],sp,#4*(16+8)
___
}
# Third pass: bring the second half into registers.
if ($i==8) {
$code.=<<___;
ldmia @x[0],{@x[0]-@x[7]} @ load second half
# ifdef __thumb2__
itt hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
___
}
$code.=<<___;
add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material
___
# Last pass: @t[0] holds the counter word; bump it for the next block.
if ($i==12) {
$code.=<<___;
# ifdef __thumb2__
itt hi
# endif
addhi @t[0],@t[0],#1 @ next counter value
strhi @t[0],[sp,#4*(12)] @ save next counter value
___
}
$code.=<<___;
add @x[$j+1],@x[$j+1],@t[1]
add @x[$j+2],@x[$j+2],@t[2]
# ifdef __thumb2__
itete lo
# endif
eorlo @t[0],@t[0],@t[0] @ zero or ...
ldrhsb @t[0],[r12],#16 @ ... load input
eorlo @t[1],@t[1],@t[1]
ldrhsb @t[1],[r12,#-12]

add @x[$j+3],@x[$j+3],@t[3]
# ifdef __thumb2__
itete lo
# endif
eorlo @t[2],@t[2],@t[2]
ldrhsb @t[2],[r12,#-8]
eorlo @t[3],@t[3],@t[3]
ldrhsb @t[3],[r12,#-4]

eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
eor @x[$j+1],@t[1],@x[$j+1]
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-15] @ load more input
ldrhsb @t[1],[r12,#-11]
eor @x[$j+2],@t[2],@x[$j+2]
strb @x[$j+0],[r14],#16 @ store output
eor @x[$j+3],@t[3],@x[$j+3]
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-7]
ldrhsb @t[3],[r12,#-3]
strb @x[$j+1],[r14,#-12]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+2],[r14,#-8]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-14] @ load more input
ldrhsb @t[1],[r12,#-10]
strb @x[$j+3],[r14,#-4]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+0],[r14,#-15]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-6]
ldrhsb @t[3],[r12,#-2]
strb @x[$j+1],[r14,#-11]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+2],[r14,#-7]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-13] @ load more input
ldrhsb @t[1],[r12,#-9]
strb @x[$j+3],[r14,#-3]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+0],[r14,#-14]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-5]
ldrhsb @t[3],[r12,#-1]
strb @x[$j+1],[r14,#-10]
strb @x[$j+2],[r14,#-6]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+3],[r14,#-2]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
strb @x[$j+0],[r14,#-13]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+1],[r14,#-9]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
strb @x[$j+2],[r14,#-5]
strb @x[$j+3],[r14,#-1]
___
# All but the last pass: fetch key material for the next four words.
if ($i<12) {
$code.=<<___;
add @t[0],sp,#4*(4+$i)
ldmia @t[0],{@t[0]-@t[3]} @ load key material
___
}
}
|
|
# Emit the end of the byte-wise path (closing its "#if __ARM_ARCH__<7"
# region), the tail loop and the function epilogue.
#
# .Ltail/.Loop_tail xor the remaining input bytes with the key-stream block
# read from the bottom of the stack frame (sp+0), one byte per iteration,
# with the remaining length in @t[0].  .Ldone releases the 4*(32+3)-byte
# frame; .Lno_data restores r4-r11 and returns by loading pc.
$code.=<<___;
# ifdef __thumb2__
it ne
# endif
ldrne @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
it hs
# endif
subhs @t[3],@t[0],#64 @ len-=64
bhi .Loop_outer

beq .Ldone
#endif

.Ltail:
ldr r12,[sp,#4*(32+1)] @ load inp
add @t[1],sp,#4*(0)
ldr r14,[sp,#4*(32+0)] @ load out

.Loop_tail:
ldrb @t[2],[@t[1]],#1 @ read buffer on stack
ldrb @t[3],[r12],#1 @ read input
subs @t[0],@t[0],#1
eor @t[3],@t[3],@t[2]
strb @t[3],[r14],#1 @ store output
bne .Loop_tail

.Ldone:
add sp,sp,#4*(32+3)
.Lno_data:
ldmia sp!,{r4-r11,pc}
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___
|
|
|
|
{{{
|
|
# NEON register names q0..q15: three sets of four state rows (a,b,c,d for
# "threads" 0, 1 and 2) followed by four temporaries.
my ($a0,$b0,$c0,$d0,
    $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2,
    $t0,$t1,$t2,$t3) = map { "q$_" } (0..15);
|
|
|
|
# NEONROUND(a,b,c,d,t,odd) returns the perlasm statements (strings to be
# eval'ed) for one vector round on the four row registers a,b,c,d, using t
# as temporary.
#
# The 16-bit rotate is done with vrev32.16; the other rotates are built from
# a right shift (vshr.u32) into the destination followed by a
# shift-left-and-insert (vsli.32) of the same source.  The three trailing
# vext.8 instructions rotate rows b, c and d by whole 32-bit lanes; the
# amounts for b and d are swapped depending on $odd.  The "$odd?x:y"
# expression is left in the generated string and resolved when it is eval'ed.
sub NEONROUND {
    my $odd = pop;
    my ($qa,$qb,$qc,$qd,$tmp) = @_;

    my @insns = (
	"&vadd_i32 ($qa,$qa,$qb)",
	"&veor ($qd,$qd,$qa)",
	"&vrev32_16 ($qd,$qd)",		# vrot ($d,16)

	"&vadd_i32 ($qc,$qc,$qd)",
	"&veor ($tmp,$qb,$qc)",
	"&vshr_u32 ($qb,$tmp,20)",
	"&vsli_32 ($qb,$tmp,12)",

	"&vadd_i32 ($qa,$qa,$qb)",
	"&veor ($tmp,$qd,$qa)",
	"&vshr_u32 ($qd,$tmp,24)",
	"&vsli_32 ($qd,$tmp,8)",

	"&vadd_i32 ($qc,$qc,$qd)",
	"&veor ($tmp,$qb,$qc)",
	"&vshr_u32 ($qb,$tmp,25)",
	"&vsli_32 ($qb,$tmp,7)",

	"&vext_8 ($qc,$qc,$qc,8)",
	"&vext_8 ($qb,$qb,$qb,$odd?12:4)",
	"&vext_8 ($qd,$qd,$qd,$odd?4:12)",
    );

    return @insns;
}
|
|
|
|
$code.=<<___;
|
|
#if __ARM_MAX_ARCH__>=7
|
|
.arch armv7-a
|
|
.fpu neon
|
|
|
|
.type ChaCha20_neon,%function
|
|
.align 5
|
|
ChaCha20_neon:
|
|
ldr r12,[sp,#0] @ pull pointer to counter and nonce
|
|
stmdb sp!,{r0-r2,r4-r11,lr}
|
|
.LChaCha20_neon:
|
|
adr r14,.Lsigma
|
|
vstmdb sp!,{d8-d15} @ ABI spec says so
|
|
stmdb sp!,{r0-r3}
|
|
|
|
vld1.32 {$b0-$c0},[r3] @ load key
|
|
ldmia r3,{r4-r11} @ load key
|
|
|
|
sub sp,sp,#4*(16+16)
|
|
vld1.32 {$d0},[r12] @ load counter and nonce
|
|
add r12,sp,#4*8
|
|
ldmia r14,{r0-r3} @ load sigma
|
|
vld1.32 {$a0},[r14]! @ load sigma
|
|
vld1.32 {$t0},[r14] @ one
|
|
vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
|
|
vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
|
|
|
|
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
|
|
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
|
|
vshl.i32 $t1#lo,$t0#lo,#1 @ two
|
|
vstr $t0#lo,[sp,#4*(16+0)]
|
|
vshl.i32 $t2#lo,$t0#lo,#2 @ four
|
|
vstr $t1#lo,[sp,#4*(16+2)]
|
|
vmov $a1,$a0
|
|
vstr $t2#lo,[sp,#4*(16+4)]
|
|
vmov $a2,$a0
|
|
vmov $b1,$b0
|
|
vmov $b2,$b0
|
|
b .Loop_neon_enter
|
|
|
|
.align 4
|
|
.Loop_neon_outer:
|
|
ldmia sp,{r0-r9} @ load key material
|
|
cmp @t[3],#64*2 @ if len<=64*2
|
|
bls .Lbreak_neon @ switch to integer-only
|
|
vmov $a1,$a0
|
|
str @t[3],[sp,#4*(32+2)] @ save len
|
|
vmov $a2,$a0
|
|
str r12, [sp,#4*(32+1)] @ save inp
|
|
vmov $b1,$b0
|
|
str r14, [sp,#4*(32+0)] @ save out
|
|
vmov $b2,$b0
|
|
.Loop_neon_enter:
|
|
ldr @t[3], [sp,#4*(15)]
|
|
vadd.i32 $d1,$d0,$t0 @ counter+1
|
|
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
|
|
vmov $c1,$c0
|
|
ldr @t[2], [sp,#4*(13)]
|
|
vmov $c2,$c0
|
|
ldr @x[14],[sp,#4*(14)]
|
|
vadd.i32 $d2,$d1,$t0 @ counter+2
|
|
str @t[3], [sp,#4*(16+15)]
|
|
mov @t[3],#10
|
|
add @x[12],@x[12],#3 @ counter+3
|
|
b .Loop_neon
|
|
|
|
.align 4
|
|
.Loop_neon:
|
|
subs @t[3],@t[3],#1
|
|
___
|
|
my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
|
|
my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
|
|
my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
|
|
my @thread3=&ROUND(0,4,8,12);
|
|
|
|
foreach (@thread0) {
|
|
eval; eval(shift(@thread3));
|
|
eval(shift(@thread1)); eval(shift(@thread3));
|
|
eval(shift(@thread2)); eval(shift(@thread3));
|
|
}
|
|
|
|
@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
|
|
@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
|
|
@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
|
|
@thread3=&ROUND(0,5,10,15);
|
|
|
|
foreach (@thread0) {
|
|
eval; eval(shift(@thread3));
|
|
eval(shift(@thread1)); eval(shift(@thread3));
|
|
eval(shift(@thread2)); eval(shift(@thread3));
|
|
}
|
|
$code.=<<___;
|
|
bne .Loop_neon
|
|
|
|
add @t[3],sp,#32
|
|
vld1.32 {$t0-$t1},[sp] @ load key material
|
|
vld1.32 {$t2-$t3},[@t[3]]
|
|
|
|
ldr @t[3],[sp,#4*(32+2)] @ load len
|
|
|
|
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
|
|
str @t[1], [sp,#4*(16+9)]
|
|
str @x[12],[sp,#4*(16+12)]
|
|
str @t[2], [sp,#4*(16+13)]
|
|
str @x[14],[sp,#4*(16+14)]
|
|
|
|
@ at this point we have first half of 512-bit result in
|
|
@ @x[0-7] and second half at sp+4*(16+8)
|
|
|
|
ldr r12,[sp,#4*(32+1)] @ load inp
|
|
ldr r14,[sp,#4*(32+0)] @ load out
|
|
|
|
vadd.i32 $a0,$a0,$t0 @ accumulate key material
|
|
vadd.i32 $a1,$a1,$t0
|
|
vadd.i32 $a2,$a2,$t0
|
|
vldr $t0#lo,[sp,#4*(16+0)] @ one
|
|
|
|
vadd.i32 $b0,$b0,$t1
|
|
vadd.i32 $b1,$b1,$t1
|
|
vadd.i32 $b2,$b2,$t1
|
|
vldr $t1#lo,[sp,#4*(16+2)] @ two
|
|
|
|
vadd.i32 $c0,$c0,$t2
|
|
vadd.i32 $c1,$c1,$t2
|
|
vadd.i32 $c2,$c2,$t2
|
|
vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
|
|
vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
|
|
|
|
vadd.i32 $d0,$d0,$t3
|
|
vadd.i32 $d1,$d1,$t3
|
|
vadd.i32 $d2,$d2,$t3
|
|
|
|
cmp @t[3],#64*4
|
|
blo .Ltail_neon
|
|
|
|
vld1.8 {$t0-$t1},[r12]! @ load input
|
|
mov @t[3],sp
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0 @ xor with input
|
|
veor $b0,$b0,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a1,$a1,$t0
|
|
vst1.8 {$a0-$b0},[r14]! @ store output
|
|
veor $b1,$b1,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c1,$c1,$t2
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
veor $d1,$d1,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a2,$a2,$t0
|
|
vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
|
|
veor $t0#hi,$t0#hi,$t0#hi
|
|
vldr $t0#lo,[sp,#4*(16+4)] @ four
|
|
veor $b2,$b2,$t1
|
|
vld1.32 {$c0-$d0},[@t[3]]
|
|
veor $c2,$c2,$t2
|
|
vst1.8 {$a1-$b1},[r14]!
|
|
veor $d2,$d2,$t3
|
|
vst1.8 {$c1-$d1},[r14]!
|
|
|
|
vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
|
|
vldr $t0#lo,[sp,#4*(16+0)] @ one
|
|
|
|
ldmia sp,{@t[0]-@t[3]} @ load key material
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
ldr @t[0],[r12],#16 @ load input
|
|
vst1.8 {$a2-$b2},[r14]!
|
|
add @x[1],@x[1],@t[1]
|
|
ldr @t[1],[r12,#-12]
|
|
vst1.8 {$c2-$d2},[r14]!
|
|
add @x[2],@x[2],@t[2]
|
|
ldr @t[2],[r12,#-8]
|
|
add @x[3],@x[3],@t[3]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
# endif
|
|
eor @x[0],@x[0],@t[0] @ xor with input
|
|
add @t[0],sp,#4*(4)
|
|
eor @x[1],@x[1],@t[1]
|
|
str @x[0],[r14],#16 @ store output
|
|
eor @x[2],@x[2],@t[2]
|
|
str @x[1],[r14,#-12]
|
|
eor @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[2],[r14,#-8]
|
|
str @x[3],[r14,#-4]
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
ldr @t[0],[r12],#16 @ load input
|
|
add @x[5],@x[5],@t[1]
|
|
ldr @t[1],[r12,#-12]
|
|
add @x[6],@x[6],@t[2]
|
|
ldr @t[2],[r12,#-8]
|
|
add @x[7],@x[7],@t[3]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
eor @x[4],@x[4],@t[0]
|
|
add @t[0],sp,#4*(8)
|
|
eor @x[5],@x[5],@t[1]
|
|
str @x[4],[r14],#16 @ store output
|
|
eor @x[6],@x[6],@t[2]
|
|
str @x[5],[r14,#-12]
|
|
eor @x[7],@x[7],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[6],[r14,#-8]
|
|
add @x[0],sp,#4*(16+8)
|
|
str @x[7],[r14,#-4]
|
|
|
|
ldmia @x[0],{@x[0]-@x[7]} @ load second half
|
|
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
ldr @t[0],[r12],#16 @ load input
|
|
add @x[1],@x[1],@t[1]
|
|
ldr @t[1],[r12,#-12]
|
|
# ifdef __thumb2__
|
|
it hi
|
|
# endif
|
|
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
|
|
add @x[2],@x[2],@t[2]
|
|
ldr @t[2],[r12,#-8]
|
|
# ifdef __thumb2__
|
|
it hi
|
|
# endif
|
|
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
|
|
add @x[3],@x[3],@t[3]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
# endif
|
|
eor @x[0],@x[0],@t[0]
|
|
add @t[0],sp,#4*(12)
|
|
eor @x[1],@x[1],@t[1]
|
|
str @x[0],[r14],#16 @ store output
|
|
eor @x[2],@x[2],@t[2]
|
|
str @x[1],[r14,#-12]
|
|
eor @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
str @x[2],[r14,#-8]
|
|
str @x[3],[r14,#-4]
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @t[0],@t[0],#4 @ next counter value
|
|
add @x[5],@x[5],@t[1]
|
|
str @t[0],[sp,#4*(12)] @ save next counter value
|
|
ldr @t[0],[r12],#16 @ load input
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[4],@x[4],#3 @ counter+3
|
|
ldr @t[1],[r12,#-12]
|
|
add @x[7],@x[7],@t[3]
|
|
ldr @t[2],[r12,#-8]
|
|
ldr @t[3],[r12,#-4]
|
|
# ifdef __ARMEB__
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
eor @x[4],@x[4],@t[0]
|
|
# ifdef __thumb2__
|
|
it hi
|
|
# endif
|
|
ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
|
|
eor @x[5],@x[5],@t[1]
|
|
eor @x[6],@x[6],@t[2]
|
|
str @x[4],[r14],#16 @ store output
|
|
eor @x[7],@x[7],@t[3]
|
|
str @x[5],[r14,#-12]
|
|
sub @t[3],@t[0],#64*4 @ len-=64*4
|
|
str @x[6],[r14,#-8]
|
|
str @x[7],[r14,#-4]
|
|
bhi .Loop_neon_outer
|
|
|
|
b .Ldone_neon
|
|
|
|
.align 4
|
|
.Lbreak_neon:
|
|
@ harmonize NEON and integer-only stack frames: load data
|
|
@ from NEON frame, but save to integer-only one; distance
|
|
@ between the two is 4*(32+4+16-32)=4*(20).
|
|
|
|
str @t[3], [sp,#4*(20+32+2)] @ save len
|
|
add @t[3],sp,#4*(32+4)
|
|
str r12, [sp,#4*(20+32+1)] @ save inp
|
|
str r14, [sp,#4*(20+32+0)] @ save out
|
|
|
|
ldr @x[12],[sp,#4*(16+10)]
|
|
ldr @x[14],[sp,#4*(16+11)]
|
|
vldmia @t[3],{d8-d15} @ fulfill ABI requirement
|
|
str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
|
|
str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
|
|
|
|
ldr @t[3], [sp,#4*(15)]
|
|
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
|
|
ldr @t[2], [sp,#4*(13)]
|
|
ldr @x[14],[sp,#4*(14)]
|
|
str @t[3], [sp,#4*(20+16+15)]
|
|
add @t[3],sp,#4*(20)
|
|
vst1.32 {$a0-$b0},[@t[3]]! @ copy key
|
|
add sp,sp,#4*(20) @ switch frame
|
|
vst1.32 {$c0-$d0},[@t[3]]
|
|
mov @t[3],#10
|
|
b .Loop @ go integer-only
|
|
|
|
.align 4
|
|
.Ltail_neon:
|
|
cmp @t[3],#64*3
|
|
bhs .L192_or_more_neon
|
|
cmp @t[3],#64*2
|
|
bhs .L128_or_more_neon
|
|
cmp @t[3],#64*1
|
|
bhs .L64_or_more_neon
|
|
|
|
add @t[0],sp,#4*(8)
|
|
vst1.8 {$a0-$b0},[sp]
|
|
add @t[2],sp,#4*(0)
|
|
vst1.8 {$c0-$d0},[@t[0]]
|
|
b .Loop_tail_neon
|
|
|
|
.align 4
|
|
.L64_or_more_neon:
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0
|
|
veor $b0,$b0,$t1
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vst1.8 {$a0-$b0},[r14]!
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
|
|
beq .Ldone_neon
|
|
|
|
add @t[0],sp,#4*(8)
|
|
vst1.8 {$a1-$b1},[sp]
|
|
add @t[2],sp,#4*(0)
|
|
vst1.8 {$c1-$d1},[@t[0]]
|
|
sub @t[3],@t[3],#64*1 @ len-=64*1
|
|
b .Loop_tail_neon
|
|
|
|
.align 4
|
|
.L128_or_more_neon:
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0
|
|
veor $b0,$b0,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a1,$a1,$t0
|
|
veor $b1,$b1,$t1
|
|
vst1.8 {$a0-$b0},[r14]!
|
|
veor $c1,$c1,$t2
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
veor $d1,$d1,$t3
|
|
vst1.8 {$a1-$b1},[r14]!
|
|
vst1.8 {$c1-$d1},[r14]!
|
|
|
|
beq .Ldone_neon
|
|
|
|
add @t[0],sp,#4*(8)
|
|
vst1.8 {$a2-$b2},[sp]
|
|
add @t[2],sp,#4*(0)
|
|
vst1.8 {$c2-$d2},[@t[0]]
|
|
sub @t[3],@t[3],#64*2 @ len-=64*2
|
|
b .Loop_tail_neon
|
|
|
|
.align 4
|
|
.L192_or_more_neon:
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
veor $a0,$a0,$t0
|
|
veor $b0,$b0,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c0,$c0,$t2
|
|
veor $d0,$d0,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a1,$a1,$t0
|
|
veor $b1,$b1,$t1
|
|
vld1.8 {$t0-$t1},[r12]!
|
|
veor $c1,$c1,$t2
|
|
vst1.8 {$a0-$b0},[r14]!
|
|
veor $d1,$d1,$t3
|
|
vld1.8 {$t2-$t3},[r12]!
|
|
|
|
veor $a2,$a2,$t0
|
|
vst1.8 {$c0-$d0},[r14]!
|
|
veor $b2,$b2,$t1
|
|
vst1.8 {$a1-$b1},[r14]!
|
|
veor $c2,$c2,$t2
|
|
vst1.8 {$c1-$d1},[r14]!
|
|
veor $d2,$d2,$t3
|
|
vst1.8 {$a2-$b2},[r14]!
|
|
vst1.8 {$c2-$d2},[r14]!
|
|
|
|
beq .Ldone_neon
|
|
|
|
ldmia sp,{@t[0]-@t[3]} @ load key material
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(4)
|
|
add @x[1],@x[1],@t[1]
|
|
add @x[2],@x[2],@t[2]
|
|
add @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(8)
|
|
add @x[5],@x[5],@t[1]
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[7],@x[7],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
stmia sp,{@x[0]-@x[7]}
|
|
add @x[0],sp,#4*(16+8)
|
|
|
|
ldmia @x[0],{@x[0]-@x[7]} @ load second half
|
|
|
|
add @x[0],@x[0],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(12)
|
|
add @x[1],@x[1],@t[1]
|
|
add @x[2],@x[2],@t[2]
|
|
add @x[3],@x[3],@t[3]
|
|
ldmia @t[0],{@t[0]-@t[3]} @ load key material
|
|
|
|
add @x[4],@x[4],@t[0] @ accumulate key material
|
|
add @t[0],sp,#4*(8)
|
|
add @x[5],@x[5],@t[1]
|
|
add @x[4],@x[4],#3 @ counter+3
|
|
add @x[6],@x[6],@t[2]
|
|
add @x[7],@x[7],@t[3]
|
|
ldr @t[3],[sp,#4*(32+2)] @ re-load len
|
|
# ifdef __ARMEB__
|
|
rev @x[0],@x[0]
|
|
rev @x[1],@x[1]
|
|
rev @x[2],@x[2]
|
|
rev @x[3],@x[3]
|
|
rev @x[4],@x[4]
|
|
rev @x[5],@x[5]
|
|
rev @x[6],@x[6]
|
|
rev @x[7],@x[7]
|
|
# endif
|
|
stmia @t[0],{@x[0]-@x[7]}
|
|
add @t[2],sp,#4*(0)
|
|
sub @t[3],@t[3],#64*3 @ len-=64*3
|
|
|
|
.Loop_tail_neon:
|
|
ldrb @t[0],[@t[2]],#1 @ read buffer on stack
|
|
ldrb @t[1],[r12],#1 @ read input
|
|
subs @t[3],@t[3],#1
|
|
eor @t[0],@t[0],@t[1]
|
|
strb @t[0],[r14],#1 @ store output
|
|
bne .Loop_tail_neon
|
|
|
|
.Ldone_neon:
|
|
add sp,sp,#4*(32+4)
|
|
vldmia sp,{d8-d15}
|
|
add sp,sp,#4*(16+3)
|
|
ldmia sp!,{r4-r11,pc}
|
|
.size ChaCha20_neon,.-ChaCha20_neon
|
|
.comm OPENSSL_armcap_P,4,4
|
|
#endif
|
|
___
|
|
}}}
|
|
|
|
# Post-process the accumulated perlasm text in $code one line at a time and
# write each resulting line to STDOUT.
foreach (split("\n",$code)) {
	# Replace every `...` span with the result of evaluating its contents
	# as a Perl expression (used for constant arithmetic in the template).
	s/\`([^\`]*)\`/eval $1/geo;

	# Rewrite the "qN#lo" / "qN#hi" notation as the register name
	# "d<2N>" / "d<2N+1>" respectively.
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}

# close() flushes any buffered output and returns false if that fails.
# Die in that case so the caller (presumably a build step capturing STDOUT
# into the generated .S file) sees a non-zero exit status instead of
# silently getting a truncated file.
close STDOUT or die "error closing STDOUT: $!";
|