Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

chacha-armv8.pl 26 KiB

Enable upstream's ChaCha20 assembly for x86 and ARM (32- and 64-bit). This removes chacha_vec_arm.S and chacha_vec.c in favor of unifying on upstream's code. Upstream's is faster and this cuts down on the number of distinct codepaths. Our old scheme also didn't give vectorized code on Windows or aarch64. BoringSSL-specific modifications made to the assembly: - As usual, the shelling out to $CC is replaced with hardcoding $avx. I've tested up to the AVX2 codepath, so enable it all. - I've removed the AMD XOP code as I have not tested it. - As usual, the ARM file need the arm_arch.h include tweaked. Speed numbers follow. We can hope for further wins on these benchmarks after importing the Poly1305 assembly. x86 --- Old: Did 1422000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000433us (1421384.5 ops/sec): 22.7 MB/s Did 123000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1003803us (122534.0 ops/sec): 165.4 MB/s Did 22000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1000282us (21993.8 ops/sec): 180.2 MB/s Did 1428000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000214us (1427694.5 ops/sec): 22.8 MB/s Did 124000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1006332us (123219.8 ops/sec): 166.3 MB/s Did 22000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1020771us (21552.3 ops/sec): 176.6 MB/s New: Did 1520000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000567us (1519138.6 ops/sec): 24.3 MB/s Did 152000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1004216us (151361.9 ops/sec): 204.3 MB/s Did 31000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1009085us (30720.9 ops/sec): 251.7 MB/s Did 1797000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000141us (1796746.7 ops/sec): 28.7 MB/s Did 171000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1003204us (170453.9 ops/sec): 230.1 MB/s Did 31000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1005349us (30835.1 ops/sec): 252.6 MB/s x86_64, no AVX2 --- Old: Did 
1782000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000204us (1781636.5 ops/sec): 28.5 MB/s Did 317000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001579us (316500.2 ops/sec): 427.3 MB/s Did 62000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1012146us (61256.0 ops/sec): 501.8 MB/s Did 1778000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000220us (1777608.9 ops/sec): 28.4 MB/s Did 315000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1002886us (314093.5 ops/sec): 424.0 MB/s Did 71000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1014606us (69977.9 ops/sec): 573.3 MB/s New: Did 1866000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000019us (1865964.5 ops/sec): 29.9 MB/s Did 399000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001017us (398594.6 ops/sec): 538.1 MB/s Did 84000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1005645us (83528.5 ops/sec): 684.3 MB/s Did 1881000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000325us (1880388.9 ops/sec): 30.1 MB/s Did 404000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1000004us (403998.4 ops/sec): 545.4 MB/s Did 85000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1010048us (84154.4 ops/sec): 689.4 MB/s x86_64, AVX2 --- Old: Did 2375000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000282us (2374330.4 ops/sec): 38.0 MB/s Did 448000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001865us (447166.0 ops/sec): 603.7 MB/s Did 88000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1005217us (87543.3 ops/sec): 717.2 MB/s Did 2409000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000188us (2408547.2 ops/sec): 38.5 MB/s Did 446000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1001003us (445553.1 ops/sec): 601.5 MB/s Did 90000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1006722us (89399.1 ops/sec): 732.4 MB/s New: Did 2622000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000266us (2621302.7 ops/sec): 41.9 MB/s Did 794000 
ChaCha20-Poly1305 (1350 bytes) seal operations in 1000783us (793378.8 ops/sec): 1071.1 MB/s Did 173000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1000176us (172969.6 ops/sec): 1417.0 MB/s Did 2623000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000330us (2622134.7 ops/sec): 42.0 MB/s Did 783000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1000531us (782584.4 ops/sec): 1056.5 MB/s Did 174000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1000840us (173854.0 ops/sec): 1424.2 MB/s arm, Nexus 4 --- Old: Did 388550 ChaCha20-Poly1305 (16 bytes) seal operations in 1000580us (388324.8 ops/sec): 6.2 MB/s Did 90000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1003816us (89657.9 ops/sec): 121.0 MB/s Did 19000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1045750us (18168.8 ops/sec): 148.8 MB/s Did 398500 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000305us (398378.5 ops/sec): 6.4 MB/s Did 90500 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1000305us (90472.4 ops/sec): 122.1 MB/s Did 19000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1043278us (18211.8 ops/sec): 149.2 MB/s New: Did 424788 ChaCha20-Poly1305 (16 bytes) seal operations in 1000641us (424515.9 ops/sec): 6.8 MB/s Did 115000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1001526us (114824.8 ops/sec): 155.0 MB/s Did 27000 ChaCha20-Poly1305 (8192 bytes) seal operations in 1033023us (26136.9 ops/sec): 214.1 MB/s Did 447750 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000549us (447504.3 ops/sec): 7.2 MB/s Did 117500 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1001923us (117274.5 ops/sec): 158.3 MB/s Did 27000 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1025118us (26338.4 ops/sec): 215.8 MB/s aarch64, Nexus 6p (Note we didn't have aarch64 assembly before at all, and still don't have it for Poly1305. Hopefully once that's added this will be faster than the arm numbers...) 
--- Old: Did 145040 ChaCha20-Poly1305 (16 bytes) seal operations in 1003065us (144596.8 ops/sec): 2.3 MB/s Did 14000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1042605us (13427.9 ops/sec): 18.1 MB/s Did 2618 ChaCha20-Poly1305 (8192 bytes) seal operations in 1093241us (2394.7 ops/sec): 19.6 MB/s Did 148000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000709us (147895.1 ops/sec): 2.4 MB/s Did 14000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1047294us (13367.8 ops/sec): 18.0 MB/s Did 2607 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1090745us (2390.1 ops/sec): 19.6 MB/s New: Did 358000 ChaCha20-Poly1305 (16 bytes) seal operations in 1000769us (357724.9 ops/sec): 5.7 MB/s Did 45000 ChaCha20-Poly1305 (1350 bytes) seal operations in 1021267us (44062.9 ops/sec): 59.5 MB/s Did 8591 ChaCha20-Poly1305 (8192 bytes) seal operations in 1047136us (8204.3 ops/sec): 67.2 MB/s Did 343000 ChaCha20-Poly1305-Old (16 bytes) seal operations in 1000489us (342832.4 ops/sec): 5.5 MB/s Did 44000 ChaCha20-Poly1305-Old (1350 bytes) seal operations in 1008326us (43636.7 ops/sec): 58.9 MB/s Did 8866 ChaCha20-Poly1305-Old (8192 bytes) seal operations in 1083341us (8183.9 ops/sec): 67.0 MB/s Change-Id: I629fe195d072f2c99e8f947578fad6d70823c4c8 Reviewed-on: https://boringssl-review.googlesource.com/7202 Reviewed-by: Adam Langley <agl@google.com>
před 8 roky
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # June 2015
  11. #
  12. # ChaCha20 for ARMv8.
  13. #
  14. # Performance in cycles per byte out of large buffer.
  15. #
  16. # IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
  17. #
  18. # Apple A7 5.50/+49% 3.33 1.70
  19. # Cortex-A53 8.40/+80% 4.72 4.72(*)
  20. # Cortex-A57 8.06/+43% 4.90 4.43(**)
  21. # Denver 4.50/+82% 2.63 2.67(*)
  22. # X-Gene 9.50/+46% 8.82 8.89(*)
  23. #
  24. # (*) it's expected that doubling interleave factor doesn't help
  25. # all processors, only those with higher NEON latency and
  26. # higher instruction issue rate;
  27. # (**) expected improvement was actually higher;
# --- Driver setup -----------------------------------------------------------
# $flavour selects the target assembler dialect and $output the destination
# file; both arrive as command-line arguments.
  28. $flavour=shift;
  29. $output=shift;
# Locate the arm-xlate.pl translator either next to this script or in
# ../../perlasm, then pipe everything we print through it so the perlasm
# pseudo-ops below (add.32, mov.32, ld1.8, ...) are rewritten for the
# target assembler.
  30. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  31. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  32. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  33. die "can't locate arm-xlate.pl";
# Replace STDOUT with the translator pipe: every subsequent print lands,
# translated, in $output.
  34. open OUT,"| \"$^X\" $xlate $flavour $output";
  35. *STDOUT=*OUT;
# AUTOLOAD: catch-all for calls to subs that are not defined anywhere, e.g.
# the &add_32 / &eor_32 / &ror_32 strings produced by ROUND and executed via
# eval.  The called name becomes the mnemonic: the package qualifier is
# stripped and underscores become dots (add_32 -> "add.32", resolved later
# by arm-xlate.pl).  The last argument is prefixed with '#' when it is
# numeric, i.e. an immediate operand.  One tab-separated instruction line is
# appended to the global $code accumulator.
  36. sub AUTOLOAD() # thunk [simplified] x86-style perlasm
  37. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  38. my $arg = pop;
# "$arg*1 eq $arg" is a stringy numeric test: true only for plain numbers.
  39. $arg = "#$arg" if ($arg*1 eq $arg);
  40. $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
  41. }
# Register roles (AAPCS64): x0-x4 carry the five C arguments
# (out, in, len, key, counter).
  42. my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
# @x: 16 scratch registers holding the 16 32-bit ChaCha state words in the
# scalar path.  The list skips x18 (platform-reserved register).
  43. my @x=map("x$_",(5..17,19..21));
# @d: 8 registers, each holding a PAIR of 32-bit key-block words loaded
# with ldp.  Skips x29 (frame pointer); x30 (LR) is usable because the
# prologue saves it.
  44. my @d=map("x$_",(22..28,30));
# ROUND(a0,b0,c0,d0): returns a list of perlasm source strings (executed
# later via eval + AUTOLOAD) that apply one ChaCha round to all four
# columns, interleaved 4-wide so independent instructions can issue in
# parallel.  ChaCha's left-rotations 16/12/8/7 appear as ror #16/#20/#24/#25
# because ror by (32-n) equals rotl by n on 32-bit values.
  45. sub ROUND {
  46. my ($a0,$b0,$c0,$d0)=@_;
# Derive the other three columns from the first: keep the row (bits above
# the low two) and step the column index, (i+1) mod 4.
  47. my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  48. my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  49. my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  50. (
# Quarter-round phase 1: a += b; d ^= a; d = rotl32(d,16)
  51. "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
  52. "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
  53. "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
  54. "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
  55. "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
  56. "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
  57. "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
  58. "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
  59. "&ror_32 (@x[$d0],@x[$d0],16)",
  60. "&ror_32 (@x[$d1],@x[$d1],16)",
  61. "&ror_32 (@x[$d2],@x[$d2],16)",
  62. "&ror_32 (@x[$d3],@x[$d3],16)",
# Phase 2: c += d; b ^= c; b = rotl32(b,12)  (ror #20)
  63. "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
  64. "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
  65. "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
  66. "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
  67. "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
  68. "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
  69. "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
  70. "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
  71. "&ror_32 (@x[$b0],@x[$b0],20)",
  72. "&ror_32 (@x[$b1],@x[$b1],20)",
  73. "&ror_32 (@x[$b2],@x[$b2],20)",
  74. "&ror_32 (@x[$b3],@x[$b3],20)",
# Phase 3: a += b; d ^= a; d = rotl32(d,8)  (ror #24)
  75. "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
  76. "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
  77. "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
  78. "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
  79. "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
  80. "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
  81. "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
  82. "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
  83. "&ror_32 (@x[$d0],@x[$d0],24)",
  84. "&ror_32 (@x[$d1],@x[$d1],24)",
  85. "&ror_32 (@x[$d2],@x[$d2],24)",
  86. "&ror_32 (@x[$d3],@x[$d3],24)",
# Phase 4: c += d; b ^= c; b = rotl32(b,7)  (ror #25)
  87. "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
  88. "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
  89. "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
  90. "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
  91. "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
  92. "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
  93. "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
  94. "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
  95. "&ror_32 (@x[$b0],@x[$b0],25)",
  96. "&ror_32 (@x[$b1],@x[$b1],25)",
  97. "&ror_32 (@x[$b2],@x[$b2],25)",
  98. "&ror_32 (@x[$b3],@x[$b3],25)"
  99. );
  100. }
# ============================================================================
# ChaCha20_ctr32(out, in, len, key, counter) — the scalar, integer-only
# entry point.  Emits: .Lsigma (the "expand 32-byte k" constant as two
# little-endian quads), .Lone (a one-block counter increment), and a
# PC-relative slot for OPENSSL_armcap_P.  For len >= 192 with ARMV7_NEON
# set in the capability word, control transfers to ChaCha20_neon;
# otherwise .Lshort processes 64-byte blocks entirely in general-purpose
# registers, two 32-bit state words packed per 64-bit register (@d[0..7]).
# NOTE: the heredoc below is emitted verbatim — no annotations may be
# inserted inside it.
  101. $code.=<<___;
  102. #include <openssl/arm_arch.h>
  103. .text
  104. .extern OPENSSL_armcap_P
  105. .align 5
  106. .Lsigma:
  107. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  108. .Lone:
  109. .long 1,0,0,0
  110. .LOPENSSL_armcap_P:
  111. #ifdef __ILP32__
  112. .long OPENSSL_armcap_P-.
  113. #else
  114. .quad OPENSSL_armcap_P-.
  115. #endif
  116. .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  117. .globl ChaCha20_ctr32
  118. .type ChaCha20_ctr32,%function
  119. .align 5
  120. ChaCha20_ctr32:
  121. cbz $len,.Labort
  122. adr @x[0],.LOPENSSL_armcap_P
  123. cmp $len,#192
  124. b.lo .Lshort
  125. #ifdef __ILP32__
  126. ldrsw @x[1],[@x[0]]
  127. #else
  128. ldr @x[1],[@x[0]]
  129. #endif
  130. ldr w17,[@x[1],@x[0]]
  131. tst w17,#ARMV7_NEON
  132. b.ne ChaCha20_neon
  133. .Lshort:
  134. stp x29,x30,[sp,#-96]!
  135. add x29,sp,#0
  136. adr @x[0],.Lsigma
  137. stp x19,x20,[sp,#16]
  138. stp x21,x22,[sp,#32]
  139. stp x23,x24,[sp,#48]
  140. stp x25,x26,[sp,#64]
  141. stp x27,x28,[sp,#80]
  142. sub sp,sp,#64
  143. ldp @d[0],@d[1],[@x[0]] // load sigma
  144. ldp @d[2],@d[3],[$key] // load key
  145. ldp @d[4],@d[5],[$key,#16]
  146. ldp @d[6],@d[7],[$ctr] // load counter
  147. #ifdef __ARMEB__
  148. ror @d[2],@d[2],#32
  149. ror @d[3],@d[3],#32
  150. ror @d[4],@d[4],#32
  151. ror @d[5],@d[5],#32
  152. ror @d[6],@d[6],#32
  153. ror @d[7],@d[7],#32
  154. #endif
  155. .Loop_outer:
  156. mov.32 @x[0],@d[0] // unpack key block
  157. lsr @x[1],@d[0],#32
  158. mov.32 @x[2],@d[1]
  159. lsr @x[3],@d[1],#32
  160. mov.32 @x[4],@d[2]
  161. lsr @x[5],@d[2],#32
  162. mov.32 @x[6],@d[3]
  163. lsr @x[7],@d[3],#32
  164. mov.32 @x[8],@d[4]
  165. lsr @x[9],@d[4],#32
  166. mov.32 @x[10],@d[5]
  167. lsr @x[11],@d[5],#32
  168. mov.32 @x[12],@d[6]
  169. lsr @x[13],@d[6],#32
  170. mov.32 @x[14],@d[7]
  171. lsr @x[15],@d[7],#32
  172. mov $ctr,#10
  173. subs $len,$len,#64
  174. .Loop:
  175. sub $ctr,$ctr,#1
  176. ___
# Ten iterations of (column round, diagonal round) = 20 ChaCha rounds.
# The iteration counter lives in $ctr (set to #10 above, decremented at
# the top of .Loop, tested by the cbnz just after this point).
  177. foreach (&ROUND(0, 4, 8,12)) { eval; }
  178. foreach (&ROUND(0, 5,10,15)) { eval; }
# Post-rounds: add the key block back into the state, XOR with input and
# store 64 bytes per outer iteration.  .Ltail / .Less_than_64 handle the
# final partial block byte-by-byte through a stack buffer, which is then
# wiped with xzr stores before the epilogue.
  179. $code.=<<___;
  180. cbnz $ctr,.Loop
  181. add.32 @x[0],@x[0],@d[0] // accumulate key block
  182. add @x[1],@x[1],@d[0],lsr#32
  183. add.32 @x[2],@x[2],@d[1]
  184. add @x[3],@x[3],@d[1],lsr#32
  185. add.32 @x[4],@x[4],@d[2]
  186. add @x[5],@x[5],@d[2],lsr#32
  187. add.32 @x[6],@x[6],@d[3]
  188. add @x[7],@x[7],@d[3],lsr#32
  189. add.32 @x[8],@x[8],@d[4]
  190. add @x[9],@x[9],@d[4],lsr#32
  191. add.32 @x[10],@x[10],@d[5]
  192. add @x[11],@x[11],@d[5],lsr#32
  193. add.32 @x[12],@x[12],@d[6]
  194. add @x[13],@x[13],@d[6],lsr#32
  195. add.32 @x[14],@x[14],@d[7]
  196. add @x[15],@x[15],@d[7],lsr#32
  197. b.lo .Ltail
  198. add @x[0],@x[0],@x[1],lsl#32 // pack
  199. add @x[2],@x[2],@x[3],lsl#32
  200. ldp @x[1],@x[3],[$inp,#0] // load input
  201. add @x[4],@x[4],@x[5],lsl#32
  202. add @x[6],@x[6],@x[7],lsl#32
  203. ldp @x[5],@x[7],[$inp,#16]
  204. add @x[8],@x[8],@x[9],lsl#32
  205. add @x[10],@x[10],@x[11],lsl#32
  206. ldp @x[9],@x[11],[$inp,#32]
  207. add @x[12],@x[12],@x[13],lsl#32
  208. add @x[14],@x[14],@x[15],lsl#32
  209. ldp @x[13],@x[15],[$inp,#48]
  210. add $inp,$inp,#64
  211. #ifdef __ARMEB__
  212. rev @x[0],@x[0]
  213. rev @x[2],@x[2]
  214. rev @x[4],@x[4]
  215. rev @x[6],@x[6]
  216. rev @x[8],@x[8]
  217. rev @x[10],@x[10]
  218. rev @x[12],@x[12]
  219. rev @x[14],@x[14]
  220. #endif
  221. eor @x[0],@x[0],@x[1]
  222. eor @x[2],@x[2],@x[3]
  223. eor @x[4],@x[4],@x[5]
  224. eor @x[6],@x[6],@x[7]
  225. eor @x[8],@x[8],@x[9]
  226. eor @x[10],@x[10],@x[11]
  227. eor @x[12],@x[12],@x[13]
  228. eor @x[14],@x[14],@x[15]
  229. stp @x[0],@x[2],[$out,#0] // store output
  230. add @d[6],@d[6],#1 // increment counter
  231. stp @x[4],@x[6],[$out,#16]
  232. stp @x[8],@x[10],[$out,#32]
  233. stp @x[12],@x[14],[$out,#48]
  234. add $out,$out,#64
  235. b.hi .Loop_outer
  236. ldp x19,x20,[x29,#16]
  237. add sp,sp,#64
  238. ldp x21,x22,[x29,#32]
  239. ldp x23,x24,[x29,#48]
  240. ldp x25,x26,[x29,#64]
  241. ldp x27,x28,[x29,#80]
  242. ldp x29,x30,[sp],#96
  243. .Labort:
  244. ret
  245. .align 4
  246. .Ltail:
  247. add $len,$len,#64
  248. .Less_than_64:
  249. sub $out,$out,#1
  250. add $inp,$inp,$len
  251. add $out,$out,$len
  252. add $ctr,sp,$len
  253. neg $len,$len
  254. add @x[0],@x[0],@x[1],lsl#32 // pack
  255. add @x[2],@x[2],@x[3],lsl#32
  256. add @x[4],@x[4],@x[5],lsl#32
  257. add @x[6],@x[6],@x[7],lsl#32
  258. add @x[8],@x[8],@x[9],lsl#32
  259. add @x[10],@x[10],@x[11],lsl#32
  260. add @x[12],@x[12],@x[13],lsl#32
  261. add @x[14],@x[14],@x[15],lsl#32
  262. #ifdef __ARMEB__
  263. rev @x[0],@x[0]
  264. rev @x[2],@x[2]
  265. rev @x[4],@x[4]
  266. rev @x[6],@x[6]
  267. rev @x[8],@x[8]
  268. rev @x[10],@x[10]
  269. rev @x[12],@x[12]
  270. rev @x[14],@x[14]
  271. #endif
  272. stp @x[0],@x[2],[sp,#0]
  273. stp @x[4],@x[6],[sp,#16]
  274. stp @x[8],@x[10],[sp,#32]
  275. stp @x[12],@x[14],[sp,#48]
  276. .Loop_tail:
  277. ldrb w10,[$inp,$len]
  278. ldrb w11,[$ctr,$len]
  279. add $len,$len,#1
  280. eor w10,w10,w11
  281. strb w10,[$out,$len]
  282. cbnz $len,.Loop_tail
  283. stp xzr,xzr,[sp,#0]
  284. stp xzr,xzr,[sp,#16]
  285. stp xzr,xzr,[sp,#32]
  286. stp xzr,xzr,[sp,#48]
  287. ldp x19,x20,[x29,#16]
  288. add sp,sp,#64
  289. ldp x21,x22,[x29,#32]
  290. ldp x23,x24,[x29,#48]
  291. ldp x25,x26,[x29,#64]
  292. ldp x27,x28,[x29,#80]
  293. ldp x29,x30,[sp],#96
  294. ret
  295. .size ChaCha20_ctr32,.-ChaCha20_ctr32
  296. ___
  297. {{{
# NEON register assignment for the 3-block path: three blocks' state in
# v0-v7 and v16-v23, key-block copies in @K (v24-v30), and v31 holding the
# counter increment (1, later shifted to 4).
  298. my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
  299. map("v$_.4s",(0..7,16..23));
  300. my (@K)=map("v$_.4s",(24..30));
  301. my $ONE="v31.4s";
# NEONROUND: one ChaCha quarter-round on four 32-bit lanes, returned as
# perlasm strings like ROUND above.  Left-rotations are synthesized as
# ushr+sli pairs (e.g. >>20 then shift-insert <<12 == rotl 12); the
# rotl-16 uses a rev32 on 16-bit elements.  The trailing ext instructions
# rotate the b/c/d lanes so the same column code computes the diagonal
# round: $odd=0 diagonalizes, $odd=1 restores (the 4 vs 12 ext amounts
# swap between the two passes).
  302. sub NEONROUND {
  303. my $odd = pop;
  304. my ($a,$b,$c,$d,$t)=@_;
  305. (
  306. "&add ('$a','$a','$b')",
  307. "&eor ('$d','$d','$a')",
  308. "&rev32_16 ('$d','$d')", # vrot ($d,16)
  309. "&add ('$c','$c','$d')",
  310. "&eor ('$t','$b','$c')",
  311. "&ushr ('$b','$t',20)",
  312. "&sli ('$b','$t',12)",
  313. "&add ('$a','$a','$b')",
  314. "&eor ('$t','$d','$a')",
  315. "&ushr ('$d','$t',24)",
  316. "&sli ('$d','$t',8)",
  317. "&add ('$c','$c','$d')",
  318. "&eor ('$t','$b','$c')",
  319. "&ushr ('$b','$t',25)",
  320. "&sli ('$b','$t',7)",
  321. "&ext ('$c','$c','$c',8)",
  322. "&ext ('$d','$d','$d',$odd?4:12)",
  323. "&ext ('$b','$b','$b',$odd?12:4)"
  324. );
  325. }
# ChaCha20_neon: the 3xNEON+1xIALU interleave (see the header table).
# Each outer iteration processes 256 bytes: three 64-byte blocks in vector
# registers ($A0-$D0 .. $A2-$D2, counters n+1/n+2/n+3 in @K[3..5]) plus one
# block in the scalar registers using counter n (@d[6]).  Buffers of 512+
# bytes branch straight to .L512_or_more_neon (ChaCha20_512_neon below).
  326. $code.=<<___;
  327. .type ChaCha20_neon,%function
  328. .align 5
  329. ChaCha20_neon:
  330. stp x29,x30,[sp,#-96]!
  331. add x29,sp,#0
  332. adr @x[0],.Lsigma
  333. stp x19,x20,[sp,#16]
  334. stp x21,x22,[sp,#32]
  335. stp x23,x24,[sp,#48]
  336. stp x25,x26,[sp,#64]
  337. stp x27,x28,[sp,#80]
  338. cmp $len,#512
  339. b.hs .L512_or_more_neon
  340. sub sp,sp,#64
  341. ldp @d[0],@d[1],[@x[0]] // load sigma
  342. ld1 {@K[0]},[@x[0]],#16
  343. ldp @d[2],@d[3],[$key] // load key
  344. ldp @d[4],@d[5],[$key,#16]
  345. ld1 {@K[1],@K[2]},[$key]
  346. ldp @d[6],@d[7],[$ctr] // load counter
  347. ld1 {@K[3]},[$ctr]
  348. ld1 {$ONE},[@x[0]]
  349. #ifdef __ARMEB__
  350. rev64 @K[0],@K[0]
  351. ror @d[2],@d[2],#32
  352. ror @d[3],@d[3],#32
  353. ror @d[4],@d[4],#32
  354. ror @d[5],@d[5],#32
  355. ror @d[6],@d[6],#32
  356. ror @d[7],@d[7],#32
  357. #endif
  358. add @K[3],@K[3],$ONE // += 1
  359. add @K[4],@K[3],$ONE
  360. add @K[5],@K[4],$ONE
  361. shl $ONE,$ONE,#2 // 1 -> 4
  362. .Loop_outer_neon:
  363. mov.32 @x[0],@d[0] // unpack key block
  364. lsr @x[1],@d[0],#32
  365. mov $A0,@K[0]
  366. mov.32 @x[2],@d[1]
  367. lsr @x[3],@d[1],#32
  368. mov $A1,@K[0]
  369. mov.32 @x[4],@d[2]
  370. lsr @x[5],@d[2],#32
  371. mov $A2,@K[0]
  372. mov.32 @x[6],@d[3]
  373. mov $B0,@K[1]
  374. lsr @x[7],@d[3],#32
  375. mov $B1,@K[1]
  376. mov.32 @x[8],@d[4]
  377. mov $B2,@K[1]
  378. lsr @x[9],@d[4],#32
  379. mov $D0,@K[3]
  380. mov.32 @x[10],@d[5]
  381. mov $D1,@K[4]
  382. lsr @x[11],@d[5],#32
  383. mov $D2,@K[5]
  384. mov.32 @x[12],@d[6]
  385. mov $C0,@K[2]
  386. lsr @x[13],@d[6],#32
  387. mov $C1,@K[2]
  388. mov.32 @x[14],@d[7]
  389. mov $C2,@K[2]
  390. lsr @x[15],@d[7],#32
  391. mov $ctr,#10
  392. subs $len,$len,#256
  393. .Loop_neon:
  394. sub $ctr,$ctr,#1
  395. ___
# Interleave three NEON quarter-round streams with one scalar ROUND so the
# vector and integer pipes issue in parallel.  First pass ($odd=0) is the
# column round, second pass ($odd=1) the diagonal round; each shift()
# draws the next scalar instruction so the streams stay evenly mixed.
  396. my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
  397. my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
  398. my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
  399. my @thread3=&ROUND(0,4,8,12);
  400. foreach (@thread0) {
  401. eval; eval(shift(@thread3));
  402. eval(shift(@thread1)); eval(shift(@thread3));
  403. eval(shift(@thread2)); eval(shift(@thread3));
  404. }
  405. @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
  406. @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
  407. @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
  408. @thread3=&ROUND(0,5,10,15);
  409. foreach (@thread0) {
  410. eval; eval(shift(@thread3));
  411. eval(shift(@thread1)); eval(shift(@thread3));
  412. eval(shift(@thread2)); eval(shift(@thread3));
  413. }
# Post-rounds for the 256-byte stride: accumulate the key block into both
# the scalar and vector states, XOR three vector blocks plus the scalar
# block against the input, advance the counter by 4 (scalar @d[6] and
# vector @K[3..5] alike) and loop while more data remains.  .Ltail_neon
# drains whole 64-byte chunks (scalar block first, then A0..D0, A1..D1),
# parks the first partial chunk on the stack and hands the sub-64-byte
# remainder to a byte loop; stack scratch is zeroed before returning.
  414. $code.=<<___;
  415. cbnz $ctr,.Loop_neon
  416. add.32 @x[0],@x[0],@d[0] // accumulate key block
  417. add $A0,$A0,@K[0]
  418. add @x[1],@x[1],@d[0],lsr#32
  419. add $A1,$A1,@K[0]
  420. add.32 @x[2],@x[2],@d[1]
  421. add $A2,$A2,@K[0]
  422. add @x[3],@x[3],@d[1],lsr#32
  423. add $C0,$C0,@K[2]
  424. add.32 @x[4],@x[4],@d[2]
  425. add $C1,$C1,@K[2]
  426. add @x[5],@x[5],@d[2],lsr#32
  427. add $C2,$C2,@K[2]
  428. add.32 @x[6],@x[6],@d[3]
  429. add $D0,$D0,@K[3]
  430. add @x[7],@x[7],@d[3],lsr#32
  431. add.32 @x[8],@x[8],@d[4]
  432. add $D1,$D1,@K[4]
  433. add @x[9],@x[9],@d[4],lsr#32
  434. add.32 @x[10],@x[10],@d[5]
  435. add $D2,$D2,@K[5]
  436. add @x[11],@x[11],@d[5],lsr#32
  437. add.32 @x[12],@x[12],@d[6]
  438. add $B0,$B0,@K[1]
  439. add @x[13],@x[13],@d[6],lsr#32
  440. add.32 @x[14],@x[14],@d[7]
  441. add $B1,$B1,@K[1]
  442. add @x[15],@x[15],@d[7],lsr#32
  443. add $B2,$B2,@K[1]
  444. b.lo .Ltail_neon
  445. add @x[0],@x[0],@x[1],lsl#32 // pack
  446. add @x[2],@x[2],@x[3],lsl#32
  447. ldp @x[1],@x[3],[$inp,#0] // load input
  448. add @x[4],@x[4],@x[5],lsl#32
  449. add @x[6],@x[6],@x[7],lsl#32
  450. ldp @x[5],@x[7],[$inp,#16]
  451. add @x[8],@x[8],@x[9],lsl#32
  452. add @x[10],@x[10],@x[11],lsl#32
  453. ldp @x[9],@x[11],[$inp,#32]
  454. add @x[12],@x[12],@x[13],lsl#32
  455. add @x[14],@x[14],@x[15],lsl#32
  456. ldp @x[13],@x[15],[$inp,#48]
  457. add $inp,$inp,#64
  458. #ifdef __ARMEB__
  459. rev @x[0],@x[0]
  460. rev @x[2],@x[2]
  461. rev @x[4],@x[4]
  462. rev @x[6],@x[6]
  463. rev @x[8],@x[8]
  464. rev @x[10],@x[10]
  465. rev @x[12],@x[12]
  466. rev @x[14],@x[14]
  467. #endif
  468. ld1.8 {$T0-$T3},[$inp],#64
  469. eor @x[0],@x[0],@x[1]
  470. eor @x[2],@x[2],@x[3]
  471. eor @x[4],@x[4],@x[5]
  472. eor @x[6],@x[6],@x[7]
  473. eor @x[8],@x[8],@x[9]
  474. eor $A0,$A0,$T0
  475. eor @x[10],@x[10],@x[11]
  476. eor $B0,$B0,$T1
  477. eor @x[12],@x[12],@x[13]
  478. eor $C0,$C0,$T2
  479. eor @x[14],@x[14],@x[15]
  480. eor $D0,$D0,$T3
  481. ld1.8 {$T0-$T3},[$inp],#64
  482. stp @x[0],@x[2],[$out,#0] // store output
  483. add @d[6],@d[6],#4 // increment counter
  484. stp @x[4],@x[6],[$out,#16]
  485. add @K[3],@K[3],$ONE // += 4
  486. stp @x[8],@x[10],[$out,#32]
  487. add @K[4],@K[4],$ONE
  488. stp @x[12],@x[14],[$out,#48]
  489. add @K[5],@K[5],$ONE
  490. add $out,$out,#64
  491. st1.8 {$A0-$D0},[$out],#64
  492. ld1.8 {$A0-$D0},[$inp],#64
  493. eor $A1,$A1,$T0
  494. eor $B1,$B1,$T1
  495. eor $C1,$C1,$T2
  496. eor $D1,$D1,$T3
  497. st1.8 {$A1-$D1},[$out],#64
  498. eor $A2,$A2,$A0
  499. eor $B2,$B2,$B0
  500. eor $C2,$C2,$C0
  501. eor $D2,$D2,$D0
  502. st1.8 {$A2-$D2},[$out],#64
  503. b.hi .Loop_outer_neon
  504. ldp x19,x20,[x29,#16]
  505. add sp,sp,#64
  506. ldp x21,x22,[x29,#32]
  507. ldp x23,x24,[x29,#48]
  508. ldp x25,x26,[x29,#64]
  509. ldp x27,x28,[x29,#80]
  510. ldp x29,x30,[sp],#96
  511. ret
  512. .Ltail_neon:
  513. add $len,$len,#256
  514. cmp $len,#64
  515. b.lo .Less_than_64
  516. add @x[0],@x[0],@x[1],lsl#32 // pack
  517. add @x[2],@x[2],@x[3],lsl#32
  518. ldp @x[1],@x[3],[$inp,#0] // load input
  519. add @x[4],@x[4],@x[5],lsl#32
  520. add @x[6],@x[6],@x[7],lsl#32
  521. ldp @x[5],@x[7],[$inp,#16]
  522. add @x[8],@x[8],@x[9],lsl#32
  523. add @x[10],@x[10],@x[11],lsl#32
  524. ldp @x[9],@x[11],[$inp,#32]
  525. add @x[12],@x[12],@x[13],lsl#32
  526. add @x[14],@x[14],@x[15],lsl#32
  527. ldp @x[13],@x[15],[$inp,#48]
  528. add $inp,$inp,#64
  529. #ifdef __ARMEB__
  530. rev @x[0],@x[0]
  531. rev @x[2],@x[2]
  532. rev @x[4],@x[4]
  533. rev @x[6],@x[6]
  534. rev @x[8],@x[8]
  535. rev @x[10],@x[10]
  536. rev @x[12],@x[12]
  537. rev @x[14],@x[14]
  538. #endif
  539. eor @x[0],@x[0],@x[1]
  540. eor @x[2],@x[2],@x[3]
  541. eor @x[4],@x[4],@x[5]
  542. eor @x[6],@x[6],@x[7]
  543. eor @x[8],@x[8],@x[9]
  544. eor @x[10],@x[10],@x[11]
  545. eor @x[12],@x[12],@x[13]
  546. eor @x[14],@x[14],@x[15]
  547. stp @x[0],@x[2],[$out,#0] // store output
  548. add @d[6],@d[6],#4 // increment counter
  549. stp @x[4],@x[6],[$out,#16]
  550. stp @x[8],@x[10],[$out,#32]
  551. stp @x[12],@x[14],[$out,#48]
  552. add $out,$out,#64
  553. b.eq .Ldone_neon
  554. sub $len,$len,#64
  555. cmp $len,#64
  556. b.lo .Less_than_128
  557. ld1.8 {$T0-$T3},[$inp],#64
  558. eor $A0,$A0,$T0
  559. eor $B0,$B0,$T1
  560. eor $C0,$C0,$T2
  561. eor $D0,$D0,$T3
  562. st1.8 {$A0-$D0},[$out],#64
  563. b.eq .Ldone_neon
  564. sub $len,$len,#64
  565. cmp $len,#64
  566. b.lo .Less_than_192
  567. ld1.8 {$T0-$T3},[$inp],#64
  568. eor $A1,$A1,$T0
  569. eor $B1,$B1,$T1
  570. eor $C1,$C1,$T2
  571. eor $D1,$D1,$T3
  572. st1.8 {$A1-$D1},[$out],#64
  573. b.eq .Ldone_neon
  574. sub $len,$len,#64
  575. st1.8 {$A2-$D2},[sp]
  576. b .Last_neon
  577. .Less_than_128:
  578. st1.8 {$A0-$D0},[sp]
  579. b .Last_neon
  580. .Less_than_192:
  581. st1.8 {$A1-$D1},[sp]
  582. b .Last_neon
  583. .align 4
  584. .Last_neon:
  585. sub $out,$out,#1
  586. add $inp,$inp,$len
  587. add $out,$out,$len
  588. add $ctr,sp,$len
  589. neg $len,$len
  590. .Loop_tail_neon:
  591. ldrb w10,[$inp,$len]
  592. ldrb w11,[$ctr,$len]
  593. add $len,$len,#1
  594. eor w10,w10,w11
  595. strb w10,[$out,$len]
  596. cbnz $len,.Loop_tail_neon
  597. stp xzr,xzr,[sp,#0]
  598. stp xzr,xzr,[sp,#16]
  599. stp xzr,xzr,[sp,#32]
  600. stp xzr,xzr,[sp,#48]
  601. .Ldone_neon:
  602. ldp x19,x20,[x29,#16]
  603. add sp,sp,#64
  604. ldp x21,x22,[x29,#32]
  605. ldp x23,x24,[x29,#48]
  606. ldp x25,x26,[x29,#64]
  607. ldp x27,x28,[x29,#80]
  608. ldp x29,x30,[sp],#96
  609. ret
  610. .size ChaCha20_neon,.-ChaCha20_neon
  611. ___
  612. {
# 512-byte path register map: 24 vector registers v0-v23 hold six blocks'
# worth of state.  The @K key-copy registers (v24-v29) are re-aliased as
# scratch T0-T5, since this path parks the key block on the stack instead
# (see the "off-load key block" stores below).
  613. my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
  614. my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
  615. $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
# ChaCha20_512_neon: the widest interleave (the 6xNEON+2xIALU row of the
# header table), entered directly from ChaCha20_neon for buffers of 512+
# bytes.  It saves d8-d15 per AAPCS64 (the low halves of v8-v15 are
# callee-saved), off-loads the invariant key-block part to sp+0..47 and
# the per-iteration counter vectors to sp+48..95, and prepares six block
# counters n+1..n+6 (@K[3..6], $D4, $D5).  The rounds are split in halves:
# .Loop_upper_neon runs 5 of the 10 double-round iterations ($ctr = #5);
# the lower half and the epilogue continue beyond this chunk.
  616. $code.=<<___;
  617. .type ChaCha20_512_neon,%function
  618. .align 5
  619. ChaCha20_512_neon:
  620. stp x29,x30,[sp,#-96]!
  621. add x29,sp,#0
  622. adr @x[0],.Lsigma
  623. stp x19,x20,[sp,#16]
  624. stp x21,x22,[sp,#32]
  625. stp x23,x24,[sp,#48]
  626. stp x25,x26,[sp,#64]
  627. stp x27,x28,[sp,#80]
  628. .L512_or_more_neon:
  629. sub sp,sp,#128+64
  630. ldp @d[0],@d[1],[@x[0]] // load sigma
  631. ld1 {@K[0]},[@x[0]],#16
  632. ldp @d[2],@d[3],[$key] // load key
  633. ldp @d[4],@d[5],[$key,#16]
  634. ld1 {@K[1],@K[2]},[$key]
  635. ldp @d[6],@d[7],[$ctr] // load counter
  636. ld1 {@K[3]},[$ctr]
  637. ld1 {$ONE},[@x[0]]
  638. #ifdef __ARMEB__
  639. rev64 @K[0],@K[0]
  640. ror @d[2],@d[2],#32
  641. ror @d[3],@d[3],#32
  642. ror @d[4],@d[4],#32
  643. ror @d[5],@d[5],#32
  644. ror @d[6],@d[6],#32
  645. ror @d[7],@d[7],#32
  646. #endif
  647. add @K[3],@K[3],$ONE // += 1
  648. stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
  649. add @K[3],@K[3],$ONE // not typo
  650. str @K[2],[sp,#32]
  651. add @K[4],@K[3],$ONE
  652. add @K[5],@K[4],$ONE
  653. add @K[6],@K[5],$ONE
  654. shl $ONE,$ONE,#2 // 1 -> 4
  655. stp d8,d9,[sp,#128+0] // meet ABI requirements
  656. stp d10,d11,[sp,#128+16]
  657. stp d12,d13,[sp,#128+32]
  658. stp d14,d15,[sp,#128+48]
  659. sub $len,$len,#512 // not typo
  660. .Loop_outer_512_neon:
  661. mov $A0,@K[0]
  662. mov $A1,@K[0]
  663. mov $A2,@K[0]
  664. mov $A3,@K[0]
  665. mov $A4,@K[0]
  666. mov $A5,@K[0]
  667. mov $B0,@K[1]
  668. mov.32 @x[0],@d[0] // unpack key block
  669. mov $B1,@K[1]
  670. lsr @x[1],@d[0],#32
  671. mov $B2,@K[1]
  672. mov.32 @x[2],@d[1]
  673. mov $B3,@K[1]
  674. lsr @x[3],@d[1],#32
  675. mov $B4,@K[1]
  676. mov.32 @x[4],@d[2]
  677. mov $B5,@K[1]
  678. lsr @x[5],@d[2],#32
  679. mov $D0,@K[3]
  680. mov.32 @x[6],@d[3]
  681. mov $D1,@K[4]
  682. lsr @x[7],@d[3],#32
  683. mov $D2,@K[5]
  684. mov.32 @x[8],@d[4]
  685. mov $D3,@K[6]
  686. lsr @x[9],@d[4],#32
  687. mov $C0,@K[2]
  688. mov.32 @x[10],@d[5]
  689. mov $C1,@K[2]
  690. lsr @x[11],@d[5],#32
  691. add $D4,$D0,$ONE // +4
  692. mov.32 @x[12],@d[6]
  693. add $D5,$D1,$ONE // +4
  694. lsr @x[13],@d[6],#32
  695. mov $C2,@K[2]
  696. mov.32 @x[14],@d[7]
  697. mov $C3,@K[2]
  698. lsr @x[15],@d[7],#32
  699. mov $C4,@K[2]
  700. stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
  701. mov $C5,@K[2]
  702. str @K[5],[sp,#80]
  703. mov $ctr,#5
  704. subs $len,$len,#512
  705. .Loop_upper_neon:
  706. sub $ctr,$ctr,#1
  707. ___
# Six NEON quarter-round streams interleaved with TWO scalar ROUNDs
# (@thread67 concatenates the column and diagonal scalar rounds); each
# shift() draws the next scalar instruction between vector instructions.
# $diff and $i are set but not used in the visible portion — presumably
# consumed by the lower-half emission beyond this chunk (TODO confirm).
  708. my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
  709. my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
  710. my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
  711. my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
  712. my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
  713. my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
  714. my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
  715. my $diff = ($#thread0+1)*6 - $#thread67 - 1;
  716. my $i = 0;
  717. foreach (@thread0) {
  718. eval; eval(shift(@thread67));
  719. eval(shift(@thread1)); eval(shift(@thread67));
  720. eval(shift(@thread2)); eval(shift(@thread67));
  721. eval(shift(@thread3)); eval(shift(@thread67));
  722. eval(shift(@thread4)); eval(shift(@thread67));
  723. eval(shift(@thread5)); eval(shift(@thread67));
  724. }
  725. @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
  726. @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
  727. @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
  728. @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
  729. @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
  730. @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
  731. @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
  732. foreach (@thread0) {
  733. eval; eval(shift(@thread67));
  734. eval(shift(@thread1)); eval(shift(@thread67));
  735. eval(shift(@thread2)); eval(shift(@thread67));
  736. eval(shift(@thread3)); eval(shift(@thread67));
  737. eval(shift(@thread4)); eval(shift(@thread67));
  738. eval(shift(@thread5)); eval(shift(@thread67));
  739. }
  740. $code.=<<___;
  741. cbnz $ctr,.Loop_upper_neon
  742. add.32 @x[0],@x[0],@d[0] // accumulate key block
  743. add @x[1],@x[1],@d[0],lsr#32
  744. add.32 @x[2],@x[2],@d[1]
  745. add @x[3],@x[3],@d[1],lsr#32
  746. add.32 @x[4],@x[4],@d[2]
  747. add @x[5],@x[5],@d[2],lsr#32
  748. add.32 @x[6],@x[6],@d[3]
  749. add @x[7],@x[7],@d[3],lsr#32
  750. add.32 @x[8],@x[8],@d[4]
  751. add @x[9],@x[9],@d[4],lsr#32
  752. add.32 @x[10],@x[10],@d[5]
  753. add @x[11],@x[11],@d[5],lsr#32
  754. add.32 @x[12],@x[12],@d[6]
  755. add @x[13],@x[13],@d[6],lsr#32
  756. add.32 @x[14],@x[14],@d[7]
  757. add @x[15],@x[15],@d[7],lsr#32
  758. add @x[0],@x[0],@x[1],lsl#32 // pack
  759. add @x[2],@x[2],@x[3],lsl#32
  760. ldp @x[1],@x[3],[$inp,#0] // load input
  761. add @x[4],@x[4],@x[5],lsl#32
  762. add @x[6],@x[6],@x[7],lsl#32
  763. ldp @x[5],@x[7],[$inp,#16]
  764. add @x[8],@x[8],@x[9],lsl#32
  765. add @x[10],@x[10],@x[11],lsl#32
  766. ldp @x[9],@x[11],[$inp,#32]
  767. add @x[12],@x[12],@x[13],lsl#32
  768. add @x[14],@x[14],@x[15],lsl#32
  769. ldp @x[13],@x[15],[$inp,#48]
  770. add $inp,$inp,#64
  771. #ifdef __ARMEB__
  772. rev @x[0],@x[0]
  773. rev @x[2],@x[2]
  774. rev @x[4],@x[4]
  775. rev @x[6],@x[6]
  776. rev @x[8],@x[8]
  777. rev @x[10],@x[10]
  778. rev @x[12],@x[12]
  779. rev @x[14],@x[14]
  780. #endif
  781. eor @x[0],@x[0],@x[1]
  782. eor @x[2],@x[2],@x[3]
  783. eor @x[4],@x[4],@x[5]
  784. eor @x[6],@x[6],@x[7]
  785. eor @x[8],@x[8],@x[9]
  786. eor @x[10],@x[10],@x[11]
  787. eor @x[12],@x[12],@x[13]
  788. eor @x[14],@x[14],@x[15]
  789. stp @x[0],@x[2],[$out,#0] // store output
  790. add @d[6],@d[6],#1 // increment counter
  791. mov.32 @x[0],@d[0] // unpack key block
  792. lsr @x[1],@d[0],#32
  793. stp @x[4],@x[6],[$out,#16]
  794. mov.32 @x[2],@d[1]
  795. lsr @x[3],@d[1],#32
  796. stp @x[8],@x[10],[$out,#32]
  797. mov.32 @x[4],@d[2]
  798. lsr @x[5],@d[2],#32
  799. stp @x[12],@x[14],[$out,#48]
  800. add $out,$out,#64
  801. mov.32 @x[6],@d[3]
  802. lsr @x[7],@d[3],#32
  803. mov.32 @x[8],@d[4]
  804. lsr @x[9],@d[4],#32
  805. mov.32 @x[10],@d[5]
  806. lsr @x[11],@d[5],#32
  807. mov.32 @x[12],@d[6]
  808. lsr @x[13],@d[6],#32
  809. mov.32 @x[14],@d[7]
  810. lsr @x[15],@d[7],#32
  811. mov $ctr,#5
  812. .Loop_lower_neon:
  813. sub $ctr,$ctr,#1
  814. ___
# .Loop_lower_neon body: identical even/odd interleave of the six NEON
# threads with the two scalar threads as in .Loop_upper_neon above
# (the @threadN arrays are re-filled, not redeclared).
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

# Column (even) half-round, one scalar instruction after each NEON one.
foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

# Diagonal (odd) half-round.
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

# 512-byte iteration tail: accumulate key material into all 8 blocks
# (scalar adds interleaved with NEON adds), xor with input, store 512
# bytes, bump the counters by 8, and either loop, fall back to the
# smaller-block paths, or restore callee-saved state and return.
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon
	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64
	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64
	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64
	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64
	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64
	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0
	b.hs	.Loop_outer_512_neon
	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]
	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]
	b.eq	.Ldone_512_neon
	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon
	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer
.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
  1007. }
  1008. }}}
  1009. foreach (split("\n",$code)) {
  1010. s/\`([^\`]*)\`/eval $1/geo;
  1011. (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
  1012. (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
  1013. (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
  1014. (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
  1015. (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
  1016. #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
  1017. print $_,"\n";
  1018. }
  1019. close STDOUT; # flush