#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of a large buffer.
#
#               IALU/gcc-4.9    3xNEON+1xIALU   6xNEON+2xIALU
#
# Apple A7      5.50/+49%       3.33            1.70
# Cortex-A53    8.40/+80%       4.72            4.72(*)
# Cortex-A57    8.06/+43%       4.90            4.43(**)
# Denver        4.50/+82%       2.63            2.67(*)
# X-Gene        9.50/+46%       8.82            8.89(*)
#
# (*)  doubling the interleave factor is not expected to help all
#      processors, only those with higher NEON latency and higher
#      instruction issue rate;
# (**) the expected improvement was actually higher;
$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
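
# Note: instruction mnemonics are emitted through AUTOLOAD, so a call
# such as &add_32(...) becomes the literal line "add.32 ...". The ".32"
# suffix is not AArch64 syntax; it is a private marker that the fix-up
# pass at the bottom of this file rewrites to operate on 32-bit "w"
# registers.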
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
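
# For reference, each column processed by ROUND above follows the
# standard ChaCha20 quarter-round. A minimal Perl sketch of one
# quarter-round on 32-bit values (illustrative only, not used by the
# code generator; the name "quarterround" is ours):
#
#   sub quarterround {
#   my ($a,$b,$c,$d)=@_;
#   my $rotl = sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n)))&0xffffffff };
#	$a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,16);
#	$c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,12);
#	$a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,8);
#	$c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,7);
#	($a,$b,$c,$d);
#   }
#
# The generated code rotates right by 16/20/24/25 because AArch64 has
# only ror: a left rotate by 16/12/8/7 is a right rotate by 16/20/24/25.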
$code.=<<___;
#include <openssl/arm_arch.h>

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32	// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]	// load sigma
	ldp	@d[2],@d[3],[$key]	// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]	// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]		// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
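# One pass below emits a "column" round (indices 0,4,8,12) followed by a
# "diagonal" round (0,5,10,15); with $ctr initialized to 10 at run time,
# the .Loop body thus performs ChaCha20's 20 rounds.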
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
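
# NEON code path, taken for inputs of 192 bytes and longer when NEON is
# available. Each iteration of .Loop_outer_neon produces 256 bytes:
# three blocks are processed in NEON registers, interleaved with a
# fourth block in general-purpose registers (the "3xNEON+1xIALU" column
# in the performance table above).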
{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
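
# NEON notes: AArch64 SIMD has no vector rotate, so a rotate by n is
# synthesized as ushr by 32-n followed by sli (shift-left-and-insert)
# by n; the rotate by 16 uses rev32 on 16-bit lanes instead. The three
# trailing ext instructions rotate the b/c/d rows across lanes so that
# alternating calls ($odd = 0/1) switch between column and diagonal
# quarter-rounds.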
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]	// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]	// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]	// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE	// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2		// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]		// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&ROUND(0,4,8,12);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&ROUND(0,5,10,15);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}

$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
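
# ChaCha20_512_neon widens the interleave to six NEON blocks plus the
# scalar block, with the scalar rounds run twice per outer iteration
# (.Loop_upper_neon and .Loop_lower_neon), i.e. the "6xNEON+2xIALU"
# column: eight 64-byte blocks, or 512 bytes, per .Loop_outer_512_neon
# iteration. d8-d15 are call-saved, hence the "meet ABI requirements"
# stores.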
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]	// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]	// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]	// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE	// += 1
	stp	@K[0],@K[1],[sp,#0]	// off-load key block, invariant part
	add	@K[3],@K[3],$ONE	// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2		// 1 -> 4

	stp	d8,d9,[sp,#128+0]	// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512		// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]		// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE		// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE		// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]	// off-load key block, variable part
	mov	$C5,@K[2]
	str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
my $diff = ($#thread0+1)*6 - $#thread67 - 1;
my $i = 0;

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}
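
# Final fix-up pass: translate the private mnemonic markers used above
# into real AArch64 syntax before printing. ".32" ops get their x
# registers rewritten as w registers; eor/ext/mov vector ops use .16b
# lanes; "ld1.8"/"st1.8" become plain ld1/st1 on .16b; ldp/stp of v
# registers are rewritten to q registers; and "rev32.16" becomes rev32
# on .8h lanes.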
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# flush