#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Numbers below are cycles per processed byte; the +NN% figures in the
# IALU column are the improvement over gcc-4.4-generated code.
#
#                   IALU(*)/gcc-4.4    NEON
#
# ARM11xx(ARMv6)    7.78/+100%         -
# Cortex-A5         6.30/+130%         2.96
# Cortex-A8         6.25/+115%         2.36
# Cortex-A9         5.10/+95%          2.55
# Cortex-A15        3.79/+85%          1.25(**)
# Snapdragon S4     5.70/+100%         1.48(**)
#
# (*)  this is for -march=armv6, i.e. with a bunch of ldrb instructions
#      loading the data;
# (**) these are trade-off results, they can be improved by ~8% but at
#      the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
#      to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
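
# The first command-line argument selects the perlasm "flavour"; the last
# argument names the output file. When a flavour is given, the generated
# code is piped through arm-xlate.pl for translation, otherwise it is
# written out verbatim.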
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
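
# AAPCS argument registers: $ctx/$inp/$len/$padbit map onto r0-r3, the
# calling convention's first four argument registers.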
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#include <openssl/arm_arch.h>
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
stmdb sp!,{r4-r11}
eor r3,r3,r3
cmp $inp,#0
str r3,[$ctx,#0] @ zero hash value
str r3,[$ctx,#4]
str r3,[$ctx,#8]
str r3,[$ctx,#12]
str r3,[$ctx,#16]
str r3,[$ctx,#36] @ is_base2_26
add $ctx,$ctx,#20
#ifdef __thumb2__
it eq
#endif
moveq r0,#0
beq .Lno_key
#if __ARM_MAX_ARCH__>=7
adr r11,.Lpoly1305_init
ldr r12,.LOPENSSL_armcap
#endif
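@ load the 128-bit key and clamp it as Poly1305 requires:
@ r &= 0x0ffffffc_0ffffffc_0ffffffc_0fffffff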
ldrb r4,[$inp,#0]
mov r10,#0x0fffffff
ldrb r5,[$inp,#1]
and r3,r10,#-4 @ 0x0ffffffc
ldrb r6,[$inp,#2]
ldrb r7,[$inp,#3]
orr r4,r4,r5,lsl#8
ldrb r5,[$inp,#4]
orr r4,r4,r6,lsl#16
ldrb r6,[$inp,#5]
orr r4,r4,r7,lsl#24
ldrb r7,[$inp,#6]
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
ldr r12,[r11,r12] @ OPENSSL_armcap_P
# ifdef __APPLE__
ldr r12,[r12]
# endif
#endif
ldrb r8,[$inp,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[$inp,#8]
orr r5,r5,r7,lsl#16
ldrb r7,[$inp,#9]
orr r5,r5,r8,lsl#24
ldrb r8,[$inp,#10]
and r5,r5,r3
#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
# ifdef __APPLE__
adr r9,poly1305_blocks_neon
adr r11,poly1305_blocks
# ifdef __thumb2__
it ne
# endif
movne r11,r9
adr r12,poly1305_emit
adr r10,poly1305_emit_neon
# ifdef __thumb2__
it ne
# endif
movne r12,r10
# else
# ifdef __thumb2__
itete eq
# endif
addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
orr r12,r12,#1 @ thumb-ify address
orr r11,r11,#1
# endif
#endif
ldrb r9,[$inp,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[$inp,#12]
orr r6,r6,r8,lsl#16
ldrb r8,[$inp,#13]
orr r6,r6,r9,lsl#24
ldrb r9,[$inp,#14]
and r6,r6,r3
ldrb r10,[$inp,#15]
orr r7,r7,r8,lsl#8
str r4,[$ctx,#0]
orr r7,r7,r9,lsl#16
str r5,[$ctx,#4]
orr r7,r7,r10,lsl#24
str r6,[$ctx,#8]
and r7,r7,r3
str r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7
stmia r2,{r11,r12} @ fill functions table
mov r0,#1
#else
mov r0,#0
#endif
.Lno_key:
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);
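# At run time $s1-$s3 hold r1-r3 plus their own value shifted right by two,
# i.e. s_i = r_i + (r_i>>2) = 5*r_i/4 (exact because the key clamp zeroes the
# low two bits of r1-r3); they fold the 2^130 = 5 (mod p) reduction into the
# multiplication below.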
$code.=<<___;
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
stmdb sp!,{r3-r11,lr}
ands $len,$len,#-16
beq .Lno_data
cmp $padbit,#0
add $len,$len,$inp @ end pointer
sub sp,sp,#32
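@ 32-byte scratch frame: [sp,#0-4] hold the low words of d0/d1, [sp,#8] the
@ input pointer, [sp,#12] the context, [sp,#16] the end pointer and
@ [sp,#20-28] the offloaded r1-r3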
ldmia $ctx,{$h0-$r3} @ load context
str $ctx,[sp,#12] @ offload stuff
mov lr,$inp
str $len,[sp,#16]
str $r1,[sp,#20]
str $r2,[sp,#24]
str $r3,[sp,#28]
b .Loop
.Loop:
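@ per iteration: add 16 bytes of input (plus the pad bit as bit 128) into
@ h0-h4, then multiply the accumulator by r modulo 2^130-5 using
@ 32x32->64 multiply-accumulates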
#if __ARM_ARCH__<7
ldrb r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi $h4,$h4,#1 @ 1<<128
ldrb r1,[lr,#-15]
ldrb r2,[lr,#-14]
ldrb r3,[lr,#-13]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-12]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-11]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-10]
adds $h0,$h0,r3 @ accumulate input
ldrb r3,[lr,#-9]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-8]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-7]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-6]
adcs $h1,$h1,r3
ldrb r3,[lr,#-5]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-4]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-3]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-2]
adcs $h2,$h2,r3
ldrb r3,[lr,#-1]
orr r1,r0,r1,lsl#8
str lr,[sp,#8] @ offload input pointer
orr r2,r1,r2,lsl#16
add $s1,$r1,$r1,lsr#2
orr r3,r2,r3,lsl#24
#else
ldr r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi $h4,$h4,#1 @ padbit
ldr r1,[lr,#-12]
ldr r2,[lr,#-8]
ldr r3,[lr,#-4]
# ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
# endif
adds $h0,$h0,r0 @ accumulate input
str lr,[sp,#8] @ offload input pointer
adcs $h1,$h1,r1
add $s1,$r1,$r1,lsr#2
adcs $h2,$h2,r2
#endif
add $s2,$r2,$r2,lsr#2
adcs $h3,$h3,r3
add $s3,$r3,$r3,lsr#2
umull r2,r3,$h1,$r0
adc $h4,$h4,#0
umull r0,r1,$h0,$r0
umlal r2,r3,$h4,$s1
umlal r0,r1,$h3,$s1
ldr $r1,[sp,#20] @ reload $r1
umlal r2,r3,$h2,$s3
umlal r0,r1,$h1,$s3
umlal r2,r3,$h3,$s2
umlal r0,r1,$h2,$s2
umlal r2,r3,$h0,$r1
str r0,[sp,#0] @ future $h0
mul r0,$s2,$h4
ldr $r2,[sp,#24] @ reload $r2
adds r2,r2,r1 @ d1+=d0>>32
eor r1,r1,r1
adc lr,r3,#0 @ future $h2
str r2,[sp,#4] @ future $h1
mul r2,$s3,$h4
eor r3,r3,r3
umlal r0,r1,$h3,$s3
ldr $r3,[sp,#28] @ reload $r3
umlal r2,r3,$h3,$r0
umlal r0,r1,$h2,$r0
umlal r2,r3,$h2,$r1
umlal r0,r1,$h1,$r1
umlal r2,r3,$h1,$r2
umlal r0,r1,$h0,$r2
umlal r2,r3,$h0,$r3
ldr $h0,[sp,#0]
mul $h4,$r0,$h4
ldr $h1,[sp,#4]
adds $h2,lr,r0 @ d2+=d1>>32
ldr lr,[sp,#8] @ reload input pointer
adc r1,r1,#0
adds $h3,r2,r1 @ d3+=d2>>32
ldr r0,[sp,#16] @ reload end pointer
adc r3,r3,#0
add $h4,$h4,r3 @ h4+=d3>>32
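@ fold the bits above 2^130 back in multiplied by 5 (2^130 = 5 mod p);
@ the accumulator stays only partially reduced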
and r1,$h4,#-4
and $h4,$h4,#3
add r1,r1,r1,lsr#2 @ *=5
adds $h0,$h0,r1
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adc $h3,$h3,#0
cmp r0,lr @ done yet?
bhi .Loop
ldr $ctx,[sp,#12]
add sp,sp,#32
stmia $ctx,{$h0-$h4} @ store the result
.Lno_data:
#if __ARM_ARCH__>=5
ldmia sp!,{r3-r11,pc}
#else
ldmia sp!,{r3-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;
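# poly1305_emit performs the final reduction: it adds 5 to h and, if that
# carries past bit 130, h was >= 2^130-5, so the incremented value (taken
# mod 2^128) is selected instead; the 128-bit nonce is then added and the
# tag stored little-endian.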
$code.=<<___;
.type poly1305_emit,%function
.align 5
poly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
ldmia $ctx,{$h0-$h4}
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adcs $g2,$h2,#0
adcs $g3,$h3,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
#ifdef __thumb2__
it ne
#endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
#ifdef __thumb2__
it ne
#endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
#ifdef __thumb2__
it ne
#endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
#ifdef __thumb2__
it ne
#endif
movne $h3,$g3
ldr $g3,[$nonce,#12]
adds $h0,$h0,$g0
adcs $h1,$h1,$g1
adcs $h2,$h2,$g2
adc $h3,$h3,$g3
#if __ARM_ARCH__>=7
# ifdef __ARMEB__
rev $h0,$h0
rev $h1,$h1
rev $h2,$h2
rev $h3,$h3
# endif
str $h0,[$mac,#0]
str $h1,[$mac,#4]
str $h2,[$mac,#8]
str $h3,[$mac,#12]
#else
strb $h0,[$mac,#0]
mov $h0,$h0,lsr#8
strb $h1,[$mac,#4]
mov $h1,$h1,lsr#8
strb $h2,[$mac,#8]
mov $h2,$h2,lsr#8
strb $h3,[$mac,#12]
mov $h3,$h3,lsr#8
strb $h0,[$mac,#1]
mov $h0,$h0,lsr#8
strb $h1,[$mac,#5]
mov $h1,$h1,lsr#8
strb $h2,[$mac,#9]
mov $h2,$h2,lsr#8
strb $h3,[$mac,#13]
mov $h3,$h3,lsr#8
strb $h0,[$mac,#2]
mov $h0,$h0,lsr#8
strb $h1,[$mac,#6]
mov $h1,$h1,lsr#8
strb $h2,[$mac,#10]
mov $h2,$h2,lsr#8
strb $h3,[$mac,#14]
mov $h3,$h3,lsr#8
strb $h0,[$mac,#3]
strb $h1,[$mac,#7]
strb $h2,[$mac,#11]
strb $h3,[$mac,#15]
#endif
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
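# NEON path: the accumulator and the powers of r are kept as five 26-bit
# limbs; $R0-$R4/$S1-$S4 are d-registers whose two 32-bit lanes hold two
# different powers of r (with $S_i = 5*$R_i), $D0-$D4 are 64-bit
# accumulators, and poly1305_init_neon fills a table of these powers
# starting at $ctx+48.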
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu neon
.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
ldr r4,[$ctx,#20] @ load key base 2^32
ldr r5,[$ctx,#24]
ldr r6,[$ctx,#28]
ldr r7,[$ctx,#32]
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
and r3,r3,#0x03ffffff
and r4,r4,#0x03ffffff
and r5,r5,#0x03ffffff
vdup.32 $R0,r2 @ r^1 in both lanes
add r2,r3,r3,lsl#2 @ *5
vdup.32 $R1,r3
add r3,r4,r4,lsl#2
vdup.32 $S1,r2
vdup.32 $R2,r4
add r4,r5,r5,lsl#2
vdup.32 $S2,r3
vdup.32 $R3,r5
add r5,r6,r6,lsl#2
vdup.32 $S3,r4
vdup.32 $R4,r6
vdup.32 $S4,r5
mov $zeros,#2 @ counter
.Lsquare_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
vmull.u32 $D0,$R0,${R0}[1]
vmull.u32 $D1,$R1,${R0}[1]
vmull.u32 $D2,$R2,${R0}[1]
vmull.u32 $D3,$R3,${R0}[1]
vmull.u32 $D4,$R4,${R0}[1]
vmlal.u32 $D0,$R4,${S1}[1]
vmlal.u32 $D1,$R0,${R1}[1]
vmlal.u32 $D2,$R1,${R1}[1]
vmlal.u32 $D3,$R2,${R1}[1]
vmlal.u32 $D4,$R3,${R1}[1]
vmlal.u32 $D0,$R3,${S2}[1]
vmlal.u32 $D1,$R4,${S2}[1]
vmlal.u32 $D3,$R1,${R2}[1]
vmlal.u32 $D2,$R0,${R2}[1]
vmlal.u32 $D4,$R2,${R2}[1]
vmlal.u32 $D0,$R2,${S3}[1]
vmlal.u32 $D3,$R0,${R3}[1]
vmlal.u32 $D1,$R3,${S3}[1]
vmlal.u32 $D2,$R4,${S3}[1]
vmlal.u32 $D4,$R1,${R3}[1]
vmlal.u32 $D3,$R4,${S4}[1]
vmlal.u32 $D0,$R1,${S4}[1]
vmlal.u32 $D1,$R2,${S4}[1]
vmlal.u32 $D2,$R3,${S4}[1]
vmlal.u32 $D4,$R0,${R4}[1]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ and P. Schwabe
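@ each 64-bit limb is brought back to 26 bits; the carry out of h4 is
@ multiplied by 5 and folded into h0 (2^130 = 5 mod p), and carries are
@ propagated only one limb at a time, which keeps every limb bounded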
vshr.u64 $T0,$D3,#26
vmovn.i64 $D3#lo,$D3
vshr.u64 $T1,$D0,#26
vmovn.i64 $D0#lo,$D0
vadd.i64 $D4,$D4,$T0 @ h3 -> h4
vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
vadd.i64 $D1,$D1,$T1 @ h0 -> h1
vbic.i32 $D0#lo,#0xfc000000
vshrn.u64 $T0#lo,$D4,#26
vmovn.i64 $D4#lo,$D4
vshr.u64 $T1,$D1,#26
vmovn.i64 $D1#lo,$D1
vadd.i64 $D2,$D2,$T1 @ h1 -> h2
vbic.i32 $D4#lo,#0xfc000000
vbic.i32 $D1#lo,#0xfc000000
vadd.i32 $D0#lo,$D0#lo,$T0#lo
vshl.u32 $T0#lo,$T0#lo,#2
vshrn.u64 $T1#lo,$D2,#26
vmovn.i64 $D2#lo,$D2
vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
vbic.i32 $D2#lo,#0xfc000000
vshr.u32 $T0#lo,$D0#lo,#26
vbic.i32 $D0#lo,#0xfc000000
vshr.u32 $T1#lo,$D3#lo,#26
vbic.i32 $D3#lo,#0xfc000000
vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
subs $zeros,$zeros,#1
beq .Lsquare_break_neon
add $tbl0,$ctx,#(48+0*9*4)
add $tbl1,$ctx,#(48+1*9*4)
vtrn.32 $R0,$D0#lo @ r^2:r^1
vtrn.32 $R2,$D2#lo
vtrn.32 $R3,$D3#lo
vtrn.32 $R1,$D1#lo
vtrn.32 $R4,$D4#lo
vshl.u32 $S2,$R2,#2 @ *5
vshl.u32 $S3,$R3,#2
vshl.u32 $S1,$R1,#2
vshl.u32 $S4,$R4,#2
vadd.i32 $S2,$S2,$R2
vadd.i32 $S1,$S1,$R1
vadd.i32 $S3,$S3,$R3
vadd.i32 $S4,$S4,$R4
vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vst1.32 {${S4}[0]},[$tbl0,:32]
vst1.32 {${S4}[1]},[$tbl1,:32]
b .Lsquare_neon
.align 4
.Lsquare_break_neon:
add $tbl0,$ctx,#(48+2*4*9)
add $tbl1,$ctx,#(48+3*4*9)
vmov $R0,$D0#lo @ r^4:r^3
vshl.u32 $S1,$D1#lo,#2 @ *5
vmov $R1,$D1#lo
vshl.u32 $S2,$D2#lo,#2
vmov $R2,$D2#lo
vshl.u32 $S3,$D3#lo,#2
vmov $R3,$D3#lo
vshl.u32 $S4,$D4#lo,#2
vmov $R4,$D4#lo
vadd.i32 $S1,$S1,$D1#lo
vadd.i32 $S2,$S2,$D2#lo
vadd.i32 $S3,$S3,$D3#lo
vadd.i32 $S4,$S4,$D4#lo
vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vst1.32 {${S4}[0]},[$tbl0]
vst1.32 {${S4}[1]},[$tbl1]
ret @ bx lr
.size poly1305_init_neon,.-poly1305_init_neon
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr ip,[$ctx,#36] @ is_base2_26
ands $len,$len,#-16
beq .Lno_data_neon
cmp $len,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
beq poly1305_blocks
.Lenter_neon:
stmdb sp!,{r4-r7}
vstmdb sp!,{d8-d15} @ ABI specification says so
tst ip,ip @ is_base2_26?
bne .Lbase2_26_neon
stmdb sp!,{r1-r3,lr}
bl poly1305_init_neon
ldr r4,[$ctx,#0] @ load hash value base 2^32
ldr r5,[$ctx,#4]
ldr r6,[$ctx,#8]
ldr r7,[$ctx,#12]
ldr ip,[$ctx,#16]
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
veor $D0#lo,$D0#lo,$D0#lo
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
veor $D1#lo,$D1#lo,$D1#lo
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
veor $D2#lo,$D2#lo,$D2#lo
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
veor $D3#lo,$D3#lo,$D3#lo
and r3,r3,#0x03ffffff
orr r6,r6,ip,lsl#24
veor $D4#lo,$D4#lo,$D4#lo
and r4,r4,#0x03ffffff
mov r1,#1
and r5,r5,#0x03ffffff
str r1,[$ctx,#36] @ is_base2_26
vmov.32 $D0#lo[0],r2
vmov.32 $D1#lo[0],r3
vmov.32 $D2#lo[0],r4
vmov.32 $D3#lo[0],r5
vmov.32 $D4#lo[0],r6
adr $zeros,.Lzeros
ldmia sp!,{r1-r3,lr}
b .Lbase2_32_neon
.align 4
.Lbase2_26_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ load hash value
veor $D0#lo,$D0#lo,$D0#lo
veor $D1#lo,$D1#lo,$D1#lo
veor $D2#lo,$D2#lo,$D2#lo
veor $D3#lo,$D3#lo,$D3#lo
veor $D4#lo,$D4#lo,$D4#lo
vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
adr $zeros,.Lzeros
vld1.32 {$D4#lo[0]},[$ctx]
sub $ctx,$ctx,#16 @ rewind
.Lbase2_32_neon:
add $in2,$inp,#32
mov $padbit,$padbit,lsl#24
tst $len,#31
beq .Leven
vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
vmov.32 $H4#lo[0],$padbit
sub $len,$len,#16
add $in2,$inp,#32
# ifdef __ARMEB__
vrev32.8 $H0,$H0
vrev32.8 $H3,$H3
vrev32.8 $H1,$H1
vrev32.8 $H2,$H2
# endif
vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
vshl.u32 $H3#lo,$H3#lo,#18
vsri.u32 $H3#lo,$H2#lo,#14
vshl.u32 $H2#lo,$H2#lo,#12
vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
vbic.i32 $H3#lo,#0xfc000000
vsri.u32 $H2#lo,$H1#lo,#20
vshl.u32 $H1#lo,$H1#lo,#6
vbic.i32 $H2#lo,#0xfc000000
vsri.u32 $H1#lo,$H0#lo,#26
vadd.i32 $H3#hi,$H3#lo,$D3#lo
vbic.i32 $H0#lo,#0xfc000000
vbic.i32 $H1#lo,#0xfc000000
vadd.i32 $H2#hi,$H2#lo,$D2#lo
vadd.i32 $H0#hi,$H0#lo,$D0#lo
vadd.i32 $H1#hi,$H1#lo,$D1#lo
mov $tbl1,$zeros
add $tbl0,$ctx,#48
cmp $len,$len
b .Long_tail
.align 4
.Leven:
subs $len,$len,#64
# ifdef __thumb2__
it lo
# endif
movlo $in2,$zeros
vmov.i32 $H4,#1<<24 @ padbit, yes, always
vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
add $inp,$inp,#64
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
# ifdef __thumb2__
itt hi
# endif
addhi $tbl1,$ctx,#(48+1*9*4)
addhi $tbl0,$ctx,#(48+3*9*4)
# ifdef __ARMEB__
vrev32.8 $H0,$H0
vrev32.8 $H3,$H3
vrev32.8 $H1,$H1
vrev32.8 $H2,$H2
# endif
vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
vshl.u32 $H3,$H3,#18
vsri.u32 $H3,$H2,#14
vshl.u32 $H2,$H2,#12
vbic.i32 $H3,#0xfc000000
vsri.u32 $H2,$H1,#20
vshl.u32 $H1,$H1,#6
vbic.i32 $H2,#0xfc000000
vsri.u32 $H1,$H0,#26
vbic.i32 $H0,#0xfc000000
vbic.i32 $H1,#0xfc000000
bls .Lskip_loop
vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
b .Loop_neon
.align 5
.Loop_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@ \___________________/
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
@ \___________________/ \____________________/
@
@ Note that we start with inp[2:3]*r^2. This is because it
@ doesn't depend on reduction in previous iteration.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ inp[2:3]*r^2
vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
vmull.u32 $D2,$H2#hi,${R0}[1]
vadd.i32 $H0#lo,$H0#lo,$D0#lo
vmull.u32 $D0,$H0#hi,${R0}[1]
vadd.i32 $H3#lo,$H3#lo,$D3#lo
vmull.u32 $D3,$H3#hi,${R0}[1]
vmlal.u32 $D2,$H1#hi,${R1}[1]
vadd.i32 $H1#lo,$H1#lo,$D1#lo
vmull.u32 $D1,$H1#hi,${R0}[1]
vadd.i32 $H4#lo,$H4#lo,$D4#lo
vmull.u32 $D4,$H4#hi,${R0}[1]
subs $len,$len,#64
vmlal.u32 $D0,$H4#hi,${S1}[1]
# ifdef __thumb2__
it lo
# endif
movlo $in2,$zeros
vmlal.u32 $D3,$H2#hi,${R1}[1]
vld1.32 ${S4}[1],[$tbl1,:32]
vmlal.u32 $D1,$H0#hi,${R1}[1]
vmlal.u32 $D4,$H3#hi,${R1}[1]
vmlal.u32 $D0,$H3#hi,${S2}[1]
vmlal.u32 $D3,$H1#hi,${R2}[1]
vmlal.u32 $D4,$H2#hi,${R2}[1]
vmlal.u32 $D1,$H4#hi,${S2}[1]
vmlal.u32 $D2,$H0#hi,${R2}[1]
vmlal.u32 $D3,$H0#hi,${R3}[1]
vmlal.u32 $D0,$H2#hi,${S3}[1]
vmlal.u32 $D4,$H1#hi,${R3}[1]
vmlal.u32 $D1,$H3#hi,${S3}[1]
vmlal.u32 $D2,$H4#hi,${S3}[1]
vmlal.u32 $D3,$H4#hi,${S4}[1]
vmlal.u32 $D0,$H1#hi,${S4}[1]
vmlal.u32 $D4,$H0#hi,${R4}[1]
vmlal.u32 $D1,$H2#hi,${S4}[1]
vmlal.u32 $D2,$H3#hi,${S4}[1]
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4 and accumulate
vmlal.u32 $D3,$H3#lo,${R0}[0]
vmlal.u32 $D0,$H0#lo,${R0}[0]
vmlal.u32 $D4,$H4#lo,${R0}[0]
vmlal.u32 $D1,$H1#lo,${R0}[0]
vmlal.u32 $D2,$H2#lo,${R0}[0]
vld1.32 ${S4}[0],[$tbl0,:32]
vmlal.u32 $D3,$H2#lo,${R1}[0]
vmlal.u32 $D0,$H4#lo,${S1}[0]
vmlal.u32 $D4,$H3#lo,${R1}[0]
vmlal.u32 $D1,$H0#lo,${R1}[0]
vmlal.u32 $D2,$H1#lo,${R1}[0]
vmlal.u32 $D3,$H1#lo,${R2}[0]
vmlal.u32 $D0,$H3#lo,${S2}[0]
vmlal.u32 $D4,$H2#lo,${R2}[0]
vmlal.u32 $D1,$H4#lo,${S2}[0]
vmlal.u32 $D2,$H0#lo,${R2}[0]
vmlal.u32 $D3,$H0#lo,${R3}[0]
vmlal.u32 $D0,$H2#lo,${S3}[0]
vmlal.u32 $D4,$H1#lo,${R3}[0]
vmlal.u32 $D1,$H3#lo,${S3}[0]
vmlal.u32 $D3,$H4#lo,${S4}[0]
vmlal.u32 $D2,$H4#lo,${S3}[0]
vmlal.u32 $D0,$H1#lo,${S4}[0]
vmlal.u32 $D4,$H0#lo,${R4}[0]
vmov.i32 $H4,#1<<24 @ padbit, yes, always
vmlal.u32 $D1,$H2#lo,${S4}[0]
vmlal.u32 $D2,$H3#lo,${S4}[0]
vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
add $inp,$inp,#64
# ifdef __ARMEB__
vrev32.8 $H0,$H0
vrev32.8 $H1,$H1
vrev32.8 $H2,$H2
vrev32.8 $H3,$H3
# endif
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26
vshr.u64 $T0,$D3,#26
vmovn.i64 $D3#lo,$D3
vshr.u64 $T1,$D0,#26
vmovn.i64 $D0#lo,$D0
vadd.i64 $D4,$D4,$T0 @ h3 -> h4
vbic.i32 $D3#lo,#0xfc000000
vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
vadd.i64 $D1,$D1,$T1 @ h0 -> h1
vshl.u32 $H3,$H3,#18
vbic.i32 $D0#lo,#0xfc000000
vshrn.u64 $T0#lo,$D4,#26
vmovn.i64 $D4#lo,$D4
vshr.u64 $T1,$D1,#26
vmovn.i64 $D1#lo,$D1
vadd.i64 $D2,$D2,$T1 @ h1 -> h2
vsri.u32 $H3,$H2,#14
vbic.i32 $D4#lo,#0xfc000000
vshl.u32 $H2,$H2,#12
vbic.i32 $D1#lo,#0xfc000000
vadd.i32 $D0#lo,$D0#lo,$T0#lo
vshl.u32 $T0#lo,$T0#lo,#2
vbic.i32 $H3,#0xfc000000
vshrn.u64 $T1#lo,$D2,#26
vmovn.i64 $D2#lo,$D2
vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
vsri.u32 $H2,$H1,#20
vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
vshl.u32 $H1,$H1,#6
vbic.i32 $D2#lo,#0xfc000000
vbic.i32 $H2,#0xfc000000
vshr.u32 $T0#lo,$D0#lo,#26
vbic.i32 $D0#lo,#0xfc000000
vsri.u32 $H1,$H0,#26
vbic.i32 $H0,#0xfc000000
vshr.u32 $T1#lo,$D3#lo,#26
vbic.i32 $D3#lo,#0xfc000000
vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
vbic.i32 $H1,#0xfc000000
bhi .Loop_neon
.Lskip_loop:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
add $tbl1,$ctx,#(48+0*9*4)
add $tbl0,$ctx,#(48+1*9*4)
adds $len,$len,#32
# ifdef __thumb2__
it ne
# endif
movne $len,#0
bne .Long_tail
vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
vadd.i32 $H0#hi,$H0#lo,$D0#lo
vadd.i32 $H3#hi,$H3#lo,$D3#lo
vadd.i32 $H1#hi,$H1#lo,$D1#lo
vadd.i32 $H4#hi,$H4#lo,$D4#lo
.Long_tail:
vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
vmull.u32 $D2,$H2#hi,$R0
vadd.i32 $H0#lo,$H0#lo,$D0#lo
vmull.u32 $D0,$H0#hi,$R0
vadd.i32 $H3#lo,$H3#lo,$D3#lo
vmull.u32 $D3,$H3#hi,$R0
vadd.i32 $H1#lo,$H1#lo,$D1#lo
vmull.u32 $D1,$H1#hi,$R0
vadd.i32 $H4#lo,$H4#lo,$D4#lo
vmull.u32 $D4,$H4#hi,$R0
vmlal.u32 $D0,$H4#hi,$S1
vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vmlal.u32 $D3,$H2#hi,$R1
vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vmlal.u32 $D1,$H0#hi,$R1
vmlal.u32 $D4,$H3#hi,$R1
vmlal.u32 $D2,$H1#hi,$R1
vmlal.u32 $D3,$H1#hi,$R2
vld1.32 ${S4}[1],[$tbl1,:32]
vmlal.u32 $D0,$H3#hi,$S2
vld1.32 ${S4}[0],[$tbl0,:32]
vmlal.u32 $D4,$H2#hi,$R2
vmlal.u32 $D1,$H4#hi,$S2
vmlal.u32 $D2,$H0#hi,$R2
vmlal.u32 $D3,$H0#hi,$R3
# ifdef __thumb2__
it ne
# endif
addne $tbl1,$ctx,#(48+2*9*4)
vmlal.u32 $D0,$H2#hi,$S3
# ifdef __thumb2__
it ne
# endif
addne $tbl0,$ctx,#(48+3*9*4)
vmlal.u32 $D4,$H1#hi,$R3
vmlal.u32 $D1,$H3#hi,$S3
vmlal.u32 $D2,$H4#hi,$S3
vmlal.u32 $D3,$H4#hi,$S4
vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
vmlal.u32 $D0,$H1#hi,$S4
vshr.u64 $MASK,$MASK,#38
vmlal.u32 $D4,$H0#hi,$R4
vmlal.u32 $D1,$H2#hi,$S4
vmlal.u32 $D2,$H3#hi,$S4
beq .Lshort_tail
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4:r^3 and accumulate
vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
vmlal.u32 $D2,$H2#lo,$R0
vmlal.u32 $D0,$H0#lo,$R0
vmlal.u32 $D3,$H3#lo,$R0
vmlal.u32 $D1,$H1#lo,$R0
vmlal.u32 $D4,$H4#lo,$R0
vmlal.u32 $D0,$H4#lo,$S1
vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vmlal.u32 $D3,$H2#lo,$R1
vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vmlal.u32 $D1,$H0#lo,$R1
vmlal.u32 $D4,$H3#lo,$R1
vmlal.u32 $D2,$H1#lo,$R1
vmlal.u32 $D3,$H1#lo,$R2
vld1.32 ${S4}[1],[$tbl1,:32]
vmlal.u32 $D0,$H3#lo,$S2
vld1.32 ${S4}[0],[$tbl0,:32]
vmlal.u32 $D4,$H2#lo,$R2
vmlal.u32 $D1,$H4#lo,$S2
vmlal.u32 $D2,$H0#lo,$R2
vmlal.u32 $D3,$H0#lo,$R3
vmlal.u32 $D0,$H2#lo,$S3
vmlal.u32 $D4,$H1#lo,$R3
vmlal.u32 $D1,$H3#lo,$S3
vmlal.u32 $D2,$H4#lo,$S3
vmlal.u32 $D3,$H4#lo,$S4
vorn $MASK,$MASK,$MASK @ all-ones
vmlal.u32 $D0,$H1#lo,$S4
vshr.u64 $MASK,$MASK,#38
vmlal.u32 $D4,$H0#lo,$R4
vmlal.u32 $D1,$H2#lo,$S4
vmlal.u32 $D2,$H3#lo,$S4
.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition
vadd.i64 $D3#lo,$D3#lo,$D3#hi
vadd.i64 $D0#lo,$D0#lo,$D0#hi
vadd.i64 $D4#lo,$D4#lo,$D4#hi
vadd.i64 $D1#lo,$D1#lo,$D1#hi
vadd.i64 $D2#lo,$D2#lo,$D2#hi
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
vshr.u64 $T0,$D3,#26
vand.i64 $D3,$D3,$MASK
vshr.u64 $T1,$D0,#26
vand.i64 $D0,$D0,$MASK
vadd.i64 $D4,$D4,$T0 @ h3 -> h4
vadd.i64 $D1,$D1,$T1 @ h0 -> h1
vshr.u64 $T0,$D4,#26
vand.i64 $D4,$D4,$MASK
vshr.u64 $T1,$D1,#26
vand.i64 $D1,$D1,$MASK
vadd.i64 $D2,$D2,$T1 @ h1 -> h2
vadd.i64 $D0,$D0,$T0
vshl.u64 $T0,$T0,#2
vshr.u64 $T1,$D2,#26
vand.i64 $D2,$D2,$MASK
vadd.i64 $D0,$D0,$T0 @ h4 -> h0
vadd.i64 $D3,$D3,$T1 @ h2 -> h3
vshr.u64 $T0,$D0,#26
vand.i64 $D0,$D0,$MASK
vshr.u64 $T1,$D3,#26
vand.i64 $D3,$D3,$MASK
vadd.i64 $D1,$D1,$T0 @ h0 -> h1
vadd.i64 $D4,$D4,$T1 @ h3 -> h4
cmp $len,#0
bne .Leven
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ store hash value
vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
vst1.32 {$D4#lo[0]},[$ctx]
vldmia sp!,{d8-d15} @ epilogue
ldmia sp!,{r4-r7}
.Lno_data_neon:
ret @ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr ip,[$ctx,#36] @ is_base2_26
stmdb sp!,{r4-r11}
tst ip,ip
beq .Lpoly1305_emit_enter
ldmia $ctx,{$h0-$h4}
eor $g0,$g0,$g0
adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
mov $h1,$h1,lsr#6
adcs $h1,$h1,$h2,lsl#20
mov $h2,$h2,lsr#12
adcs $h2,$h2,$h3,lsl#14
mov $h3,$h3,lsr#18
adcs $h3,$h3,$h4,lsl#8
adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
and $g0,$h4,#-4 @ ... so reduce
and $h4,$h4,#3
add $g0,$g0,$g0,lsr#2 @ *= 5
adds $h0,$h0,$g0
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adc $h3,$h3,#0
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adcs $g2,$h2,#0
adcs $g3,$h3,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
# ifdef __thumb2__
it ne
# endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
# ifdef __thumb2__
it ne
# endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
# ifdef __thumb2__
it ne
# endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
# ifdef __thumb2__
it ne
# endif
movne $h3,$g3
ldr $g3,[$nonce,#12]
adds $h0,$h0,$g0 @ accumulate nonce
adcs $h1,$h1,$g1
adcs $h2,$h2,$g2
adc $h3,$h3,$g3
# ifdef __ARMEB__
rev $h0,$h0
rev $h1,$h1
rev $h2,$h2
rev $h3,$h3
# endif
str $h0,[$mac,#0] @ store the result
str $h1,[$mac,#4]
str $h2,[$mac,#8]
str $h3,[$mac,#12]
ldmia sp!,{r4-r11}
ret @ bx lr
.size poly1305_emit_neon,.-poly1305_emit_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lpoly1305_init
#endif
___
} }
$code.=<<___;
.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
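
# Post-process the generated code: evaluate `...` expressions, map the
# qN#lo/qN#hi shorthand onto the corresponding d registers, and encode
# ret/bx lr as a literal word so the module also assembles with
# -march=armv4.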
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
    s/\bret\b/bx lr/go or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

    print $_,"\n";
}
close STDOUT; # enforce flush