Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.
 
 
 
 
 
 

687 рядків
17 KiB

  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # sha1_block procedure for ARMv4.
  9. #
  10. # January 2007.
  11. # Size/performance trade-off
  12. # ====================================================================
  13. # impl size in bytes comp cycles[*] measured performance
  14. # ====================================================================
  15. # thumb 304 3212 4420
  16. # armv4-small 392/+29% 1958/+64% 2250/+96%
  17. # armv4-compact 740/+89% 1552/+26% 1840/+22%
  18. # armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
  19. # full unroll ~5100/+260% ~1260/+4% ~1300/+5%
  20. # ====================================================================
  21. # thumb = same as 'small' but in Thumb instructions[**] and
  22. # with recurring code in two private functions;
  23. # small = detached Xload/update, loops are folded;
  24. # compact = detached Xload/update, 5x unroll;
  25. # large = interleaved Xload/update, 5x unroll;
  26. # full unroll = interleaved Xload/update, full unroll, estimated[!];
  27. #
  28. # [*] Manually counted instructions in "grand" loop body. Measured
  29. # performance is affected by prologue and epilogue overhead,
  30. # i-cache availability, branch penalties, etc.
  31. # [**] While each Thumb instruction is twice smaller, they are not as
  32. # diverse as ARM ones: e.g., there are only two arithmetic
  33. # instructions with 3 arguments, no [fixed] rotate, addressing
  34. # modes are limited. As result it takes more instructions to do
  35. # the same job in Thumb, therefore the code is never twice as
  36. # small and always slower.
  37. # [***] which is also ~35% better than compiler generated code. Dual-
  38. # issue Cortex A8 core was measured to process input block in
  39. # ~990 cycles.
  40. # August 2010.
  41. #
  42. # Rescheduling for dual-issue pipeline resulted in 13% improvement on
  43. # Cortex A8 core and in absolute terms ~870 cycles per input block
  44. # [or 13.6 cycles per byte].
  45. # February 2011.
  46. #
  47. # Profiler-assisted and platform-specific optimization resulted in 10%
  48. # improvement on Cortex A8 core and 12.2 cycles per byte.
  49. # September 2013.
  50. #
  51. # Add NEON implementation (see sha1-586.pl for background info). On
  52. # Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
  53. # faster than integer-only code. Because [fully unrolled] NEON code
  54. # is ~2.5x larger and there are some redundant instructions executed
  55. # when processing last block, improvement is not as big for smallest
  56. # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
  57. # byte, which is also >80% faster than integer-only code. Cortex-A15
  58. # is even faster spending 5.6 cycles per byte outperforming integer-
  59. # only code by factor of 2.
  60. # May 2014.
  61. #
  62. # Add ARMv8 code path performing at 2.35 cpb on Apple A7.
  63. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  64. open STDOUT,">$output";
  65. $ctx="r0";
  66. $inp="r1";
  67. $len="r2";
  68. $a="r3";
  69. $b="r4";
  70. $c="r5";
  71. $d="r6";
  72. $e="r7";
  73. $K="r8";
  74. $t0="r9";
  75. $t1="r10";
  76. $t2="r11";
  77. $t3="r12";
  78. $Xi="r14";
  79. @V=($a,$b,$c,$d,$e);
# Emit one message-schedule update plus the shared portion of a round:
#   X[i]  = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
#   E    += K + ROR(A,27) + X[i]
# The 16-word X[] window lives on the stack and is walked backwards
# through $Xi (note the pre-decrement store).  $opt1/$opt2 are
# caller-supplied instruction strings computing the round-specific
# boolean function F; they are slotted late in the sequence to hide
# load latencies on dual-issue cores.
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1					@ F_xx_xx
	$opt2					@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}
# Emit one of rounds 0..15: fetch the next big-endian message word from
# the input into $t0, push it onto the stack-resident X[] window
# (pre-decrement store through $Xi), and perform the round
#   E += K_00_19 + ROR(A,27) + X[i] + F_00_19(B,C,D)
# where F_00_19 is computed here as (((C^D)&B)^D), the "choose" function.
# Pre-ARMv7 cores may fault on unaligned word loads, so the word is
# assembled from four byte loads there; ARMv7+ uses a single (possibly
# unaligned) ldr plus a byte swap on little-endian.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0				@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}
# Rounds 16..19: same F_00_19 "choose" function as BODY_00_15, but the
# message word now comes from the Xupdate recurrence instead of memory.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}
# Rounds 20..39 and 60..79: parity function F_20_39 = B^C^D.
# Xupdate already left C^D in $t1, so a single eor with B completes it.
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}
# Rounds 40..59: majority function, computed as the two-term form
#   F_40_59 = (B&(C^D)) + (C&D)
# split across $t1 and $t2 and folded into E in two adds.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}
# ---------------------------------------------------------------------
# Integer-only implementation ("armv4-large" variant: 5x unroll, see
# the size/performance table in the header).  The entry point probes
# OPENSSL_armcap_P and branches to the ARMv8 or NEON paths when they
# are advertised.  Inside the round loops B..E are kept pre-rotated
# (the ror#2/ror#30 dance) so the architectural ROL(B,30) is free.
$code=<<___;
#include "arm_arch.h"
.text
.code	32
.global	sha1_block_data_order
.type	sha1_block_data_order,%function
.align	5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
	sub	r3,pc,#8		@ sha1_block_data_order
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA1
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
# Rounds 0..14 come from this 5x-unrolled loop body executed three
# times; round 15 is emitted separately below so rounds 16..19 follow
# it without another loop-exit test.
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
___
&BODY_00_15(@V);	unshift(@V,pop(@V));	# round 15
&BODY_16_19(@V);	unshift(@V,pop(@V));	# rounds 16..19, fully unrolled
&BODY_16_19(@V);	unshift(@V,pop(@V));
&BODY_16_19(@V);	unshift(@V,pop(@V));
&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;
	ldr	$K,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# Shared 5x-unrolled body for rounds 20..39 and 60..79 (both use the
# parity function); the carry flag set up by cmn/cmp above tells the
# two passes apart at the bcs below.
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
# Epilogue: fold the working variables back into the context (undoing
# the ror#2 bias on C..E), loop while input remains, then return in a
# way that is ARMv4-compatible yet interworks with Thumb callers.
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]
	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha1_block_data_order,.-sha1_block_data_order
.align	5
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha1_block_data_order
#endif
.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	5
___
#####################################################################
# NEON stuff
#
{{{
# Register conventions for the NEON code path.  Scalar (IALU) rounds
# run interleaved with the vector message schedule.
my @V=($a,$b,$c,$d,$e);		# scalar working variables A..E
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
				# K-table ptr, X[i]+K word, two scratch,
				# stack transfer ptr, saved sp
my $Xi=4;			# schedule-update counter (mod 4/5 driven)
my @X=map("q$_",(8..11,0..3));	# vector X[] window, rotated as we go
my @Tx=("q12","q13");		# vector temporaries
my ($K,$zero)=("q14","q15");	# broadcast round constant and zero
my $j=0;			# scalar round counter for body_* snippets
# Thunk, [simplified] x86-style perlasm: any undefined-sub call like
# &vadd_i32(...) lands here; the sub name becomes the mnemonic
# ("_" -> "."), the last argument gets a "#" prefix when numeric
# (an immediate), and the instruction line is appended to $code.
sub AUTOLOAD()
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# One scalar round for rounds 0..19 of the NEON path, returned as a
# list of code strings (eval'ed one at a time by the Xupdate_*
# schedulers so they interleave with vector instructions).
# F_00_19 is built as (c&b)^(d&~b) via bic/and/eor.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&bic	($t0,$d,$b)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t1,$c,$b)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$t1,$t0)',		# F_00_19
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_00_19
	'$j++;	unshift(@V,pop(@V));'
	)
}
# One scalar round for rounds 20..39 and 60..79: parity function
# F_20_39 = b^c^d, two eors.  The next X[i]+K word is not preloaded
# past the final round ($j<79 guard).
sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&eor	($t0,$b,$d)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))	if ($j<79)',
	'&eor	($t1,$t0,$c)',		# F_20_39
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_20_39
	'$j++;	unshift(@V,pop(@V));'
	)
}
# One scalar round for rounds 40..59: majority function in two-term
# form, e += (c&d) then e += (b&(c^d)) — same decomposition as the
# integer-only BODY_40_59 above.
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t0,$c,$d)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$c,$d)',
	'&add	($e,$e,$t0)',
	'&and	($t1,$t1,$b)',
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_40_59
	'$j++;	unshift(@V,pop(@V));'
	)
}
# Vector schedule update for rounds 16..31, where the recurrence
# distance (3) is shorter than the vector width (4): the missing lane
# is fixed up via the "<<96 / shr 30 / shl 2" sequence at the end.
# Four scalar rounds (from $body) are interleaved with the vector code,
# one snippet per eval(shift(@insns)).
sub Xupdate_16_31 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	&sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@Tx[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshl_u32	(@Tx[1],@Tx[1],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
# Vector schedule update for rounds 32..79: with the doubled-distance
# form of the recurrence, X[0]=(X[-32]^X[-28]^X[-16]^X[-6])<<<2, all
# four lanes fit in one vector pass (vshr+vsli implement the <<<2).
# As above, four scalar rounds are interleaved via eval(shift(@insns)).
sub Xupdate_32_79 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@X[0],@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	&sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
# Emit the final X[]+K transfer of the 80-round schedule and, in its
# shadow, set up the next block: compare $inp against $len, rewind the
# K table, and preload the next 64 input bytes — reloading the last
# block instead of reading past the end of the input (avoids a SEGV).
sub Xuplast_80 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
	&sub		($Xfer,$Xfer,64);

	&teq		($inp,$len);
	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[-4&7],@X[-4&7]);

	foreach (@insns) { eval; }		# remaining instructions

  $Xi=0;	# restart the schedule counter for the next block
}
# Rounds 0..15 of the (already preloaded) next block: byte-swap one
# message vector, add K_00_19, and transfer X[i]+K to the stack for
# the interleaved scalar rounds.
sub Xloop()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");	# X[]+K xfer to IALU
	foreach (@insns) { eval; }
  $Xi++;
}
# NEON entry point: allocate a 16-byte-aligned 64-byte transfer area on
# the stack (original sp kept in $saved_sp), load the context, preload
# and byte-swap the first input block, and prime three X[]+K transfers
# before entering the main loop.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon
.type	sha1_block_data_order_neon,%function
.align	4
sha1_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	@ dmb				@ errata #451034 on early Cortex A8
	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
	mov	$saved_sp,sp
	sub	sp,sp,#64		@ alloca
	adr	$K_XX_XX,.LK_00_19
	bic	sp,sp,#15		@ align for 128-bit stores
	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
	mov	$Xfer,sp
	vld1.8	{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
	veor	$zero,$zero,$zero
	vld1.8	{@X[-2&7]-@X[-1&7]},[$inp]!
	vld1.32	{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19
	vrev32.8	@X[-4&7],@X[-4&7]	@ yes, even on
	vrev32.8	@X[-3&7],@X[-3&7]	@ big-endian...
	vrev32.8	@X[-2&7],@X[-2&7]
	vadd.i32	@X[0],@X[-4&7],$K
	vrev32.8	@X[-1&7],@X[-1&7]
	vadd.i32	@X[1],@X[-3&7],$K
	vst1.32	{@X[0]},[$Xfer,:128]!
	vadd.i32	@X[2],@X[-2&7],$K
	vst1.32	{@X[1]},[$Xfer,:128]!
	vst1.32	{@X[2]},[$Xfer,:128]!
	ldr	$Ki,[sp]		@ big RAW stall
.Loop_neon:
___
# 80 rounds: 16 vector schedule updates, each interleaved with four
# scalar rounds; Xuplast_80 flushes the last X[]+K and preloads the
# next input block; the three Xloop calls re-prime its rounds 0..15.
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_32_79(\&body_00_19);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_20_39);
	&Xuplast_80(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
# Accumulate the working variables into the context; on the non-final
# iteration restore $Xfer/$Ki for the next pass, otherwise restore sp.
$code.=<<___;
	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context
	add	$a,$a,$Ki
	ldr	$Ki,[$ctx,#16]
	add	$b,$b,$t0
	add	$c,$c,$t1
	add	$d,$d,$Xfer
	moveq	sp,$saved_sp
	add	$e,$e,$Ki
	ldrne	$Ki,[sp]
	stmia	$ctx,{$a,$b,$c,$d,$e}
	addne	$Xfer,sp,#3*16
	bne	.Loop_neon
	@ vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r12,pc}
.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
# ARMv8 crypto-extension code path: hardware SHA1 instructions process
# four rounds at a time.
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));	# state vector, E, E pipeline pair
my @MSG=map("q$_",(4..7));			# message schedule, 4 words each
my @Kxx=map("q$_",(8..11));			# broadcast round constants
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));	# W+K staging, saved state
# Prologue: load the 160-bit state (ABCD vector + E lane) and the four
# broadcast round constants, then run the first four rounds inline.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.type	sha1_block_data_order_armv8,%function
.align	5
sha1_block_data_order_armv8:
.LARMv8:
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	veor	$E,$E,$E
	adr	r3,.LK_00_19
	vld1.32	{$ABCD},[$ctx]!
	vld1.32	{$E\[0]},[$ctx]
	sub	$ctx,$ctx,#16
	vld1.32	{@Kxx[0]\[]},[r3,:32]!
	vld1.32	{@Kxx[1]\[]},[r3,:32]!
	vld1.32	{@Kxx[2]\[]},[r3,:32]!
	vld1.32	{@Kxx[3]\[]},[r3,:32]
.Loop_v8:
	vld1.8	{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8	{@MSG[2]-@MSG[3]},[$inp]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vadd.i32	$W0,@Kxx[0],@MSG[0]
	vrev32.8	@MSG[2],@MSG[2]
	vmov	$ABCD_SAVE,$ABCD	@ offload
	subs	$len,$len,#1
	vadd.i32	$W1,@Kxx[0],@MSG[1]
	vrev32.8	@MSG[3],@MSG[3]
	sha1h	$E1,$ABCD		@ 0
	sha1c	$ABCD,$E,$W0
	vadd.i32	$W0,@Kxx[$j],@MSG[2]
	sha1su0	@MSG[0],@MSG[1],@MSG[2]
___
# Four-round groups 1..16: the round flavour cycles c/p/m/p per five
# groups; E0/E1 and W0/W1 ping-pong, @MSG rotates, and $j advances to
# the next round constant every five groups.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
	sha1h	$E0,$ABCD		@ $i
	sha1$f	$ABCD,$E1,$W1
	vadd.i32	$W1,@Kxx[$j],@MSG[3]
	sha1su1	@MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);	# no schedule update needed at the tail
	sha1su0	@MSG[1],@MSG[2],@MSG[3]
___
	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);
	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);
}
# Last three groups, then accumulate into the saved state and loop
# while blocks remain; finally store the context and return.
$code.=<<___;
	sha1h	$E0,$ABCD		@ $i
	sha1p	$ABCD,$E1,$W1
	vadd.i32	$W1,@Kxx[$j],@MSG[3]
	sha1h	$E1,$ABCD		@ 18
	sha1p	$ABCD,$E0,$W0
	sha1h	$E0,$ABCD		@ 19
	sha1p	$ABCD,$E1,$W1
	vadd.i32	$E,$E,$E0
	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	bne	.Loop_v8
	vst1.32	{$ABCD},[$ctx]!
	vst1.32	{$E\[0]},[$ctx]
	vldmia	sp!,{d8-d15}
	ret			@ bx lr
.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
# Reserve storage for the capability word the entry point probes.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___
{ # SHA1 crypto-extension mnemonics are unknown to older assemblers,
  # so they are hand-encoded.  Base opcodes with zeroed register fields:
  my %opcode = (
	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40,
	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	);

  # Encode "mnemonic qD[,qN],qM" as a raw .byte sequence, keeping the
  # original text as a trailing comment.  Each q-register number is
  # split into its low 3 bits and bit 3 and or'ed into the D/N/M
  # fields of the base opcode.
  sub unsha1 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
  }
}
  612. foreach (split($/,$code)) {
  613. s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
  614. s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
  615. s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
  616. s/\bret\b/bx lr/o or
  617. s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
  618. print $_,$/;
  619. }
  620. close STDOUT; # enforce flush