#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small', but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is half the size, Thumb is not as
#	diverse as ARM: e.g., there are only two arithmetic
#	instructions with 3 arguments, no [fixed] rotate, and the
#	addressing modes are limited. As a result it takes more
#	instructions to do the same job in Thumb, so the code is never
#	half the size and is always slower.
# [***]	This is also ~35% better than compiler-generated code. A dual-
#	issue Cortex A8 core was measured to process an input block in
#	~990 cycles.

# August 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 13%
# improvement on a Cortex A8 core, or in absolute terms ~870 cycles
# per input block [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in a
# 10% improvement on a Cortex A8 core and 12.2 cycles per byte.

# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles, or >80%
# faster than the integer-only code. Because the [fully unrolled] NEON
# code is ~2.5x larger and some redundant instructions are executed
# when processing the last block, the improvement is smaller for the
# smallest blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4
# cycles per byte, which is also >80% faster than the integer-only
# code. Cortex-A15 is even faster, spending 5.6 cycles per byte and
# outperforming the integer-only code by a factor of 2.

# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
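
# Typical invocation (illustrative; the set of recognized flavours is
# defined by arm-xlate.pl, not here):
#
#	perl sha1-armv4-large.pl linux32 sha1-armv4-large.S
#
# If the first argument is not a flavour, the generated assembly is
# written directly to the named output file.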
$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1					@ F_xx_xx
	$opt2					@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}
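
# Xupdate emits the SHA-1 message-schedule recurrence
# X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16],1); the ror#31 above is the
# rotate-left by 1. A minimal scalar model in plain Perl -- a
# hypothetical helper for illustration only, not used by the
# generator:
sub Xupdate_model {
	my ($i,$x) = @_;		# $x: ref to a 16-word ring buffer
	my $t = $x->[($i-3)&15] ^ $x->[($i-8)&15]
	      ^ $x->[($i-14)&15] ^ $x->[$i&15];	# slot $i&15 holds X[i-16]
	$x->[$i&15] = (($t<<1)|($t>>31)) & 0xffffffff;	# ROL(t,1)
}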
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0				@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}

sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}
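
# For reference, the three SHA-1 round functions are folded into the
# eor/and sequences above: F_00_19 = (B&C)|(~B&D) is computed as
# D^(B&(C^D)); F_20_39 = B^C^D; and F_40_59 = (B&C)|(B&D)|(C&D) is
# split as (B&(C^D))+(C&D), whose two halves are disjoint bitwise and
# are added to E separately. A scalar Perl model (hypothetical, for
# illustration only -- not used by the generator):
sub F_model {
	my ($i,$b,$c,$d) = @_;
	return $d ^ ($b & ($c ^ $d))	if ($i<20);		# Ch
	return $b ^ $c ^ $d		if ($i<40 || $i>=60);	# Parity
	return ($b & $c) | ($b & $d) | ($c & $d);		# Maj
}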
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
	sub	r3,pc,#8		@ sha1_block_data_order
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA1
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
___
	&BODY_00_15(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha1_block_data_order,.-sha1_block_data_order

.align	5
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha1_block_data_order
#endif
.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
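
# So a call such as &vadd_i32("q12","q0","q14") (illustrative
# arguments) is caught by AUTOLOAD, has its underscore rewritten to a
# dot, and appends the line
#
#	vadd.i32	q12,q0,q14
#
# to $code; a bare numeric last argument is prefixed with '#' so it is
# emitted as an immediate.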
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&bic	($t0,$d,$b)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t1,$c,$b)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$t1,$t0)',		# F_00_19
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_00_19
	'$j++;	unshift(@V,pop(@V));'
	)
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&eor	($t0,$b,$d)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
	'&eor	($t1,$t0,$c)',		# F_20_39
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_20_39
	'$j++;	unshift(@V,pop(@V));'
	)
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t0,$c,$d)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$c,$d)',
	'&add	($e,$e,$t0)',
	'&and	($t1,$t1,$b)',
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_40_59
	'$j++;	unshift(@V,pop(@V));'
	)
}
sub Xupdate_16_31 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	&sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@Tx[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshl_u32	(@Tx[1],@Tx[1],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
sub Xupdate_32_79 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@X[0],@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	&sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
sub Xuplast_80 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
	&sub		($Xfer,$Xfer,64);

	&teq		($inp,$len);
	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[-4&7],@X[-4&7]);

	foreach (@insns) { eval; }	# remaining instructions

	$Xi=0;
}
sub Xloop()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");	# X[]+K xfer to IALU

	foreach (@insns) { eval; }

	$Xi++;
}
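
# Each body_* above returns a list of Perl snippets, one per integer
# instruction of a scalar SHA-1 round; the Xupdate_* and Xloop
# routines eval() a few of them between every NEON statement. The
# generated code thus interleaves the scalar rounds for words already
# transferred with the vector message-schedule computation for later
# ones, keeping the integer and NEON pipelines busy at the same time.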
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	sha1_block_data_order_neon,%function
.align	4
sha1_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	@ dmb				@ errata #451034 on early Cortex A8
	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
	mov	$saved_sp,sp
	sub	sp,sp,#64		@ alloca
	adr	$K_XX_XX,.LK_00_19
	bic	sp,sp,#15		@ align for 128-bit stores

	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
	mov	$Xfer,sp

	vld1.8		{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
	veor		$zero,$zero,$zero
	vld1.8		{@X[-2&7]-@X[-1&7]},[$inp]!
	vld1.32		{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19
	vrev32.8	@X[-4&7],@X[-4&7]		@ yes, even on
	vrev32.8	@X[-3&7],@X[-3&7]		@ big-endian...
	vrev32.8	@X[-2&7],@X[-2&7]
	vadd.i32	@X[0],@X[-4&7],$K
	vrev32.8	@X[-1&7],@X[-1&7]
	vadd.i32	@X[1],@X[-3&7],$K
	vst1.32		{@X[0]},[$Xfer,:128]!
	vadd.i32	@X[2],@X[-2&7],$K
	vst1.32		{@X[1]},[$Xfer,:128]!
	vst1.32		{@X[2]},[$Xfer,:128]!
	ldr		$Ki,[sp]		@ big RAW stall

.Loop_neon:
___
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_32_79(\&body_00_19);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_20_39);
	&Xuplast_80(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);

$code.=<<___;

	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context
	add	$a,$a,$Ki
	ldr	$Ki,[$ctx,#16]
	add	$b,$b,$t0
	add	$c,$c,$t1
	add	$d,$d,$Xfer
	moveq	sp,$saved_sp
	add	$e,$e,$Ki
	ldrne	$Ki,[sp]
	stmia	$ctx,{$a,$b,$c,$d,$e}
	addne	$Xfer,sp,#3*16
	bne	.Loop_neon

	@ vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r12,pc}
.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.type	sha1_block_data_order_armv8,%function
.align	5
sha1_block_data_order_armv8:
.LARMv8:
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	veor	$E,$E,$E
	adr	r3,.LK_00_19
	vld1.32	{$ABCD},[$ctx]!
	vld1.32	{$E\[0]},[$ctx]
	sub	$ctx,$ctx,#16
	vld1.32	{@Kxx[0]\[]},[r3,:32]!
	vld1.32	{@Kxx[1]\[]},[r3,:32]!
	vld1.32	{@Kxx[2]\[]},[r3,:32]!
	vld1.32	{@Kxx[3]\[]},[r3,:32]

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]

	vadd.i32	$W0,@Kxx[0],@MSG[0]
	vrev32.8	@MSG[2],@MSG[2]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	subs		$len,$len,#1

	vadd.i32	$W1,@Kxx[0],@MSG[1]
	vrev32.8	@MSG[3],@MSG[3]
	sha1h		$E1,$ABCD		@ 0
	sha1c		$ABCD,$E,$W0
	vadd.i32	$W0,@Kxx[$j],@MSG[2]
	sha1su0		@MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];

$code.=<<___;
	sha1h		$E0,$ABCD		@ $i
	sha1$f		$ABCD,$E1,$W1
	vadd.i32	$W1,@Kxx[$j],@MSG[3]
	sha1su1		@MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
	sha1su0		@MSG[1],@MSG[2],@MSG[3]
___
	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);
	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);
}
$code.=<<___;
	sha1h		$E0,$ABCD		@ $i
	sha1p		$ABCD,$E1,$W1
	vadd.i32	$W1,@Kxx[$j],@MSG[3]

	sha1h		$E1,$ABCD		@ 18
	sha1p		$ABCD,$E0,$W0

	sha1h		$E0,$ABCD		@ 19
	sha1p		$ABCD,$E1,$W1

	vadd.i32	$E,$E,$E0
	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	bne		.Loop_v8

	vst1.32		{$ABCD},[$ctx]!
	vst1.32		{$E\[0]},[$ctx]

	vldmia	sp!,{d8-d15}
	ret			@ bx lr
.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

{   my %opcode = (
	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40,
	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	);

    sub unsha1 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so the
	    # word is emitted byte by byte. The correct solution would be
	    # the .inst directive, but older assemblers don't implement
	    # it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
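
# Worked example (illustrative): "sha1c q0,q2,q12" gives $1=0, $2=2,
# $3=12, so
#
#	$word = 0xf2000c40|((2&7)<<17)|((12&7)<<1)|((12&8)<<2)
#	      = 0xf2040c68
#
# and unsha1() returns
#
#	.byte	0x68,0x0c,0x04,0xf2	@ sha1c q0,q2,q12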
foreach (split($/,$code)) {
	s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo	or
	s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;

	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;

	s/\bret\b/bx	lr/o		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4

	print $_,$/;
}

close STDOUT;			# enforce flush
  633. close STDOUT; # enforce flush