Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 
 
 
 

541 Zeilen
14 KiB

  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # SHA256 block procedure for ARMv4. May 2007.
  9. # Performance is ~2x better than gcc 3.4 generated code and in "abso-
  10. # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
  11. # byte [on single-issue Xscale PXA250 core].
  12. # July 2010.
  13. #
  14. # Rescheduling for dual-issue pipeline resulted in 22% improvement on
  15. # Cortex A8 core and ~20 cycles per processed byte.
  16. # February 2011.
  17. #
  18. # Profiler-assisted and platform-specific optimization resulted in 16%
  19. # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
  20. # September 2013.
  21. #
  22. # Add NEON implementation. On Cortex A8 it was measured to process one
  23. # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
  24. # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
  25. # code (meaning that latter performs sub-optimally, nothing was done
  26. # about it).
  27. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  28. open STDOUT,">$output";
# Register allocation for the integer-only implementation.  Several
# scratch temporaries deliberately alias the argument registers
# ($t0/$ctx share r0, $t4/$inp share r1, $t1/$len share r2): the
# arguments are spilled to the stack (stmdb/str below) before the
# temporaries are first used, so the aliasing is safe.
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
# The eight SHA-256 working variables a..h; this list is rotated after
# every round so one round body serves all 64 rounds.
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";   # pointer into the K256 constant table (lr)

# Rotate amounts from the SHA-256 specification (FIPS 180-4):
# upper-case Sigma operate on the state, lower-case sigma on the
# message schedule.  The last element of each sigma triple is a plain
# logical shift, not a rotate.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
# Emit one round of the SHA-256 compression function.  $i is the round
# number and ($a..$h) the current (rotated) names of the eight state
# registers.  For $i<16 the message word is also loaded from the input
# (and byte-swapped); rounds 16..63 receive the already-scheduled word
# in $t1 from BODY_16_XX.  The Maj(a,b,c) addition is deliberately
# deferred into the NEXT round (the "from the past" lines) to improve
# dual-issue scheduling.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Input-load fragment, emitted only for the first 16 rounds: ARMv7+
# uses ldr+rev, older cores assemble the word from four byte loads.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
# Core round:  h += X[i] + K[i] + Sigma1(e) + Ch(e,f,g);  d += h;
# h += Sigma0(a) (+ Maj(a,b,c) deferred to the next round).  At round
# 31 the low byte of the just-loaded K word is compared against 0xf2
# (last K256 word is 0xc67178f2) to detect the end of the table.
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
# $t3 now holds this round's Maj(a,b,c) while $t2 holds a^b for the
# next round; swap the two Perl-level names so the register roles line
# up after the caller rotates @V for the following round.
($t2,$t3)=($t3,$t2);
}
# Emit the message-schedule expansion for rounds 16..63 over the
# 16-word circular buffer kept on the stack:
#   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]
# ($t1 arrives holding X[i+1] and $t4 holding X[i+14], both preloaded
# by the previous round), then fall through to BODY_00_15 with the
# freshly computed word in $t1.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]
	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
&BODY_00_15(@_);
}
# Static prologue: the K256 constant table (terminated by a zero word,
# used together with the 0xf2 low-byte test in BODY_00_15), the
# armcap-based runtime dispatch to the NEON path, and the entry of the
# integer-only main loop.
$code=<<___;
#if defined(__arm__)
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
	.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
	.word	0				@ terminator
.LOPENSSL_armcap:
	.word	OPENSSL_armcap_P-sha256_block_data_order
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Unroll rounds 0..15 (with input loading), rotating the register-name
# list @V so the same round body serves every round, then emit rounds
# 16..31 once under .Lrounds_16_xx; at run time that 16-round block
# loops until the K256 terminator test (round 31) sets Z.
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Block epilogue: fold the working registers back into the context,
# advance inp, rewind the K table and iterate .Loop until inp reaches
# the inp+len saved on the stack.
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___
######################################################################
# NEON stuff
#
{{{
# NEON register assignment: @X is a rotating window of four quadword
# registers holding 16 message words; $T0..$T3 are q-register
# temporaries and $T4/$T5 are the d-register halves of q12.  $Xfer
# (aliasing $t4) walks the stack area where X[i]+K[i] values are
# staged for the scalar round code; $j counts rounds for body_00_15.
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
  234. sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
  235. sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
# Thunk for otherwise-undefined sub names (simplified x86-style
# perlasm): a call such as &vadd_i32($d,$s1,$s2) appends the line
# "\tvadd.i32\td,s1,s2\n" to $code.  Underscores in the mnemonic
# become dots; the LAST argument is emitted as an immediate ("#n")
# when it is a pure number ($arg*1 eq $arg compares the numified
# value back against the original string).
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# Emit the NEON message-schedule update for one quadword (4 words) of
# the rotating @X window, interleaved with four scalar rounds: each
# &$body call returns one round's worth of deferred-instruction
# closures, and eval'ing one closure between consecutive NEON ops lets
# the NEON and integer pipelines dual-issue.  Finishes by pre-adding
# the next K[i] quadword and storing X+K to the stack transfer area
# ($Xfer) for the scalar round code to consume.  The exact statement
# order is the scheduling — do not reorder.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
# Variant of Xupdate for the final 16 rounds of a block: no schedule
# expansion is needed, so just load the next K[i] quadword, byte-swap
# the freshly loaded input words, pre-add K and stage X+K on the stack
# transfer area — again interleaved with four scalar rounds.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
# Return one scalar SHA-256 round as a LIST of strings; each string is
# eval'ed later (inside Xupdate/Xpreload) and appends one instruction
# to $code via the AUTOLOAD thunk.  Evaluation is deferred so that $j
# and the current @V rotation are captured at emission time; the last
# element advances $j, rotates @V and swaps $t2/$t3 exactly like the
# non-NEON BODY_00_15 (same deferred-Maj trick).
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
# NEON entry point: save the original sp (the stack is then re-aligned
# for 128-bit stores), load and byte-swap the first 64-byte block,
# pre-add the first four K quadwords, and stage everything on the
# stack before dropping into the .L_00_48 schedule/round loop.
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.align	4
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str	$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$inp,[sp,#68]
	mov	$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str	$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
# Rounds 0..47: four Xupdate calls emit 16 rounds (one quadword of
# schedule expansion each); at run time .L_00_48 iterates three times,
# terminating when the zero word after K256 is read ($t1==0 below).
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
# Rounds 48..63: no further schedule expansion, interleaved with the
# (speculative) load/swap of the next block's input words.
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
# Per-block epilogue: accumulate the working registers into the
# context ($t1 holds the ctx pointer pulled from [sp,#64] at round
# 31), then either restart .L_00_48 for the next block or restore the
# original sp and return.
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
#endif
___
}}}
  491. $code.=<<___;
  492. .size sha256_block_data_order,.-sha256_block_data_order
  493. .asciz "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
  494. .align 2
  495. .comm OPENSSL_armcap_P,4,4
  496. #endif
  497. ___
  498. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  499. $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
  500. print $code;
  501. close STDOUT; # enforce flush