#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
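
# Usage (inferred from the argument handling below, not part of the original
# header): perl <this-script> [flavour] [output-file]; any flavour other than
# "void" pipes the generated code through arm-xlate.pl for the target
# assembler.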
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}

$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
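
# @Sigma0/@Sigma1 above hold the FIPS 180-4 "big sigma" rotate amounts and
# @sigma0/@sigma1 the "small sigma" rotate/shift amounts.
#
# BODY_00_15 emits one integer-only SHA-256 round: h accumulates X[i],
# K256[i], Sigma1(e) and Ch(e,f,g); d+=h yields the next e; Sigma0(a) is then
# added to h, which becomes the next a after the register rotation in the
# caller. The Maj(a,b,c) addition is deferred to the start of the following
# round ("from the past").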
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
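
# BODY_16_XX first expands the message schedule in place on the stack,
# X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]),
# then falls through into BODY_00_15 for the round itself.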
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
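
# Static part of the module: the K256 constant table, the OPENSSL_armcap_P
# capability probe, and the integer-only entry point sha256_block_data_order,
# which branches to the NEON or ARMv8 code paths when the CPU supports them.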
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# if defined(__thumb2__) && !defined(__APPLE__)
# define adrl adr
.thumb
# else
.code 32
# endif
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
#else
adr r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
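
# Xupdate computes four new message-schedule words with NEON (sigma0/sigma1
# built from vshr/vsli shift-and-insert pairs), adds the next four K256
# constants and stores W+K at $Xfer, all while eval'ing the interleaved
# scalar round fragments supplied via $body.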
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
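
# body_00_15 returns one scalar round as a list of Perl snippets; Xupdate and
# Xpreload eval these one at a time between NEON instructions, so the integer
# and NEON pipelines stay busy simultaneously.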
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
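
# NEON entry point: carve an aligned 16-word transfer area out of the stack,
# load the first 64-byte block and the first 16 K256 words, then iterate
# .L_00_48 and the Xpreload tail until the input is exhausted.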
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
adrl $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
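
# ARMv8 path: the Crypto Extension instructions sha256h/sha256h2/sha256su0/
# sha256su1 process the block four words at a time. Because older assemblers
# do not know these mnemonics, they are re-encoded as INST() byte sequences by
# unsha256() near the end of this script.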
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__) && !defined(__APPLE__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __APPLE__
sub $Ktbl,$Ktbl,#256+32
# elif defined(__thumb2__)
adr $Ktbl,.LARMv8
sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
adrl $Ktbl,K256
# endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
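
# Copy this script's leading comment block into the generated file, turning
# the Perl '#' comments into assembler '@' comments.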
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
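
# Hand-assemble the SHAv8 mnemonics: base opcodes for the four instructions,
# with the register fields filled in by unsha256() below.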
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
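
# Final pass over $code: evaluate `...` expressions, replace sha256* mnemonics
# with their INST() encodings, and rewrite ret/bx lr (the latter as a literal
# .word) so the output still assembles with -march=armv4.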
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush