
Remove inconsistency in ARM support.

This facilitates "universal" builds, ones that target multiple
architectures, e.g. ARMv5 through ARMv7.

(Imported from upstream's c1669e1c205dc8e695fb0c10a655f434e758b9f7)

This is a change from a while ago which was a source of divergence
between our perlasm and upstream's. This change in upstream came with
the following comment in Configure:

  Note that -march is not among compiler options in below linux-armv4
  target line. Not specifying one is intentional to give you choice to:

  a) rely on your compiler default by not specifying one;
  b) specify your target platform explicitly for optimal performance,
     e.g. -march=armv6 or -march=armv7-a;
  c) build "universal" binary that targets *range* of platforms by
     specifying minimum and maximum supported architecture;

  As for c) option. It actually makes no sense to specify maximum to be
  less than ARMv7, because it's the least requirement for run-time
  switch between platform-specific code paths. And without run-time
  switch performance would be equivalent to one for minimum. Secondly,
  there are some natural limitations that you'd have to accept and
  respect. Most notably you can *not* build "universal" binary for
  big-endian platform. This is because ARMv7 processor always picks
  instructions in little-endian order. Another similar limitation is
  that -mthumb can't "cross" -march=armv6t2 boundary, because that's
  where it became Thumb-2. Well, this limitation is a bit artificial,
  because it's not really impossible, but it's deemed too tricky to
  support. And of course you have to be sure that your binutils are
  actually up to the task of handling maximum target platform.

Change-Id: Ie5f674d603393f0a1354a0d0973987484a4a650c
Reviewed-on: https://boringssl-review.googlesource.com/4488
Reviewed-by: Adam Langley <agl@google.com>
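The run-time switch the Configure comment refers to lives in the C caller, not
in this file. As a minimal sketch of what a "universal" (ARMv5 minimum, ARMv7
maximum) build relies on — modeled on the gcm128.c dispatch; the
OPENSSL_armcap_P flag word and ARMV7_NEON bit are assumed to come from
<openssl/arm_arch.h>, and the Htable parameter type is deliberately
simplified here:

    #include <stddef.h>
    #include <stdint.h>
    #include <openssl/arm_arch.h>   /* assumed: OPENSSL_armcap_P, ARMV7_NEON */

    /* Routines emitted by this module; argument order mirrors r0-r3 below. */
    void gcm_gmult_4bit(uint64_t Xi[2], const void *Htable);
    void gcm_ghash_4bit(uint64_t Xi[2], const void *Htable,
                        const uint8_t *inp, size_t len);
    void gcm_gmult_neon(uint64_t Xi[2], const void *Htable);
    void gcm_ghash_neon(uint64_t Xi[2], const void *Htable,
                        const uint8_t *inp, size_t len);

    typedef void (*gmult_func)(uint64_t[2], const void *);
    typedef void (*ghash_func)(uint64_t[2], const void *,
                               const uint8_t *, size_t);

    /* Both code paths are compiled in; the NEON one is only ever reached
     * through a capability check performed once at context setup time. */
    static void pick_ghash(gmult_func *gmult, ghash_func *ghash) {
      if (OPENSSL_armcap_P & ARMV7_NEON) {
        *gmult = gcm_gmult_neon;   /* NEON path, selected at run time */
        *ghash = gcm_ghash_neon;
      } else {
        *gmult = gcm_gmult_4bit;   /* integer-only ARMv4 path */
        *ghash = gcm_ghash_4bit;
      }
    }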
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
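#
# For orientation, clmul64x64() below is assumed to compute a plain
# 64x64-bit carry-less (polynomial) product of its two double-word
# inputs. A minimal C model of that operation (an illustrative sketch,
# not part of the generated code) is:
#
#	static void clmul64x64_ref(uint64_t r[2], uint64_t a, uint64_t b)
#	{
#		uint64_t lo = 0, hi = 0;		/* 127-bit product */
#		for (int i = 0; i < 64; i++) {
#			if ((b >> i) & 1) {		/* b has term x^i */
#				lo ^= a << i;		/* product bits 0..63 */
#				if (i) hi ^= a >> (64 - i);	/* bits 64..126 */
#			}
#		}
#		r[0] = lo;
#		r[1] = hi;
#	}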
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
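#
# As a concrete (assumed) picture of that layout: each 128-bit Htable
# entry is consumed here as if the caller had declared
#
#	typedef struct { uint64_t lo, hi; } htable_entry;
#	/* Htable[i].lo at the lower address, Htable[i].hi at the higher, */
#	/* native byte order within each dword.                           */
#
# i.e. the reverse of the half-ordering used by the portable C code;
# gcm128.c remains the authoritative reference for the caller side.
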
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;

sub Zsmash() {
  my $i=12;
  my @args=@_;
  for ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev $_,$_
	str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str $_,[$Xi,#$i]
#else
	mov $Tlh,$_,lsr#8
	strb $_,[$Xi,#$i+3]
	mov $Thl,$_,lsr#16
	strb $Tlh,[$Xi,#$i+2]
	mov $Thh,$_,lsr#24
	strb $Thl,[$Xi,#$i+1]
	strb $Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}
$code=<<___;
#include <openssl/arm_arch.h>

.syntax unified

.text
.code 32

#ifdef __clang__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif

.type rem_4bit,%object
.align 5
rem_4bit:
	.short 0x0000,0x1C20,0x3840,0x2460
	.short 0x7080,0x6CA0,0x48C0,0x54E0
	.short 0xE100,0xFD20,0xD940,0xC560
	.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit

.type rem_4bit_get,%function
rem_4bit_get:
	sub $rem_4bit,pc,#8
	sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
	b .Lrem_4bit_got
	nop
.size rem_4bit_get,.-rem_4bit_get

.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub r12,pc,#8
	add $len,$inp,$len @ $len to point at the end
	stmdb sp!,{r3-r11,lr} @ save $len/end too
	sub r12,r12,#48 @ &rem_4bit
	ldmia r12,{r4-r11} @ copy rem_4bit ...
	stmdb sp!,{r4-r11} @ ... to stack

	ldrb $nlo,[$inp,#15]
	ldrb $nhi,[$Xi,#15]
.Louter:
	eor $nlo,$nlo,$nhi
	and $nhi,$nlo,#0xf0
	and $nlo,$nlo,#0x0f
	mov $cnt,#14

	add $Zhh,$Htbl,$nlo,lsl#4
	ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
	add $Thh,$Htbl,$nhi
	ldrb $nlo,[$inp,#14]

	and $nhi,$Zll,#0xf @ rem
	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
	add $nhi,$nhi,$nhi
	eor $Zll,$Tll,$Zll,lsr#4
	ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
	eor $Zll,$Zll,$Zlh,lsl#28
	ldrb $nhi,[$Xi,#14]
	eor $Zlh,$Tlh,$Zlh,lsr#4
	eor $Zlh,$Zlh,$Zhl,lsl#28
	eor $Zhl,$Thl,$Zhl,lsr#4
	eor $Zhl,$Zhl,$Zhh,lsl#28
	eor $Zhh,$Thh,$Zhh,lsr#4
	eor $nlo,$nlo,$nhi
	and $nhi,$nlo,#0xf0
	and $nlo,$nlo,#0x0f
	eor $Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add $Thh,$Htbl,$nlo,lsl#4
	and $nlo,$Zll,#0xf @ rem
	subs $cnt,$cnt,#1
	add $nlo,$nlo,$nlo
	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
	eor $Zll,$Tll,$Zll,lsr#4
	eor $Zll,$Zll,$Zlh,lsl#28
	eor $Zlh,$Tlh,$Zlh,lsr#4
	eor $Zlh,$Zlh,$Zhl,lsl#28
	ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
	eor $Zhl,$Thl,$Zhl,lsr#4
	ldrbpl $nlo,[$inp,$cnt]
	eor $Zhl,$Zhl,$Zhh,lsl#28
	eor $Zhh,$Thh,$Zhh,lsr#4

	add $Thh,$Htbl,$nhi
	and $nhi,$Zll,#0xf @ rem
	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
	add $nhi,$nhi,$nhi
	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
	eor $Zll,$Tll,$Zll,lsr#4
	ldrbpl $Tll,[$Xi,$cnt]
	eor $Zll,$Zll,$Zlh,lsl#28
	eor $Zlh,$Tlh,$Zlh,lsr#4
	ldrh $Tlh,[sp,$nhi]
	eor $Zlh,$Zlh,$Zhl,lsl#28
	eor $Zhl,$Thl,$Zhl,lsr#4
	eor $Zhl,$Zhl,$Zhh,lsl#28
	eorpl $nlo,$nlo,$Tll
	eor $Zhh,$Thh,$Zhh,lsr#4
	andpl $nhi,$nlo,#0xf0
	andpl $nlo,$nlo,#0x0f
	eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
	bpl .Linner

	ldr $len,[sp,#32] @ re-load $len/end
	add $inp,$inp,#16
	mov $nhi,$Zll
___
&Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
$code.=<<___;
	bne .Louter

	add sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit

.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb sp!,{r4-r11,lr}
	ldrb $nlo,[$Xi,#15]
	b rem_4bit_get
.Lrem_4bit_got:
	and $nhi,$nlo,#0xf0
	and $nlo,$nlo,#0x0f
	mov $cnt,#14

	add $Zhh,$Htbl,$nlo,lsl#4
	ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
	ldrb $nlo,[$Xi,#14]

	add $Thh,$Htbl,$nhi
	and $nhi,$Zll,#0xf @ rem
	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
	add $nhi,$nhi,$nhi
	eor $Zll,$Tll,$Zll,lsr#4
	ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
	eor $Zll,$Zll,$Zlh,lsl#28
	eor $Zlh,$Tlh,$Zlh,lsr#4
	eor $Zlh,$Zlh,$Zhl,lsl#28
	eor $Zhl,$Thl,$Zhl,lsr#4
	eor $Zhl,$Zhl,$Zhh,lsl#28
	eor $Zhh,$Thh,$Zhh,lsr#4
	and $nhi,$nlo,#0xf0
	eor $Zhh,$Zhh,$Tll,lsl#16
	and $nlo,$nlo,#0x0f

.Loop:
	add $Thh,$Htbl,$nlo,lsl#4
	and $nlo,$Zll,#0xf @ rem
	subs $cnt,$cnt,#1
	add $nlo,$nlo,$nlo
	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
	eor $Zll,$Tll,$Zll,lsr#4
	eor $Zll,$Zll,$Zlh,lsl#28
	eor $Zlh,$Tlh,$Zlh,lsr#4
	eor $Zlh,$Zlh,$Zhl,lsl#28
	ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
	eor $Zhl,$Thl,$Zhl,lsr#4
	ldrbpl $nlo,[$Xi,$cnt]
	eor $Zhl,$Zhl,$Zhh,lsl#28
	eor $Zhh,$Thh,$Zhh,lsr#4

	add $Thh,$Htbl,$nhi
	and $nhi,$Zll,#0xf @ rem
	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
	add $nhi,$nhi,$nhi
	ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
	eor $Zll,$Tll,$Zll,lsr#4
	eor $Zll,$Zll,$Zlh,lsl#28
	eor $Zlh,$Tlh,$Zlh,lsr#4
	ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
	eor $Zlh,$Zlh,$Zhl,lsl#28
	eor $Zhl,$Thl,$Zhl,lsr#4
	eor $Zhl,$Zhl,$Zhh,lsl#28
	eor $Zhh,$Thh,$Zhh,lsr#4
	andpl $nhi,$nlo,#0xf0
	andpl $nlo,$nlo,#0x0f
	eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
	bpl .Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
	vext.8 $t0#lo, $a, $a, #1 @ A1
	vmull.p8 $t0, $t0#lo, $b @ F = A1*B
	vext.8 $r#lo, $b, $b, #1 @ B1
	vmull.p8 $r, $a, $r#lo @ E = A*B1
	vext.8 $t1#lo, $a, $a, #2 @ A2
	vmull.p8 $t1, $t1#lo, $b @ H = A2*B
	vext.8 $t3#lo, $b, $b, #2 @ B2
	vmull.p8 $t3, $a, $t3#lo @ G = A*B2
	vext.8 $t2#lo, $a, $a, #3 @ A3
	veor $t0, $t0, $r @ L = E + F
	vmull.p8 $t2, $t2#lo, $b @ J = A3*B
	vext.8 $r#lo, $b, $b, #3 @ B3
	veor $t1, $t1, $t3 @ M = G + H
	vmull.p8 $r, $a, $r#lo @ I = A*B3
	veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
	vand $t0#hi, $t0#hi, $k48
	vext.8 $t3#lo, $b, $b, #4 @ B4
	veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
	vand $t1#hi, $t1#hi, $k32
	vmull.p8 $t3, $a, $t3#lo @ K = A*B4
	veor $t2, $t2, $r @ N = I + J
	veor $t0#lo, $t0#lo, $t0#hi
	veor $t1#lo, $t1#lo, $t1#hi
	veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
	vand $t2#hi, $t2#hi, $k16
	vext.8 $t0, $t0, $t0, #15
	veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
	vmov.i64 $t3#hi, #0
	vext.8 $t1, $t1, $t1, #14
	veor $t2#lo, $t2#lo, $t2#hi
	vmull.p8 $r, $a, $b @ D = A*B
	vext.8 $t3, $t3, $t3, #12
	vext.8 $t2, $t2, $t2, #13
	veor $t0, $t0, $t1
	veor $t2, $t2, $t3
	veor $r, $r, $t0
	veor $r, $r, $t2
___
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
	vld1.64 $IN#hi,[r1]! @ load H
	vmov.i8 $t0,#0xe1
	vld1.64 $IN#lo,[r1]
	vshl.i64 $t0#hi,#57
	vshr.u64 $t0#lo,#63 @ t0=0xc2....01
	vdup.8 $t1,$IN#hi[7]
	vshr.u64 $Hlo,$IN#lo,#63
	vshr.s8 $t1,#7 @ broadcast carry bit
	vshl.i64 $IN,$IN,#1
	vand $t0,$t0,$t1
	vorr $IN#hi,$Hlo @ H<<<=1
	veor $IN,$IN,$t0 @ twisted H
	vstmia r0,{$IN}

	ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon

.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
	vld1.64 $IN#hi,[$Xi]! @ load Xi
	vld1.64 $IN#lo,[$Xi]!
	vmov.i64 $k48,#0x0000ffffffffffff
	vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
	vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8 $IN,$IN
#endif
	vmov.i64 $k16,#0x000000000000ffff
	veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
	mov $len,#16
	b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon

.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
	vld1.64 $Xl#hi,[$Xi]! @ load Xi
	vld1.64 $Xl#lo,[$Xi]!
	vmov.i64 $k48,#0x0000ffffffffffff
	vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
	vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8 $Xl,$Xl
#endif
	vmov.i64 $k16,#0x000000000000ffff
	veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing

.Loop_neon:
	vld1.64 $IN#hi,[$inp]! @ load inp
	vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8 $IN,$IN
#endif
	veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
	veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
	veor $Xm,$Xm,$Xl @ Karatsuba post-processing
	veor $Xm,$Xm,$Xh
	veor $Xl#hi,$Xl#hi,$Xm#lo
	veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64 $t1,$Xl,#57 @ 1st phase
	vshl.i64 $t2,$Xl,#62
	veor $t2,$t2,$t1 @
	vshl.i64 $t1,$Xl,#63
	veor $t2, $t2, $t1 @
	veor $Xl#hi,$Xl#hi,$t2#lo @
	veor $Xh#lo,$Xh#lo,$t2#hi

	vshr.u64 $t2,$Xl,#1 @ 2nd phase
	veor $Xh,$Xh,$Xl
	veor $Xl,$Xl,$t2 @
	vshr.u64 $t2,$t2,#6
	vshr.u64 $Xl,$Xl,#1 @
	veor $Xl,$Xl,$Xh @
	veor $Xl,$Xl,$t2 @

	subs $len,#16
	bne .Loop_neon

#ifdef __ARMEL__
	vrev64.8 $Xl,$Xl
#endif
	sub $Xi,#16
	vst1.64 $Xl#hi,[$Xi]! @ write out Xi
	vst1.64 $Xl#lo,[$Xi]

	ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}

$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
	s/\bret\b/bx lr/go or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT; # enforce flush