#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number
# of utilized registers to 16, which implies additional NEON load
# and integer instructions. This has no effect on the mighty Apple
# A7, where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for
# the 128-bit CBC encrypt case. On Cortex-A57, parallelizable-mode
# performance seems to be limited by the sheer number of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#			CBC enc	 CBC dec  CTR
# Apple A7		2.39	 1.20	  1.20
# Cortex-A53		1.32	 1.29	  1.46
# Cortex-A57(*)		1.95	 0.85	  0.93
# Denver		1.96	 0.86	  0.80
#
# (*) original 3.64/1.34/1.32 results were for the r0p0 revision
#     and remain the same even for the updated module;
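
# The script takes the perlasm "flavour" (which selects 32- vs 64-bit
# output and the assembler dialect) and the output file name, both of
# which are forwarded to arm-xlate.pl below. A typical invocation,
# with illustrative arguments:
#
#	perl aesv8-armx.pl linux64 aesv8-armx.S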
$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch	armv8-a+crypto
#endif
___
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
#^^^^^^ this is done to simplify adoption by not depending
#	on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON mnemonics are mostly 32-bit, integer ones mostly 64-bit. The
# goal is to maintain both 32- and 64-bit codes within a single
# module and transliterate common code to either flavour with
# regex voodoo.
#
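# For example, the 64-bit pass at the bottom of this file rewrites
#	vld1.32	{q8},[$key_],#16
# into
#	ld1	{v16.4s},[x7],#16
# (q8 maps to v16, the leading "v" is stripped, and the ".32" suffix
# turns the default ".16b" arrangement into ".4s").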
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));

$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01		// initial round constant, splatted
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b		// rcon wraps from 0x80 to 0x1b
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort		// return -1 if input or output pointer is NULL
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort		// return -2 unless bits is 128, 192 or 256
	adr	$ptr,.Lrcon
	cmp	$bits,#192
	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32
	b.lt	.Loop128
	b.eq	.L192
	b	.L256
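// Each .Loop128 iteration below derives one round key: vtbl rotates
// the last word via the rotate-n-splat mask, "aese" against an
// all-zero round key degenerates to SubBytes (the splat makes the
// accompanying ShiftRows a no-op), and the vext/veor ladder folds
// the previous round key words and the round constant back in.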
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128
	vld1.32	{$rcon},[$ptr]
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50
	mov	$rounds,#10
	b	.Ldone
.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask
.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192
	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone
.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16
.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone
	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	veor	$in1,$in1,$key
	b	.Loop256
.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0
.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key		// expand the encryption schedule first
	cmp	x0,#0
	b.ne	.Ldec_key_abort
	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
// Reverse the order of the round keys and run InvMixColumns over
// all but the first and last, converting the schedule for use by
// the equivalent-inverse-cipher decrypt path.
.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc
	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]
	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
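
# gen_block below emits the one-block en/decrypt routines. The main
# loop consumes two round keys per spin (hence $rounds is pre-biased
# by -2), and the last two rounds are peeled: the final aes round
# has no MixColumns step, and the last round key is applied with a
# plain veor.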
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0
	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15 preloaded key schedule
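# The argument layout matches the usual AES_cbc_encrypt-style C
# prototype (a presumed signature, for orientation only):
#   in ($inp), out ($out), length ($len), key ($key), ivec ($ivp),
#   enc flag ($enc).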
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec
	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128
	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
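// Decryption runs three blocks abreast to hide the aesd/aesimc
// latency; unlike encryption there is no chaining dependency
// between output blocks, only the last ciphertext block has to be
// carried over as the next IV.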
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop
.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one

	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done
.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule
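# Note that $len counts 16-byte blocks rather than bytes here, and
# only the low 32 bits of the IV are treated as a counter, hence the
# "ctr32" in the name.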
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	ldr	$ctr, [$ivp, #12]	// last word of IV is the 32-bit counter
	vld1.32	{$dat0},[$ivp]
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32
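// As in CBC decrypt, three blocks run in parallel. The 32-bit
// counter sits in the top lane (last word) of each data vector:
// every iteration derives the next three counter values,
// byte-reverses them on little-endian hosts and reinserts them,
// while the rest of the IV stays put.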
.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15
	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32
	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15
	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}

$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;
	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
	sprintf ".inst\t0x%08x\t//%s %s",
		$opcode{$mnemonic}|$1|($2<<5),
		$mnemonic,$arg;
    };
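    # For example, "aese v0.16b,v1.16b" would be emitted as
    # ".inst 0x4e284820" (base opcode 0x4e284800, Rd=0 in bits 0-4,
    # Rn=1 in bits 5-9). The helper is kept for reference: the
    # substitution that used it is commented out below, because
    # ".arch armv8-a+crypto" above lets the assembler encode the AES
    # instructions natively.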
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;					# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;
	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so
	    # the opcode word can be emitted as raw bytes. The correct
	    # solution would be the .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
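    # For example, "aese q0,q1" would be emitted as
    # ".byte 0x02,0x03,0xb0,0xf3": opcode 0xf3b00300 with q1 encoded
    # in the source-register bits, stored little-endian.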
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
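    # A q-register vtbl has no direct 32-bit equivalent, so it is
    # split into two d-register halves: "vtbl.8 q0,{q1},q2" would
    # become "vtbl.8 d0,{q1},d4" followed by "vtbl.8 d1,{q1},d5".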
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
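    # Likewise for lane duplication: "vdup.32 q0,q1[3]" would become
    # "vdup.32 q0,d3[1]", re-expressing the q-register lane as a
    # d-register lane.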
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
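    # And for lane moves: "vmov.32 q0[3],r4" would become
    # "vmov.32 d1[1],r4".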
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}
close STDOUT;