#!/usr/bin/env perl

# Specific modes implementations for SPARC Architecture 2011. There
# is a T4 dependency, though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing the crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it's the so-called "block initializing
# store", which cancels the "read" in "read-update-write" on cache lines.
# This is a "cooperative" optimization, as it reduces overall pressure on
# the memory interface. The benefit can't be observed/quantified with the
# usual benchmarks; on the contrary, you may notice that single-thread
# performance for parallelizable modes is ~1.5% worse for the largest
# block sizes [though a few percent better for shorter ones]. All this
# is based on suggestions from David Miller.
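#
# Concretely, the T4-specific ASI referred to above is the 0xe2 value used
# with the stda instructions in the .L*_blk code paths below (flagged there
# as "ASI_BLK_INIT, T4-specific").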
sub asm_init {          # to be called with @ARGV as argument
    for (@_)            { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }

    if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else                { $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}
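
# A minimal usage sketch (illustrative only; the actual caller is the
# algorithm-specific generator script, which is expected to define the
# _${alg}${bits}_load_*key and _${alg}${bits}_*crypt_{1,2}x helpers that
# the generators below call):
#
#	require "sparcv9_modes.pl";
#	&asm_init(@ARGV);                       # sets $::bias/$::frame/$::size_t_cc
#	$::evp = 1;                             # choose the EVP-style ivec handling
#	&alg_cbc_encrypt_implement("aes",128);  # emits aes128_t4_cbc_encrypt
#	&emit_assembler();                      # translate VIS/crypto opcodes, print
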
# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
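# The generated entry points share this argument layout: %i0..%i4 carry
# (input, output, byte length, key schedule, ivec/counter); alg_xts_implement
# below remaps the same window registers to take two key schedules plus the
# tweak ivec in %i5. The %l registers hold the shift amounts and masks used
# for misaligned input/output and for the "block store" path decision.
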
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl ${alg}${bits}_t4_cbc_encrypt
.align 32
${alg}${bits}_t4_cbc_encrypt:
	save %sp, -$::frame, %sp
	sub $inp, $out, $blk_init ! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc $ivec, 7, $ivoff
	alignaddr $ivec, %g0, $ivec
	ldd [$ivec + 0], %f0 ! load ivec
	bz,pt %icc, 1f
	ldd [$ivec + 8], %f2
	ldd [$ivec + 16], %f4
	faligndata %f0, %f2, %f0
	faligndata %f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld [$ivec + 0], %f0
	ld [$ivec + 4], %f1
	ld [$ivec + 8], %f2
	ld [$ivec + 12], %f3
___
$::code.=<<___;
	prefetch [$inp], 20
	prefetch [$inp + 63], 20
	call _${alg}${bits}_load_enckey
	and $inp, 7, $ileft
	andn $inp, 7, $inp
	sll $ileft, 3, $ileft
	mov 64, $iright
	mov 0xff, $omask
	sub $iright, $ileft, $iright
	and $out, 7, $ooff
	cmp $len, 127
	movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
	movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
	brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
	srl $omask, $ooff, $omask
	alignaddrl $out, %g0, $out
	srlx $len, 4, $len
	prefetch [$out], 22
.L${bits}_cbc_enc_loop:
	ldx [$inp + 0], %o0
	brz,pt $ileft, 4f
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	sllx %o1, $ileft, %o1
	or %g1, %o0, %o0
	srlx %o2, $iright, %o2
	or %o2, %o1, %o1
4:
	xor %g4, %o0, %o0 ! ^= rk[0]
	xor %g5, %o1, %o1
	movxtod %o0, %f12
	movxtod %o1, %f14
	fxor %f12, %f0, %f0 ! ^= ivec
	fxor %f14, %f2, %f2
	prefetch [$out + 63], 22
	prefetch [$inp + 16+63], 20
	call _${alg}${bits}_encrypt_1x
	add $inp, 16, $inp
	brnz,pn $ooff, 2f
	sub $len, 1, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	brnz,pt $len, .L${bits}_cbc_enc_loop
	add $out, 16, $out
___
$::code.=<<___ if ($::evp);
	st %f0, [$ivec + 0]
	st %f1, [$ivec + 4]
	st %f2, [$ivec + 8]
	st %f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, 3f
	nop
	std %f0, [$ivec + 0] ! write out ivec
	std %f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f4 ! handle unaligned output
	faligndata %f0, %f2, %f6
	faligndata %f2, %f2, %f8
	stda %f4, [$out + $omask]0xc0 ! partial store
	std %f6, [$out + 8]
	add $out, 16, $out
	orn %g0, $omask, $omask
	stda %f8, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_cbc_enc_loop+4
	orn %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st %f0, [$ivec + 0]
	st %f1, [$ivec + 4]
	st %f2, [$ivec + 8]
	st %f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, 3f
	nop
	std %f0, [$ivec + 0] ! write out ivec
	std %f2, [$ivec + 8]
	ret
	restore
.align 16
3:	alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
	mov 0xff, $omask
	srl $omask, $ivoff, $omask
	faligndata %f0, %f0, %f4
	faligndata %f0, %f2, %f6
	faligndata %f2, %f2, %f8
	stda %f4, [$ivec + $omask]0xc0
	std %f6, [$ivec + 8]
	add $ivec, 16, $ivec
	orn %g0, $omask, $omask
	stda %f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}cbc_enc_blk:
	add $out, $len, $blk_init
	and $blk_init, 63, $blk_init ! tail
	sub $len, $blk_init, $len
	add $blk_init, 15, $blk_init ! round up to 16n
	srlx $len, 4, $len
	srl $blk_init, 4, $blk_init
.L${bits}_cbc_enc_blk_loop:
	ldx [$inp + 0], %o0
	brz,pt $ileft, 5f
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	sllx %o1, $ileft, %o1
	or %g1, %o0, %o0
	srlx %o2, $iright, %o2
	or %o2, %o1, %o1
5:
	xor %g4, %o0, %o0 ! ^= rk[0]
	xor %g5, %o1, %o1
	movxtod %o0, %f12
	movxtod %o1, %f14
	fxor %f12, %f0, %f0 ! ^= ivec
	fxor %f14, %f2, %f2
	prefetch [$inp + 16+63], 20
	call _${alg}${bits}_encrypt_1x
	add $inp, 16, $inp
	sub $len, 1, $len
	stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	brnz,pt $len, .L${bits}_cbc_enc_blk_loop
	add $out, 8, $out
	membar #StoreLoad|#StoreStore
	brnz,pt $blk_init, .L${bits}_cbc_enc_loop
	mov $blk_init, $len
___
$::code.=<<___ if ($::evp);
	st %f0, [$ivec + 0]
	st %f1, [$ivec + 4]
	st %f2, [$ivec + 8]
	st %f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, 3b
	nop
	std %f0, [$ivec + 0] ! write out ivec
	std %f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type ${alg}${bits}_t4_cbc_encrypt,#function
.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
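
# CBC encryption is inherently serial (each block's input depends on the
# previous ciphertext block), so the routine above processes one block per
# iteration. CBC decryption and CTR are parallelizable, which is why the
# generators below interleave two blocks per iteration whenever the block
# count allows it.
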
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl ${alg}${bits}_t4_cbc_decrypt
.align 32
${alg}${bits}_t4_cbc_decrypt:
	save %sp, -$::frame, %sp
	sub $inp, $out, $blk_init ! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc $ivec, 7, $ivoff
	alignaddr $ivec, %g0, $ivec
	ldd [$ivec + 0], %f12 ! load ivec
	bz,pt %icc, 1f
	ldd [$ivec + 8], %f14
	ldd [$ivec + 16], %f0
	faligndata %f12, %f14, %f12
	faligndata %f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld [$ivec + 0], %f12 ! load ivec
	ld [$ivec + 4], %f13
	ld [$ivec + 8], %f14
	ld [$ivec + 12], %f15
___
$::code.=<<___;
	prefetch [$inp], 20
	prefetch [$inp + 63], 20
	call _${alg}${bits}_load_deckey
	and $inp, 7, $ileft
	andn $inp, 7, $inp
	sll $ileft, 3, $ileft
	mov 64, $iright
	mov 0xff, $omask
	sub $iright, $ileft, $iright
	and $out, 7, $ooff
	cmp $len, 255
	movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
	movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
	brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
	srl $omask, $ooff, $omask
	andcc $len, 16, %g0 ! is number of blocks even?
	srlx $len, 4, $len
	alignaddrl $out, %g0, $out
	bz %icc, .L${bits}_cbc_dec_loop2x
	prefetch [$out], 22
.L${bits}_cbc_dec_loop:
	ldx [$inp + 0], %o0
	brz,pt $ileft, 4f
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	sllx %o1, $ileft, %o1
	or %g1, %o0, %o0
	srlx %o2, $iright, %o2
	or %o2, %o1, %o1
4:
	xor %g4, %o0, %o2 ! ^= rk[0]
	xor %g5, %o1, %o3
	movxtod %o2, %f0
	movxtod %o3, %f2
	prefetch [$out + 63], 22
	prefetch [$inp + 16+63], 20
	call _${alg}${bits}_decrypt_1x
	add $inp, 16, $inp
	fxor %f12, %f0, %f0 ! ^= ivec
	fxor %f14, %f2, %f2
	movxtod %o0, %f12
	movxtod %o1, %f14
	brnz,pn $ooff, 2f
	sub $len, 1, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	brnz,pt $len, .L${bits}_cbc_dec_loop2x
	add $out, 16, $out
___
$::code.=<<___ if ($::evp);
	st %f12, [$ivec + 0]
	st %f13, [$ivec + 4]
	st %f14, [$ivec + 8]
	st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop
	std %f12, [$ivec + 0] ! write out ivec
	std %f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f4 ! handle unaligned output
	faligndata %f0, %f2, %f6
	faligndata %f2, %f2, %f8
	stda %f4, [$out + $omask]0xc0 ! partial store
	std %f6, [$out + 8]
	add $out, 16, $out
	orn %g0, $omask, $omask
	stda %f8, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
	orn %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st %f12, [$ivec + 0]
	st %f13, [$ivec + 4]
	st %f14, [$ivec + 8]
	st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop
	std %f12, [$ivec + 0] ! write out ivec
	std %f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_cbc_dec_loop2x:
	ldx [$inp + 0], %o0
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	brz,pt $ileft, 4f
	ldx [$inp + 24], %o3
	ldx [$inp + 32], %o4
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	or %g1, %o0, %o0
	sllx %o1, $ileft, %o1
	srlx %o2, $iright, %g1
	or %g1, %o1, %o1
	sllx %o2, $ileft, %o2
	srlx %o3, $iright, %g1
	or %g1, %o2, %o2
	sllx %o3, $ileft, %o3
	srlx %o4, $iright, %o4
	or %o4, %o3, %o3
4:
	xor %g4, %o0, %o4 ! ^= rk[0]
	xor %g5, %o1, %o5
	movxtod %o4, %f0
	movxtod %o5, %f2
	xor %g4, %o2, %o4
	xor %g5, %o3, %o5
	movxtod %o4, %f4
	movxtod %o5, %f6
	prefetch [$out + 63], 22
	prefetch [$inp + 32+63], 20
	call _${alg}${bits}_decrypt_2x
	add $inp, 32, $inp
	movxtod %o0, %f8
	movxtod %o1, %f10
	fxor %f12, %f0, %f0 ! ^= ivec
	fxor %f14, %f2, %f2
	movxtod %o2, %f12
	movxtod %o3, %f14
	fxor %f8, %f4, %f4
	fxor %f10, %f6, %f6
	brnz,pn $ooff, 2f
	sub $len, 2, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	std %f4, [$out + 16]
	std %f6, [$out + 24]
	brnz,pt $len, .L${bits}_cbc_dec_loop2x
	add $out, 32, $out
___
$::code.=<<___ if ($::evp);
	st %f12, [$ivec + 0]
	st %f13, [$ivec + 4]
	st %f14, [$ivec + 8]
	st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop
	std %f12, [$ivec + 0] ! write out ivec
	std %f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f8 ! handle unaligned output
	faligndata %f0, %f2, %f0
	faligndata %f2, %f4, %f2
	faligndata %f4, %f6, %f4
	faligndata %f6, %f6, %f6
	stda %f8, [$out + $omask]0xc0 ! partial store
	std %f0, [$out + 8]
	std %f2, [$out + 16]
	std %f4, [$out + 24]
	add $out, 32, $out
	orn %g0, $omask, $omask
	stda %f6, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
	orn %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st %f12, [$ivec + 0]
	st %f13, [$ivec + 4]
	st %f14, [$ivec + 8]
	st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop
	std %f12, [$ivec + 0] ! write out ivec
	std %f14, [$ivec + 8]
	ret
	restore
.align 16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
	mov 0xff, $omask
	srl $omask, $ivoff, $omask
	faligndata %f12, %f12, %f0
	faligndata %f12, %f14, %f2
	faligndata %f14, %f14, %f4
	stda %f0, [$ivec + $omask]0xc0
	std %f2, [$ivec + 8]
	add $ivec, 16, $ivec
	orn %g0, $omask, $omask
	stda %f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}cbc_dec_blk:
	add $out, $len, $blk_init
	and $blk_init, 63, $blk_init ! tail
	sub $len, $blk_init, $len
	add $blk_init, 15, $blk_init ! round up to 16n
	srlx $len, 4, $len
	srl $blk_init, 4, $blk_init
	sub $len, 1, $len
	add $blk_init, 1, $blk_init
.L${bits}_cbc_dec_blk_loop2x:
	ldx [$inp + 0], %o0
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	brz,pt $ileft, 5f
	ldx [$inp + 24], %o3
	ldx [$inp + 32], %o4
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	or %g1, %o0, %o0
	sllx %o1, $ileft, %o1
	srlx %o2, $iright, %g1
	or %g1, %o1, %o1
	sllx %o2, $ileft, %o2
	srlx %o3, $iright, %g1
	or %g1, %o2, %o2
	sllx %o3, $ileft, %o3
	srlx %o4, $iright, %o4
	or %o4, %o3, %o3
5:
	xor %g4, %o0, %o4 ! ^= rk[0]
	xor %g5, %o1, %o5
	movxtod %o4, %f0
	movxtod %o5, %f2
	xor %g4, %o2, %o4
	xor %g5, %o3, %o5
	movxtod %o4, %f4
	movxtod %o5, %f6
	prefetch [$inp + 32+63], 20
	call _${alg}${bits}_decrypt_2x
	add $inp, 32, $inp
	subcc $len, 2, $len
	movxtod %o0, %f8
	movxtod %o1, %f10
	fxor %f12, %f0, %f0 ! ^= ivec
	fxor %f14, %f2, %f2
	movxtod %o2, %f12
	movxtod %o3, %f14
	fxor %f8, %f4, %f4
	fxor %f10, %f6, %f6
	stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add $out, 8, $out
	add $blk_init, $len, $len
	andcc $len, 1, %g0 ! is number of blocks even?
	membar #StoreLoad|#StoreStore
	bnz,pt %icc, .L${bits}_cbc_dec_loop
	srl $len, 0, $len
	brnz,pn $len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st %f12, [$ivec + 0] ! write out ivec
	st %f13, [$ivec + 4]
	st %f14, [$ivec + 8]
	st %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop
	std %f12, [$ivec + 0] ! write out ivec
	std %f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type ${alg}${bits}_t4_cbc_decrypt,#function
.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}
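
# The CTR32 generator keeps the counter block in %l4..%l7 and increments only
# its low 32 bits ("srl %l7, 0, %l7 ! clruw"), i.e. a 32-bit big-endian
# counter. The first cipher round is issued inline (aes_eround01/23 or
# camellia_f) against the counter block already XORed with rk[0], so the
# shared helpers are entered past their own first round, at
# _${alg}${bits}_encrypt_1x+8 and _${alg}${bits}_encrypt_2x+16.
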
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl ${alg}${bits}_t4_ctr32_encrypt
.align 32
${alg}${bits}_t4_ctr32_encrypt:
	save %sp, -$::frame, %sp
	prefetch [$inp], 20
	prefetch [$inp + 63], 20
	call _${alg}${bits}_load_enckey
	sllx $len, 4, $len
	ld [$ivec + 0], %l4 ! counter
	ld [$ivec + 4], %l5
	ld [$ivec + 8], %l6
	ld [$ivec + 12], %l7
	sllx %l4, 32, %o5
	or %l5, %o5, %o5
	sllx %l6, 32, %g1
	xor %o5, %g4, %g4 ! ^= rk[0]
	xor %g1, %g5, %g5
	movxtod %g4, %f14 ! most significant 64 bits
	sub $inp, $out, $blk_init ! $inp!=$out
	and $inp, 7, $ileft
	andn $inp, 7, $inp
	sll $ileft, 3, $ileft
	mov 64, $iright
	mov 0xff, $omask
	sub $iright, $ileft, $iright
	and $out, 7, $ooff
	cmp $len, 255
	movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
	movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
	brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
	srl $omask, $ooff, $omask
	andcc $len, 16, %g0 ! is number of blocks even?
	alignaddrl $out, %g0, $out
	bz %icc, .L${bits}_ctr32_loop2x
	srlx $len, 4, $len
.L${bits}_ctr32_loop:
	ldx [$inp + 0], %o0
	brz,pt $ileft, 4f
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	sllx %o1, $ileft, %o1
	or %g1, %o0, %o0
	srlx %o2, $iright, %o2
	or %o2, %o1, %o1
4:
	xor %g5, %l7, %g1 ! ^= rk[0]
	add %l7, 1, %l7
	movxtod %g1, %f2
	srl %l7, 0, %l7 ! clruw
	prefetch [$out + 63], 22
	prefetch [$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01 %f16, %f14, %f2, %f4
	aes_eround23 %f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f %f16, %f2, %f14, %f2
	camellia_f %f18, %f14, %f2, %f0
___
$::code.=<<___;
	call _${alg}${bits}_encrypt_1x+8
	add $inp, 16, $inp
	movxtod %o0, %f10
	movxtod %o1, %f12
	fxor %f10, %f0, %f0 ! ^= inp
	fxor %f12, %f2, %f2
	brnz,pn $ooff, 2f
	sub $len, 1, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	brnz,pt $len, .L${bits}_ctr32_loop2x
	add $out, 16, $out
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f4 ! handle unaligned output
	faligndata %f0, %f2, %f6
	faligndata %f2, %f2, %f8
	stda %f4, [$out + $omask]0xc0 ! partial store
	std %f6, [$out + 8]
	add $out, 16, $out
	orn %g0, $omask, $omask
	stda %f8, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_ctr32_loop2x+4
	orn %g0, $omask, $omask
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_ctr32_loop2x:
	ldx [$inp + 0], %o0
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	brz,pt $ileft, 4f
	ldx [$inp + 24], %o3
	ldx [$inp + 32], %o4
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	or %g1, %o0, %o0
	sllx %o1, $ileft, %o1
	srlx %o2, $iright, %g1
	or %g1, %o1, %o1
	sllx %o2, $ileft, %o2
	srlx %o3, $iright, %g1
	or %g1, %o2, %o2
	sllx %o3, $ileft, %o3
	srlx %o4, $iright, %o4
	or %o4, %o3, %o3
4:
	xor %g5, %l7, %g1 ! ^= rk[0]
	add %l7, 1, %l7
	movxtod %g1, %f2
	srl %l7, 0, %l7 ! clruw
	xor %g5, %l7, %g1
	add %l7, 1, %l7
	movxtod %g1, %f6
	srl %l7, 0, %l7 ! clruw
	prefetch [$out + 63], 22
	prefetch [$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01 %f16, %f14, %f2, %f8
	aes_eround23 %f18, %f14, %f2, %f2
	aes_eround01 %f16, %f14, %f6, %f10
	aes_eround23 %f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f %f16, %f2, %f14, %f2
	camellia_f %f16, %f6, %f14, %f6
	camellia_f %f18, %f14, %f2, %f0
	camellia_f %f18, %f14, %f6, %f4
___
$::code.=<<___;
	call _${alg}${bits}_encrypt_2x+16
	add $inp, 32, $inp
	movxtod %o0, %f8
	movxtod %o1, %f10
	movxtod %o2, %f12
	fxor %f8, %f0, %f0 ! ^= inp
	movxtod %o3, %f8
	fxor %f10, %f2, %f2
	fxor %f12, %f4, %f4
	fxor %f8, %f6, %f6
	brnz,pn $ooff, 2f
	sub $len, 2, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	std %f4, [$out + 16]
	std %f6, [$out + 24]
	brnz,pt $len, .L${bits}_ctr32_loop2x
	add $out, 32, $out
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f8 ! handle unaligned output
	faligndata %f0, %f2, %f0
	faligndata %f2, %f4, %f2
	faligndata %f4, %f6, %f4
	faligndata %f6, %f6, %f6
	stda %f8, [$out + $omask]0xc0 ! partial store
	std %f0, [$out + 8]
	std %f2, [$out + 16]
	std %f4, [$out + 24]
	add $out, 32, $out
	orn %g0, $omask, $omask
	stda %f6, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_ctr32_loop2x+4
	orn %g0, $omask, $omask
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_ctr32_blk:
	add $out, $len, $blk_init
	and $blk_init, 63, $blk_init ! tail
	sub $len, $blk_init, $len
	add $blk_init, 15, $blk_init ! round up to 16n
	srlx $len, 4, $len
	srl $blk_init, 4, $blk_init
	sub $len, 1, $len
	add $blk_init, 1, $blk_init
.L${bits}_ctr32_blk_loop2x:
	ldx [$inp + 0], %o0
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	brz,pt $ileft, 5f
	ldx [$inp + 24], %o3
	ldx [$inp + 32], %o4
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	or %g1, %o0, %o0
	sllx %o1, $ileft, %o1
	srlx %o2, $iright, %g1
	or %g1, %o1, %o1
	sllx %o2, $ileft, %o2
	srlx %o3, $iright, %g1
	or %g1, %o2, %o2
	sllx %o3, $ileft, %o3
	srlx %o4, $iright, %o4
	or %o4, %o3, %o3
5:
	xor %g5, %l7, %g1 ! ^= rk[0]
	add %l7, 1, %l7
	movxtod %g1, %f2
	srl %l7, 0, %l7 ! clruw
	xor %g5, %l7, %g1
	add %l7, 1, %l7
	movxtod %g1, %f6
	srl %l7, 0, %l7 ! clruw
	prefetch [$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01 %f16, %f14, %f2, %f8
	aes_eround23 %f18, %f14, %f2, %f2
	aes_eround01 %f16, %f14, %f6, %f10
	aes_eround23 %f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f %f16, %f2, %f14, %f2
	camellia_f %f16, %f6, %f14, %f6
	camellia_f %f18, %f14, %f2, %f0
	camellia_f %f18, %f14, %f6, %f4
___
$::code.=<<___;
	call _${alg}${bits}_encrypt_2x+16
	add $inp, 32, $inp
	subcc $len, 2, $len
	movxtod %o0, %f8
	movxtod %o1, %f10
	movxtod %o2, %f12
	fxor %f8, %f0, %f0 ! ^= inp
	movxtod %o3, %f8
	fxor %f10, %f2, %f2
	fxor %f12, %f4, %f4
	fxor %f8, %f6, %f6
	stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add $out, 8, $out
	add $blk_init, $len, $len
	andcc $len, 1, %g0 ! is number of blocks even?
	membar #StoreLoad|#StoreStore
	bnz,pt %icc, .L${bits}_ctr32_loop
	srl $len, 0, $len
	brnz,pn $len, .L${bits}_ctr32_loop2x
	nop
	ret
	restore
.type ${alg}${bits}_t4_ctr32_encrypt,#function
.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}
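
# The XTS generators first encrypt the ivec with the second key (the
# ${alg}_t4_encrypt call below) to obtain the tweak, kept in %g3:%g2 and
# byte-swapped via the little-endian ldxa loads and the bshuffle mask set up
# below. Each "next tweak value" step (srax/addcc/and 0x87/addxc/xor)
# multiplies the 128-bit tweak by x modulo the standard XTS polynomial
# x^128 + x^7 + x^2 + x + 1.
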
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl ${alg}${bits}_t4_xts_${dir}crypt
.align 32
${alg}${bits}_t4_xts_${dir}crypt:
	save %sp, -$::frame-16, %sp
	mov $ivec, %o0
	add %fp, $::bias-16, %o1
	call ${alg}_t4_encrypt
	mov $key2, %o2
	add %fp, $::bias-16, %l7
	ldxa [%l7]0x88, %g2
	add %fp, $::bias-8, %l7
	ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
	sethi %hi(0x76543210), %l7
	or %l7, %lo(0x76543210), %l7
	bmask %l7, %g0, %g0 ! byte swap mask
	prefetch [$inp], 20
	prefetch [$inp + 63], 20
	call _${alg}${bits}_load_${dir}ckey
	and $len, 15, $rem
	and $len, -16, $len
___
$code.=<<___ if ($dir eq "de");
	mov 0, %l7
	movrnz $rem, 16, %l7
	sub $len, %l7, $len
___
$code.=<<___;
	sub $inp, $out, $blk_init ! $inp!=$out
	and $inp, 7, $ileft
	andn $inp, 7, $inp
	sll $ileft, 3, $ileft
	mov 64, $iright
	mov 0xff, $omask
	sub $iright, $ileft, $iright
	and $out, 7, $ooff
	cmp $len, 255
	movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
	movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
	brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
	srl $omask, $ooff, $omask
	andcc $len, 16, %g0 ! is number of blocks even?
___
$code.=<<___ if ($dir eq "de");
	brz,pn $len, .L${bits}_xts_${dir}steal
___
$code.=<<___;
	alignaddrl $out, %g0, $out
	bz %icc, .L${bits}_xts_${dir}loop2x
	srlx $len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx [$inp + 0], %o0
	brz,pt $ileft, 4f
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	sllx %o1, $ileft, %o1
	or %g1, %o0, %o0
	srlx %o2, $iright, %o2
	or %o2, %o1, %o1
4:
	movxtod %g2, %f12
	movxtod %g3, %f14
	bshuffle %f12, %f12, %f12
	bshuffle %f14, %f14, %f14
	xor %g4, %o0, %o0 ! ^= rk[0]
	xor %g5, %o1, %o1
	movxtod %o0, %f0
	movxtod %o1, %f2
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	prefetch [$out + 63], 22
	prefetch [$inp + 16+63], 20
	call _${alg}${bits}_${dir}crypt_1x
	add $inp, 16, $inp
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	srax %g3, 63, %l7 ! next tweak value
	addcc %g2, %g2, %g2
	and %l7, 0x87, %l7
	addxc %g3, %g3, %g3
	xor %l7, %g2, %g2
	brnz,pn $ooff, 2f
	sub $len, 1, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	brnz,pt $len, .L${bits}_xts_${dir}loop2x
	add $out, 16, $out
	brnz,pn $rem, .L${bits}_xts_${dir}steal
	nop
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f4 ! handle unaligned output
	faligndata %f0, %f2, %f6
	faligndata %f2, %f2, %f8
	stda %f4, [$out + $omask]0xc0 ! partial store
	std %f6, [$out + 8]
	add $out, 16, $out
	orn %g0, $omask, $omask
	stda %f8, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
	orn %g0, $omask, $omask
	brnz,pn $rem, .L${bits}_xts_${dir}steal
	nop
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_xts_${dir}loop2x:
	ldx [$inp + 0], %o0
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	brz,pt $ileft, 4f
	ldx [$inp + 24], %o3
	ldx [$inp + 32], %o4
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	or %g1, %o0, %o0
	sllx %o1, $ileft, %o1
	srlx %o2, $iright, %g1
	or %g1, %o1, %o1
	sllx %o2, $ileft, %o2
	srlx %o3, $iright, %g1
	or %g1, %o2, %o2
	sllx %o3, $ileft, %o3
	srlx %o4, $iright, %o4
	or %o4, %o3, %o3
4:
	movxtod %g2, %f12
	movxtod %g3, %f14
	bshuffle %f12, %f12, %f12
	bshuffle %f14, %f14, %f14
	srax %g3, 63, %l7 ! next tweak value
	addcc %g2, %g2, %g2
	and %l7, 0x87, %l7
	addxc %g3, %g3, %g3
	xor %l7, %g2, %g2
	movxtod %g2, %f8
	movxtod %g3, %f10
	bshuffle %f8, %f8, %f8
	bshuffle %f10, %f10, %f10
	xor %g4, %o0, %o0 ! ^= rk[0]
	xor %g5, %o1, %o1
	xor %g4, %o2, %o2 ! ^= rk[0]
	xor %g5, %o3, %o3
	movxtod %o0, %f0
	movxtod %o1, %f2
	movxtod %o2, %f4
	movxtod %o3, %f6
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	fxor %f8, %f4, %f4 ! ^= tweak[0]
	fxor %f10, %f6, %f6
	prefetch [$out + 63], 22
	prefetch [$inp + 32+63], 20
	call _${alg}${bits}_${dir}crypt_2x
	add $inp, 32, $inp
	movxtod %g2, %f8
	movxtod %g3, %f10
	srax %g3, 63, %l7 ! next tweak value
	addcc %g2, %g2, %g2
	and %l7, 0x87, %l7
	addxc %g3, %g3, %g3
	xor %l7, %g2, %g2
	bshuffle %f8, %f8, %f8
	bshuffle %f10, %f10, %f10
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	fxor %f8, %f4, %f4
	fxor %f10, %f6, %f6
	brnz,pn $ooff, 2f
	sub $len, 2, $len
	std %f0, [$out + 0]
	std %f2, [$out + 8]
	std %f4, [$out + 16]
	std %f6, [$out + 24]
	brnz,pt $len, .L${bits}_xts_${dir}loop2x
	add $out, 32, $out
	fsrc2 %f4, %f0
	fsrc2 %f6, %f2
	brnz,pn $rem, .L${bits}_xts_${dir}steal
	nop
	ret
	restore
.align 16
2:	ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
	! and ~3x deterioration
	! in inp==out case
	faligndata %f0, %f0, %f8 ! handle unaligned output
	faligndata %f0, %f2, %f10
	faligndata %f2, %f4, %f12
	faligndata %f4, %f6, %f14
	faligndata %f6, %f6, %f0
	stda %f8, [$out + $omask]0xc0 ! partial store
	std %f10, [$out + 8]
	std %f12, [$out + 16]
	std %f14, [$out + 24]
	add $out, 32, $out
	orn %g0, $omask, $omask
	stda %f0, [$out + $omask]0xc0 ! partial store
	brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
	orn %g0, $omask, $omask
	fsrc2 %f4, %f0
	fsrc2 %f6, %f2
	brnz,pn $rem, .L${bits}_xts_${dir}steal
	nop
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L${bits}_xts_${dir}blk:
	add $out, $len, $blk_init
	and $blk_init, 63, $blk_init ! tail
	sub $len, $blk_init, $len
	add $blk_init, 15, $blk_init ! round up to 16n
	srlx $len, 4, $len
	srl $blk_init, 4, $blk_init
	sub $len, 1, $len
	add $blk_init, 1, $blk_init
.L${bits}_xts_${dir}blk2x:
	ldx [$inp + 0], %o0
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	brz,pt $ileft, 5f
	ldx [$inp + 24], %o3
	ldx [$inp + 32], %o4
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	or %g1, %o0, %o0
	sllx %o1, $ileft, %o1
	srlx %o2, $iright, %g1
	or %g1, %o1, %o1
	sllx %o2, $ileft, %o2
	srlx %o3, $iright, %g1
	or %g1, %o2, %o2
	sllx %o3, $ileft, %o3
	srlx %o4, $iright, %o4
	or %o4, %o3, %o3
5:
	movxtod %g2, %f12
	movxtod %g3, %f14
	bshuffle %f12, %f12, %f12
	bshuffle %f14, %f14, %f14
	srax %g3, 63, %l7 ! next tweak value
	addcc %g2, %g2, %g2
	and %l7, 0x87, %l7
	addxc %g3, %g3, %g3
	xor %l7, %g2, %g2
	movxtod %g2, %f8
	movxtod %g3, %f10
	bshuffle %f8, %f8, %f8
	bshuffle %f10, %f10, %f10
	xor %g4, %o0, %o0 ! ^= rk[0]
	xor %g5, %o1, %o1
	xor %g4, %o2, %o2 ! ^= rk[0]
	xor %g5, %o3, %o3
	movxtod %o0, %f0
	movxtod %o1, %f2
	movxtod %o2, %f4
	movxtod %o3, %f6
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	fxor %f8, %f4, %f4 ! ^= tweak[0]
	fxor %f10, %f6, %f6
	prefetch [$inp + 32+63], 20
	call _${alg}${bits}_${dir}crypt_2x
	add $inp, 32, $inp
	movxtod %g2, %f8
	movxtod %g3, %f10
	srax %g3, 63, %l7 ! next tweak value
	addcc %g2, %g2, %g2
	and %l7, 0x87, %l7
	addxc %g3, %g3, %g3
	xor %l7, %g2, %g2
	bshuffle %f8, %f8, %f8
	bshuffle %f10, %f10, %f10
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	fxor %f8, %f4, %f4
	fxor %f10, %f6, %f6
	stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	add $out, 8, $out
	stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
	bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
	add $out, 8, $out
	add $blk_init, $len, $len
	andcc $len, 1, %g0 ! is number of blocks even?
	membar #StoreLoad|#StoreStore
	bnz,pt %icc, .L${bits}_xts_${dir}loop
	srl $len, 0, $len
	brnz,pn $len, .L${bits}_xts_${dir}loop2x
	nop
	fsrc2 %f4, %f0
	fsrc2 %f6, %f2
	brnz,pn $rem, .L${bits}_xts_${dir}steal
	nop
	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$code.=<<___ if ($dir eq "en");
.align 32
.L${bits}_xts_${dir}steal:
	std %f0, [%fp + $::bias-16] ! copy of output
	std %f2, [%fp + $::bias-8]
	srl $ileft, 3, $ileft
	add %fp, $::bias-16, %l7
	add $inp, $ileft, $inp ! original $inp+$len&-15
	add $out, $ooff, $out ! original $out+$len&-15
	mov 0, $ileft
	nop ! align
.L${bits}_xts_${dir}stealing:
	ldub [$inp + $ileft], %o0
	ldub [%l7 + $ileft], %o1
	dec $rem
	stb %o0, [%l7 + $ileft]
	stb %o1, [$out + $ileft]
	brnz $rem, .L${bits}_xts_${dir}stealing
	inc $ileft
	mov %l7, $inp
	sub $out, 16, $out
	mov 0, $ileft
	sub $out, $ooff, $out
	ba .L${bits}_xts_${dir}loop ! one more time
	mov 1, $len ! $rem is 0
___
$code.=<<___ if ($dir eq "de");
.align 32
.L${bits}_xts_${dir}steal:
	ldx [$inp + 0], %o0
	brz,pt $ileft, 8f
	ldx [$inp + 8], %o1
	ldx [$inp + 16], %o2
	sllx %o0, $ileft, %o0
	srlx %o1, $iright, %g1
	sllx %o1, $ileft, %o1
	or %g1, %o0, %o0
	srlx %o2, $iright, %o2
	or %o2, %o1, %o1
8:
	srax %g3, 63, %l7 ! next tweak value
	addcc %g2, %g2, %o2
	and %l7, 0x87, %l7
	addxc %g3, %g3, %o3
	xor %l7, %o2, %o2
	movxtod %o2, %f12
	movxtod %o3, %f14
	bshuffle %f12, %f12, %f12
	bshuffle %f14, %f14, %f14
	xor %g4, %o0, %o0 ! ^= rk[0]
	xor %g5, %o1, %o1
	movxtod %o0, %f0
	movxtod %o1, %f2
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	call _${alg}${bits}_${dir}crypt_1x
	add $inp, 16, $inp
	fxor %f12, %f0, %f0 ! ^= tweak[0]
	fxor %f14, %f2, %f2
	std %f0, [%fp + $::bias-16]
	std %f2, [%fp + $::bias-8]
	srl $ileft, 3, $ileft
	add %fp, $::bias-16, %l7
	add $inp, $ileft, $inp ! original $inp+$len&-15
	add $out, $ooff, $out ! original $out+$len&-15
	mov 0, $ileft
	add $out, 16, $out
	nop ! align
.L${bits}_xts_${dir}stealing:
	ldub [$inp + $ileft], %o0
	ldub [%l7 + $ileft], %o1
	dec $rem
	stb %o0, [%l7 + $ileft]
	stb %o1, [$out + $ileft]
	brnz $rem, .L${bits}_xts_${dir}stealing
	inc $ileft
	mov %l7, $inp
	sub $out, 16, $out
	mov 0, $ileft
	sub $out, $ooff, $out
	ba .L${bits}_xts_${dir}loop ! one more time
	mov 1, $len ! $rem is 0
___
$code.=<<___;
	ret
	restore
.type ${alg}${bits}_t4_xts_${dir}crypt,#function
.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option of producing a "universal" binary and
# let the programmer detect at run-time whether the current CPU is VIS
# capable.
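#
# Each recognized mnemonic is re-emitted as a raw ".word" with its operand
# registers packed into the instruction encoding (see the sprintf calls
# below); anything unrecognized, or already acceptable to old assemblers,
# is passed through unchanged.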
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my ($ref,$opf);
    my %visopf = ( "faligndata" => 0x048,
                   "bshuffle"   => 0x04c,
                   "fnot2"      => 0x066,
                   "fxor"       => 0x06c,
                   "fsrc2"      => 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my ($ref,$opf);
    my %visopf = ( "addxc"      => 0x011,
                   "addxccc"    => 0x013,
                   "umulxhi"    => 0x016,
                   "alignaddr"  => 0x018,
                   "bmask"      => 0x019,
                   "alignaddrl" => 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub unaes_round {                       # 4-argument instructions
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my ($ref,$opf);
    my %aesopf = ( "aes_eround01"   => 0,
                   "aes_eround23"   => 1,
                   "aes_dround01"   => 2,
                   "aes_dround23"   => 3,
                   "aes_eround01_l" => 4,
                   "aes_eround23_l" => 5,
                   "aes_dround01_l" => 6,
                   "aes_dround23_l" => 7,
                   "aes_kexpand1"   => 8 );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub unaes_kexpand {                     # 3-argument instructions
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my ($ref,$opf);
    my %aesopf = ( "aes_kexpand0" => 0x130,
                   "aes_kexpand2" => 0x131 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub uncamellia_f {                      # 4-argument instructions
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
        $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub uncamellia3 {                       # 3-argument instructions
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my ($ref,$opf);
    my %cmllopf = ( "camellia_fl"  => 0x13c,
                    "camellia_fli" => 0x13d );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}

sub unmovxtox {                         # 2-argument instructions
    my ($mnemonic,$rs,$rd)=@_;
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
    my ($ref,$opf);
    my %movxopf = ( "movdtox"  => 0x110,
                    "movstouw" => 0x111,
                    "movstosw" => 0x113,
                    "movxtod"  => 0x118,
                    "movwtos"  => 0x119 );

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
        foreach ($rs,$rd) {
            return $ref if (!/%([fgoli])([0-9]{1,2})/);
            $_=$bias{$1}+$2;
            if ($2>=32) {
                return $ref if ($2&1);
                # re-encode for upper double register addressing
                $_=($2|$2>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
                       $ref;
    } else {
        return $ref;
    }
}

sub undes {
    my ($mnemonic)=shift;
    my @args=@_;
    my ($ref,$opf);
    my %desopf = ( "des_round"   => 0b1001,
                   "des_ip"      => 0b100110100,
                   "des_iip"     => 0b100110101,
                   "des_kexpand" => 0b100110110 );

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {     # 4-arg
        if ($mnemonic eq "des_round") {
            foreach (@args[0..3]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                           2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
                           $ref;
        } elsif ($mnemonic eq "des_kexpand") {  # 3-arg
            foreach (@args[0..2]) {
                return $ref if (!/(%f)?([0-9]{1,2})/);
                $_=$2;
                if ($2>=32) {
                    return $ref if ($2&1);
                    # re-encode for upper double register addressing
                    $_=($2|$2>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                           2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
                           $ref;
        } else {                                # 2-arg
            foreach (@args[0..1]) {
                return $ref if (!/%f([0-9]{1,2})/);
                $_=$1;
                if ($1>=32) {
                    return $ref if ($1&1);
                    # re-encode for upper double register addressing
                    $_=($1|$1>>5)&31;
                }
            }
            return sprintf ".word\t0x%08x !%s",
                           2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
                           $ref;
        }
    } else {
        return $ref;
    }
}
sub emit_assembler {
    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

        s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &unaes_round($1,$2,$3,$4,$5)
         /geo or
        s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unaes_kexpand($1,$2,$3,$4)
         /geo or
        s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &uncamellia_f($1,$2,$3,$4,$5)
         /geo or
        s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &uncamellia3($1,$2,$3,$4)
         /geo or
        s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
                &undes($1,$2,$3,$4,$5)
         /geo or
        s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
                &unmovxtox($1,$2,$3)
         /geo or
        s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
                &unmovxtox($1,$2,$3)
         /geo or
        s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unvis($1,$2,$3,$4)
         /geo or
        s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unvis3($1,$2,$3,$4)
         /geo;

        print $_,"\n";
    }
}
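
# The caller is expected to build $::code via the alg_*_implement generators
# above and then funnel everything through emit_assembler(), which performs
# the VIS/crypto-opcode translation and prints the final assembly to stdout;
# a typical tail of a generator script would be something like:
#
#	&emit_assembler();
#	close STDOUT;
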
1;