25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.
 
 
 
 
 
 

4049 satır
102 KiB

  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # This module implements support for Intel AES-NI extension. In
  11. # OpenSSL context it's used with Intel engine, but can also be used as
  12. # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
  13. # details].
  14. #
  15. # Performance.
  16. #
  17. # Given aes(enc|dec) instructions' latency asymptotic performance for
  18. # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
  19. # processed with 128-bit key. And given their throughput asymptotic
  20. # performance for parallelizable modes is 1.25 cycles per byte. Being
  21. # asymptotic limit it's not something you commonly achieve in reality,
  22. # but how close does one get? Below are results collected for
  23. # different modes and block sizes. Pairs of numbers are for en-/
  24. # decryption.
  25. #
  26. # 16-byte 64-byte 256-byte 1-KB 8-KB
  27. # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
  28. # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
  29. # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
  30. # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
  31. # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
  32. # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
  33. #
  34. # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
  35. # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
  36. # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
  37. # The results were collected with specially crafted speed.c benchmark
  38. # in order to compare them with results reported in "Intel Advanced
  39. # Encryption Standard (AES) New Instruction Set" White Paper Revision
  40. # 3.0 dated May 2010. All above results are consistently better. This
  41. # module also provides better performance for block sizes smaller than
  42. # 128 bytes in points *not* represented in the above table.
  43. #
  44. # Looking at the results for 8-KB buffer.
  45. #
  46. # CFB and OFB results are far from the limit, because implementation
  47. # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
  48. # single-block aesni_encrypt, which is not the most optimal way to go.
  49. # CBC encrypt result is unexpectedly high and there is no documented
  50. # explanation for it. Seemingly there is a small penalty for feeding
  51. # the result back to AES unit the way it's done in CBC mode. There is
  52. # nothing one can do and the result appears optimal. CCM result is
  53. # identical to CBC, because CBC-MAC is essentially CBC encrypt without
  54. # saving output. CCM CTR "stays invisible," because it's neatly
  55. # interleaved with CBC-MAC. This provides ~30% improvement over
  56. # "straightforward" CCM implementation with CTR and CBC-MAC performed
  57. # disjointly. Parallelizable modes practically achieve the theoretical
  58. # limit.
  59. #
  60. # Looking at how results vary with buffer size.
  61. #
  62. # Curves are practically saturated at 1-KB buffer size. In most cases
  63. # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
  64. # CTR curve doesn't follow this pattern and is "slowest" changing one
  65. # with "256-byte" result being 87% of "8-KB." This is because overhead
  66. # in CTR mode is most computationally intensive. Small-block CCM
  67. # decrypt is slower than encrypt, because first CTR and last CBC-MAC
  68. # iterations can't be interleaved.
  69. #
  70. # Results for 192- and 256-bit keys.
  71. #
  72. # EVP-free results were observed to scale perfectly with number of
  73. # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
  74. # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
  75. # are a tad smaller, because the above mentioned penalty biases all
  76. # results by same constant value. In similar way function call
  77. # overhead affects small-block performance, as well as OFB and CFB
  78. # results. Differences are not large, most common coefficients are
  79. # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
  80. # observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
  81. # January 2011
  82. #
  83. # While Westmere processor features 6 cycles latency for aes[enc|dec]
  84. # instructions, which can be scheduled every second cycle, Sandy
  85. # Bridge spends 8 cycles per instruction, but it can schedule them
  86. # every cycle. This means that code targeting Westmere would perform
  87. # suboptimally on Sandy Bridge. Therefore this update.
  88. #
  89. # In addition, non-parallelizable CBC encrypt (as well as CCM) is
  90. # optimized. Relative improvement might appear modest, 8% on Westmere,
  91. # but in absolute terms it's 3.77 cycles per byte encrypted with
  92. # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
  93. # should be compared to asymptotic limits of 3.75 for Westmere and
  94. # 5.00 for Sandy Bridge. Actually, the fact that they get this close
  95. # to asymptotic limits is quite amazing. Indeed, the limit is
  96. # calculated as latency times number of rounds, 10 for 128-bit key,
  97. # and divided by 16, the number of bytes in block, or in other words
  98. # it accounts *solely* for aesenc instructions. But there are extra
  99. # instructions, and numbers so close to the asymptotic limits mean
  100. # that it's as if it takes as little as *one* additional cycle to
  101. # execute all of them. How is it possible? It is possible thanks to
  102. # out-of-order execution logic, which manages to overlap post-
  103. # processing of previous block, things like saving the output, with
  104. # actual encryption of current block, as well as pre-processing of
  105. # current block, things like fetching input and xor-ing it with
  106. # 0-round element of the key schedule, with actual encryption of
  107. # previous block. Keep this in mind...
  108. #
  109. # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
  110. # performance is achieved by interleaving instructions working on
  111. # independent blocks. In which case asymptotic limit for such modes
  112. # can be obtained by dividing above mentioned numbers by AES
  113. # instructions' interleave factor. Westmere can execute at most 3
  114. # instructions at a time, meaning that optimal interleave factor is 3,
  115. # and that's where the "magic" number of 1.25 comes from. "Optimal
  116. # interleave factor" means that increase of interleave factor does
  117. # not improve performance. The formula has proven to reflect reality
  118. # pretty well on Westmere... Sandy Bridge on the other hand can
  119. # execute up to 8 AES instructions at a time, so how does varying
  120. # interleave factor affect the performance? Here is table for ECB
  121. # (numbers are cycles per byte processed with 128-bit key):
  122. #
  123. # instruction interleave factor 3x 6x 8x
  124. # theoretical asymptotic limit 1.67 0.83 0.625
  125. # measured performance for 8KB block 1.05 0.86 0.84
  126. #
  127. # "as if" interleave factor 4.7x 5.8x 6.0x
  128. #
  129. # Further data for other parallelizable modes:
  130. #
  131. # CBC decrypt 1.16 0.93 0.74
  132. # CTR 1.14 0.91 0.74
  133. #
  134. # Well, given 3x column it's probably inappropriate to call the limit
  135. # asymptotic, if it can be surpassed, isn't it? What happens there?
  136. # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
  137. # magic is responsible for this. Processor overlaps not only the
  138. # additional instructions with AES ones, but even AES instructions
  139. # processing adjacent triplets of independent blocks. In the 6x case
  140. # additional instructions still claim disproportionally small amount
  141. # of additional cycles, but in 8x case number of instructions must be
  142. # a tad too high for out-of-order logic to cope with, and AES unit
  143. # remains underutilized... As you can see 8x interleave is hardly
  144. # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
  145. # utilizes 6x interleave because of limited register bank capacity.
  146. #
  147. # Higher interleave factors do have negative impact on Westmere
  148. # performance. While for ECB mode it's negligible ~1.5%, other
  149. # parallelizables perform ~5% worse, which is outweighed by ~25%
  150. # improvement on Sandy Bridge. To balance regression on Westmere
  151. # CTR mode was implemented with 6x aesenc interleave factor.
  152. # April 2011
  153. #
  154. # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
  155. # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
  156. # in CTR mode AES instruction interleave factor was chosen to be 6x.
  157. ######################################################################
  158. # Current large-block performance in cycles per byte processed with
  159. # 128-bit key (less is better).
  160. #
  161. # CBC en-/decrypt CTR XTS ECB
  162. # Westmere 3.77/1.25 1.25 1.25 1.26
  163. # * Bridge 5.07/0.74 0.75 0.90 0.85
  164. # Haswell 4.44/0.63 0.63 0.73 0.63
  165. # Silvermont 5.75/3.54 3.56 4.12 3.87(*)
  166. # Bulldozer 5.77/0.70 0.72 0.90 0.70
  167. #
  168. # (*) Atom Silvermont ECB result is suboptimal because of penalties
  169. # incurred by operations on %xmm8-15. As ECB is not considered
  170. # critical, nothing was done to mitigate the problem.
  171. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
  172. # generates drop-in replacement for
  173. # crypto/aes/asm/aes-x86_64.pl:-)
  # Command-line handling: first argument is the perlasm "flavour"
  # (elf/macosx/mingw64/nasm/...), second the output file; a single
  # argument containing a dot is treated as the output path instead.
  174. $flavour = shift;
  175. $output = shift;
  176. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  177. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # Locate the x86_64-xlate.pl translator next to this script or in
  # ../../perlasm, then pipe everything printed to STDOUT through it.
  178. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  179. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  180. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  181. die "can't locate x86_64-xlate.pl";
  182. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  183. *STDOUT=*OUT;
  # NOTE(review): both branches of this ternary are "movups" — looks
  # deliberate (kept for symmetry with the $PREFIX switch), but worth
  # confirming against upstream history.
  184. $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
  # First four integer argument registers per calling convention.
  185. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  186. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  187. $code=".text\n";
  188. $code.=".extern OPENSSL_ia32cap_P\n";
  # Symbolic register names used by all generated assembly below.
  189. $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
  190. # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
  191. $inp="%rdi";
  192. $out="%rsi";
  193. $len="%rdx";
  194. $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
  195. $ivp="%r8"; # cbc, ctr, ...
  196. $rnds_="%r10d"; # backup copy for $rounds
  197. $key_="%r11"; # backup copy for $key
  198. # %xmm register layout
  # Note the aliasing: $in2/$in1/$in0/$iv reuse %xmm6-9, i.e. they
  # overlap $inout4..7 and must not be live at the same time.
  199. $rndkey0="%xmm0"; $rndkey1="%xmm1";
  200. $inout0="%xmm2"; $inout1="%xmm3";
  201. $inout2="%xmm4"; $inout3="%xmm5";
  202. $inout4="%xmm6"; $inout5="%xmm7";
  203. $inout6="%xmm8"; $inout7="%xmm9";
  204. $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
  205. $in0="%xmm8"; $iv="%xmm9";
  206. # Inline version of internal aesni_[en|de]crypt1.
  207. #
  208. # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
  209. # cycles which take care of loop variables...
  210. { my $sn;
  211. sub aesni_generate1 {
  # Emit inline code to encrypt/decrypt ONE block. $p is "enc" or
  # "dec"; $key/$rounds are clobbered; $inout defaults to $inout0.
  # When $ivec is supplied it is whitened with round key 0 and XORed
  # into $inout first (CBC-style chaining). $sn is a per-expansion
  # counter that keeps the loop label unique across call sites.
  212. my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  213. ++$sn;
  214. $code.=<<___;
  215. $movkey ($key),$rndkey0
  216. $movkey 16($key),$rndkey1
___
  # Chained variant: fold the IV and round-0 key into the block.
  218. $code.=<<___ if (defined($ivec));
  219. xorps $rndkey0,$ivec
  220. lea 32($key),$key
  221. xorps $ivec,$inout
___
  # Plain variant: just whiten the block with round-0 key.
  223. $code.=<<___ if (!defined($ivec));
  224. lea 32($key),$key
  225. xorps $rndkey0,$inout
___
  # Folded round loop: one aes[enc|dec] per iteration, next round key
  # fetched in the same 16-byte loop body; final round outside.
  227. $code.=<<___;
  228. .Loop_${p}1_$sn:
  229. aes${p} $rndkey1,$inout
  230. dec $rounds
  231. $movkey ($key),$rndkey1
  232. lea 16($key),$key
  233. jnz .Loop_${p}1_$sn # loop body is 16 bytes
  234. aes${p}last $rndkey1,$inout
___
  236. }}
  237. # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
  238. #
  # Public single-block entry points:
  # void $PREFIX_[en|de]crypt(const void *inp, void *out, const AES_KEY *key);
  # Both load one block, run the inline aesni_generate1 round loop,
  # store the result, and scrub the xmm registers that held key/data.
  239. { my ($inp,$out,$key) = @_4args;
  240. $code.=<<___;
  241. .globl ${PREFIX}_encrypt
  242. .type ${PREFIX}_encrypt,\@abi-omnipotent
  243. .align 16
  244. ${PREFIX}_encrypt:
  245. movups ($inp),$inout0 # load input
  246. mov 240($key),$rounds # key->rounds
___
  # Inline the single-block encrypt round loop.
  248. &aesni_generate1("enc",$key,$rounds);
  249. $code.=<<___;
  250. pxor $rndkey0,$rndkey0 # clear register bank
  251. pxor $rndkey1,$rndkey1
  252. movups $inout0,($out) # output
  253. pxor $inout0,$inout0
  254. ret
  255. .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
  256. .globl ${PREFIX}_decrypt
  257. .type ${PREFIX}_decrypt,\@abi-omnipotent
  258. .align 16
  259. ${PREFIX}_decrypt:
  260. movups ($inp),$inout0 # load input
  261. mov 240($key),$rounds # key->rounds
___
  # Inline the single-block decrypt round loop.
  263. &aesni_generate1("dec",$key,$rounds);
  264. $code.=<<___;
  265. pxor $rndkey0,$rndkey0 # clear register bank
  266. pxor $rndkey1,$rndkey1
  267. movups $inout0,($out) # output
  268. pxor $inout0,$inout0
  269. ret
  270. .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
  272. }
  273. # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
  274. # factor. Why 3x subroutine were originally used in loops? Even though
  275. # aes[enc|dec] latency was originally 6, it could be scheduled only
  276. # every *2nd* cycle. Thus 3x interleave was the one providing optimal
  277. # utilization, i.e. when subroutine's throughput is virtually same as
  278. # of non-interleaved subroutine [for number of input blocks up to 3].
  279. # This is why it originally made no sense to implement 2x subroutine.
  280. # But times change and it became appropriate to spend extra 192 bytes
  281. # on 2x subroutine on Atom Silvermont account. For processors that
  282. # can schedule aes[enc|dec] every cycle optimal interleave factor
  283. # equals to corresponding instructions latency. 8x is optimal for
  284. # * Bridge and "super-optimal" for other Intel CPUs...
  285. sub aesni_generate2 {
  286. my $dir=shift;
  287. # As already mentioned it takes in $key and $rounds, which are *not*
  288. # preserved. $inout[0-1] is cipher/clear text...
  # Emits _aesni_${dir}rypt2 (dir = "enc"/"dec"): 2x-interleaved block
  # processing. `shl \$4,$rounds` turns the round count into a
  # key-schedule byte offset (rounds*16); $key is advanced past the
  # schedule and %rax runs from -(rounds*16)+16 toward zero so one
  # loop body handles 10/12/14 rounds, with jnz keyed off `add`'s ZF.
  289. $code.=<<___;
  290. .type _aesni_${dir}rypt2,\@abi-omnipotent
  291. .align 16
  292. _aesni_${dir}rypt2:
  293. $movkey ($key),$rndkey0
  294. shl \$4,$rounds
  295. $movkey 16($key),$rndkey1
  296. xorps $rndkey0,$inout0
  297. xorps $rndkey0,$inout1
  298. $movkey 32($key),$rndkey0
  299. lea 32($key,$rounds),$key
  300. neg %rax # $rounds
  301. add \$16,%rax
  302. .L${dir}_loop2:
  303. aes${dir} $rndkey1,$inout0
  304. aes${dir} $rndkey1,$inout1
  305. $movkey ($key,%rax),$rndkey1
  306. add \$32,%rax
  307. aes${dir} $rndkey0,$inout0
  308. aes${dir} $rndkey0,$inout1
  309. $movkey -16($key,%rax),$rndkey0
  310. jnz .L${dir}_loop2
  311. aes${dir} $rndkey1,$inout0
  312. aes${dir} $rndkey1,$inout1
  313. aes${dir}last $rndkey0,$inout0
  314. aes${dir}last $rndkey0,$inout1
  315. ret
  316. .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
  318. }
  319. sub aesni_generate3 {
  320. my $dir=shift;
  321. # As already mentioned it takes in $key and $rounds, which are *not*
  322. # preserved. $inout[0-2] is cipher/clear text...
  # Emits _aesni_${dir}rypt3: identical structure to the 2x variant
  # (see aesni_generate2 for the %rax/round-key indexing scheme) but
  # interleaving three independent blocks per round.
  323. $code.=<<___;
  324. .type _aesni_${dir}rypt3,\@abi-omnipotent
  325. .align 16
  326. _aesni_${dir}rypt3:
  327. $movkey ($key),$rndkey0
  328. shl \$4,$rounds
  329. $movkey 16($key),$rndkey1
  330. xorps $rndkey0,$inout0
  331. xorps $rndkey0,$inout1
  332. xorps $rndkey0,$inout2
  333. $movkey 32($key),$rndkey0
  334. lea 32($key,$rounds),$key
  335. neg %rax # $rounds
  336. add \$16,%rax
  337. .L${dir}_loop3:
  338. aes${dir} $rndkey1,$inout0
  339. aes${dir} $rndkey1,$inout1
  340. aes${dir} $rndkey1,$inout2
  341. $movkey ($key,%rax),$rndkey1
  342. add \$32,%rax
  343. aes${dir} $rndkey0,$inout0
  344. aes${dir} $rndkey0,$inout1
  345. aes${dir} $rndkey0,$inout2
  346. $movkey -16($key,%rax),$rndkey0
  347. jnz .L${dir}_loop3
  348. aes${dir} $rndkey1,$inout0
  349. aes${dir} $rndkey1,$inout1
  350. aes${dir} $rndkey1,$inout2
  351. aes${dir}last $rndkey0,$inout0
  352. aes${dir}last $rndkey0,$inout1
  353. aes${dir}last $rndkey0,$inout2
  354. ret
  355. .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
  357. }
  358. # 4x interleave is implemented to improve small block performance,
  359. # most notably [and naturally] 4 block by ~30%. One can argue that one
  360. # should have implemented 5x as well, but improvement would be <20%,
  361. # so it's not worth it...
  362. sub aesni_generate4 {
  363. my $dir=shift;
  364. # As already mentioned it takes in $key and $rounds, which are *not*
  365. # preserved. $inout[0-3] is cipher/clear text...
  # Emits _aesni_${dir}rypt4: 4x interleave, same indexing scheme as
  # the 2x/3x variants. The literal ".byte 0x0f,0x1f,0x00" is a
  # 3-byte NOP (nopl (%rax)) — presumably padding for loop alignment;
  # confirm against upstream before touching.
  366. $code.=<<___;
  367. .type _aesni_${dir}rypt4,\@abi-omnipotent
  368. .align 16
  369. _aesni_${dir}rypt4:
  370. $movkey ($key),$rndkey0
  371. shl \$4,$rounds
  372. $movkey 16($key),$rndkey1
  373. xorps $rndkey0,$inout0
  374. xorps $rndkey0,$inout1
  375. xorps $rndkey0,$inout2
  376. xorps $rndkey0,$inout3
  377. $movkey 32($key),$rndkey0
  378. lea 32($key,$rounds),$key
  379. neg %rax # $rounds
  380. .byte 0x0f,0x1f,0x00
  381. add \$16,%rax
  382. .L${dir}_loop4:
  383. aes${dir} $rndkey1,$inout0
  384. aes${dir} $rndkey1,$inout1
  385. aes${dir} $rndkey1,$inout2
  386. aes${dir} $rndkey1,$inout3
  387. $movkey ($key,%rax),$rndkey1
  388. add \$32,%rax
  389. aes${dir} $rndkey0,$inout0
  390. aes${dir} $rndkey0,$inout1
  391. aes${dir} $rndkey0,$inout2
  392. aes${dir} $rndkey0,$inout3
  393. $movkey -16($key,%rax),$rndkey0
  394. jnz .L${dir}_loop4
  395. aes${dir} $rndkey1,$inout0
  396. aes${dir} $rndkey1,$inout1
  397. aes${dir} $rndkey1,$inout2
  398. aes${dir} $rndkey1,$inout3
  399. aes${dir}last $rndkey0,$inout0
  400. aes${dir}last $rndkey0,$inout1
  401. aes${dir}last $rndkey0,$inout2
  402. aes${dir}last $rndkey0,$inout3
  403. ret
  404. .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
  406. }
  407. sub aesni_generate6 {
  408. my $dir=shift;
  409. # As already mentioned it takes in $key and $rounds, which are *not*
  410. # preserved. $inout[0-5] is cipher/clear text...
  # Emits _aesni_${dir}rypt6: 6x interleave. Unlike the smaller
  # variants the prologue overlaps work: it starts round 1 on blocks
  # 0-2 while still whitening blocks 3-5 with round key 0, then jumps
  # into the middle of the round loop (.L${dir}_loop6_enter) so those
  # first aes${dir} instructions are not repeated.
  411. $code.=<<___;
  412. .type _aesni_${dir}rypt6,\@abi-omnipotent
  413. .align 16
  414. _aesni_${dir}rypt6:
  415. $movkey ($key),$rndkey0
  416. shl \$4,$rounds
  417. $movkey 16($key),$rndkey1
  418. xorps $rndkey0,$inout0
  419. pxor $rndkey0,$inout1
  420. pxor $rndkey0,$inout2
  421. aes${dir} $rndkey1,$inout0
  422. lea 32($key,$rounds),$key
  423. neg %rax # $rounds
  424. aes${dir} $rndkey1,$inout1
  425. pxor $rndkey0,$inout3
  426. pxor $rndkey0,$inout4
  427. aes${dir} $rndkey1,$inout2
  428. pxor $rndkey0,$inout5
  429. $movkey ($key,%rax),$rndkey0
  430. add \$16,%rax
  431. jmp .L${dir}_loop6_enter
  432. .align 16
  433. .L${dir}_loop6:
  434. aes${dir} $rndkey1,$inout0
  435. aes${dir} $rndkey1,$inout1
  436. aes${dir} $rndkey1,$inout2
  437. .L${dir}_loop6_enter:
  438. aes${dir} $rndkey1,$inout3
  439. aes${dir} $rndkey1,$inout4
  440. aes${dir} $rndkey1,$inout5
  441. $movkey ($key,%rax),$rndkey1
  442. add \$32,%rax
  443. aes${dir} $rndkey0,$inout0
  444. aes${dir} $rndkey0,$inout1
  445. aes${dir} $rndkey0,$inout2
  446. aes${dir} $rndkey0,$inout3
  447. aes${dir} $rndkey0,$inout4
  448. aes${dir} $rndkey0,$inout5
  449. $movkey -16($key,%rax),$rndkey0
  450. jnz .L${dir}_loop6
  451. aes${dir} $rndkey1,$inout0
  452. aes${dir} $rndkey1,$inout1
  453. aes${dir} $rndkey1,$inout2
  454. aes${dir} $rndkey1,$inout3
  455. aes${dir} $rndkey1,$inout4
  456. aes${dir} $rndkey1,$inout5
  457. aes${dir}last $rndkey0,$inout0
  458. aes${dir}last $rndkey0,$inout1
  459. aes${dir}last $rndkey0,$inout2
  460. aes${dir}last $rndkey0,$inout3
  461. aes${dir}last $rndkey0,$inout4
  462. aes${dir}last $rndkey0,$inout5
  463. ret
  464. .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
  466. }
  467. sub aesni_generate8 {
  468. my $dir=shift;
  469. # As already mentioned it takes in $key and $rounds, which are *not*
  470. # preserved. $inout[0-7] is cipher/clear text...
  # Emits _aesni_${dir}rypt8: 8x interleave, same overlapped prologue
  # idea as the 6x variant — round 1 starts on blocks 0-1 while
  # blocks 2-7 are still being whitened, entering mid-loop at
  # .L${dir}_loop8_inner. NOTE(review): .L${dir}_loop8_enter is
  # defined but not jumped to within this subroutine — presumably an
  # entry point for code elsewhere in the file; verify before removal.
  471. $code.=<<___;
  472. .type _aesni_${dir}rypt8,\@abi-omnipotent
  473. .align 16
  474. _aesni_${dir}rypt8:
  475. $movkey ($key),$rndkey0
  476. shl \$4,$rounds
  477. $movkey 16($key),$rndkey1
  478. xorps $rndkey0,$inout0
  479. xorps $rndkey0,$inout1
  480. pxor $rndkey0,$inout2
  481. pxor $rndkey0,$inout3
  482. pxor $rndkey0,$inout4
  483. lea 32($key,$rounds),$key
  484. neg %rax # $rounds
  485. aes${dir} $rndkey1,$inout0
  486. pxor $rndkey0,$inout5
  487. pxor $rndkey0,$inout6
  488. aes${dir} $rndkey1,$inout1
  489. pxor $rndkey0,$inout7
  490. $movkey ($key,%rax),$rndkey0
  491. add \$16,%rax
  492. jmp .L${dir}_loop8_inner
  493. .align 16
  494. .L${dir}_loop8:
  495. aes${dir} $rndkey1,$inout0
  496. aes${dir} $rndkey1,$inout1
  497. .L${dir}_loop8_inner:
  498. aes${dir} $rndkey1,$inout2
  499. aes${dir} $rndkey1,$inout3
  500. aes${dir} $rndkey1,$inout4
  501. aes${dir} $rndkey1,$inout5
  502. aes${dir} $rndkey1,$inout6
  503. aes${dir} $rndkey1,$inout7
  504. .L${dir}_loop8_enter:
  505. $movkey ($key,%rax),$rndkey1
  506. add \$32,%rax
  507. aes${dir} $rndkey0,$inout0
  508. aes${dir} $rndkey0,$inout1
  509. aes${dir} $rndkey0,$inout2
  510. aes${dir} $rndkey0,$inout3
  511. aes${dir} $rndkey0,$inout4
  512. aes${dir} $rndkey0,$inout5
  513. aes${dir} $rndkey0,$inout6
  514. aes${dir} $rndkey0,$inout7
  515. $movkey -16($key,%rax),$rndkey0
  516. jnz .L${dir}_loop8
  517. aes${dir} $rndkey1,$inout0
  518. aes${dir} $rndkey1,$inout1
  519. aes${dir} $rndkey1,$inout2
  520. aes${dir} $rndkey1,$inout3
  521. aes${dir} $rndkey1,$inout4
  522. aes${dir} $rndkey1,$inout5
  523. aes${dir} $rndkey1,$inout6
  524. aes${dir} $rndkey1,$inout7
  525. aes${dir}last $rndkey0,$inout0
  526. aes${dir}last $rndkey0,$inout1
  527. aes${dir}last $rndkey0,$inout2
  528. aes${dir}last $rndkey0,$inout3
  529. aes${dir}last $rndkey0,$inout4
  530. aes${dir}last $rndkey0,$inout5
  531. aes${dir}last $rndkey0,$inout6
  532. aes${dir}last $rndkey0,$inout7
  533. ret
  534. .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
  536. }
  # Instantiate the interleaved helpers. Decrypt variants are always
  # emitted; encrypt variants only for the standalone "aesni" build
  # (the AES drop-in replacement apparently does not need them —
  # confirm against the rest of the file).
  537. &aesni_generate2("enc") if ($PREFIX eq "aesni");
  538. &aesni_generate2("dec");
  539. &aesni_generate3("enc") if ($PREFIX eq "aesni");
  540. &aesni_generate3("dec");
  541. &aesni_generate4("enc") if ($PREFIX eq "aesni");
  542. &aesni_generate4("dec");
  543. &aesni_generate6("enc") if ($PREFIX eq "aesni");
  544. &aesni_generate6("dec");
  545. &aesni_generate8("enc") if ($PREFIX eq "aesni");
  546. &aesni_generate8("dec");
  547. if ($PREFIX eq "aesni") {
  548. ########################################################################
  549. # void aesni_ecb_encrypt (const void *in, void *out,
  550. # size_t length, const AES_KEY *key,
  551. # int enc);
  552. $code.=<<___;
  553. .globl aesni_ecb_encrypt
  554. .type aesni_ecb_encrypt,\@function,5
  555. .align 16
  556. aesni_ecb_encrypt:
  557. ___
  558. $code.=<<___ if ($win64);
  559. lea -0x58(%rsp),%rsp
  560. movaps %xmm6,(%rsp) # offload $inout4..7
  561. movaps %xmm7,0x10(%rsp)
  562. movaps %xmm8,0x20(%rsp)
  563. movaps %xmm9,0x30(%rsp)
  564. .Lecb_enc_body:
  565. ___
  566. $code.=<<___;
  567. and \$-16,$len # if ($len<16)
  568. jz .Lecb_ret # return
  569. mov 240($key),$rounds # key->rounds
  570. $movkey ($key),$rndkey0
  571. mov $key,$key_ # backup $key
  572. mov $rounds,$rnds_ # backup $rounds
  573. test %r8d,%r8d # 5th argument
  574. jz .Lecb_decrypt
  575. #--------------------------- ECB ENCRYPT ------------------------------#
  576. cmp \$0x80,$len # if ($len<8*16)
  577. jb .Lecb_enc_tail # short input
  578. movdqu ($inp),$inout0 # load 8 input blocks
  579. movdqu 0x10($inp),$inout1
  580. movdqu 0x20($inp),$inout2
  581. movdqu 0x30($inp),$inout3
  582. movdqu 0x40($inp),$inout4
  583. movdqu 0x50($inp),$inout5
  584. movdqu 0x60($inp),$inout6
  585. movdqu 0x70($inp),$inout7
  586. lea 0x80($inp),$inp # $inp+=8*16
  587. sub \$0x80,$len # $len-=8*16 (can be zero)
  588. jmp .Lecb_enc_loop8_enter
  589. .align 16
  590. .Lecb_enc_loop8:
  591. movups $inout0,($out) # store 8 output blocks
  592. mov $key_,$key # restore $key
  593. movdqu ($inp),$inout0 # load 8 input blocks
  594. mov $rnds_,$rounds # restore $rounds
  595. movups $inout1,0x10($out)
  596. movdqu 0x10($inp),$inout1
  597. movups $inout2,0x20($out)
  598. movdqu 0x20($inp),$inout2
  599. movups $inout3,0x30($out)
  600. movdqu 0x30($inp),$inout3
  601. movups $inout4,0x40($out)
  602. movdqu 0x40($inp),$inout4
  603. movups $inout5,0x50($out)
  604. movdqu 0x50($inp),$inout5
  605. movups $inout6,0x60($out)
  606. movdqu 0x60($inp),$inout6
  607. movups $inout7,0x70($out)
  608. lea 0x80($out),$out # $out+=8*16
  609. movdqu 0x70($inp),$inout7
  610. lea 0x80($inp),$inp # $inp+=8*16
  611. .Lecb_enc_loop8_enter:
  612. call _aesni_encrypt8
  613. sub \$0x80,$len
  614. jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
  615. movups $inout0,($out) # store 8 output blocks
  616. mov $key_,$key # restore $key
  617. movups $inout1,0x10($out)
  618. mov $rnds_,$rounds # restore $rounds
  619. movups $inout2,0x20($out)
  620. movups $inout3,0x30($out)
  621. movups $inout4,0x40($out)
  622. movups $inout5,0x50($out)
  623. movups $inout6,0x60($out)
  624. movups $inout7,0x70($out)
  625. lea 0x80($out),$out # $out+=8*16
  626. add \$0x80,$len # restore real remaining $len
  627. jz .Lecb_ret # done if ($len==0)
  628. .Lecb_enc_tail: # $len is less than 8*16
  629. movups ($inp),$inout0
  630. cmp \$0x20,$len
  631. jb .Lecb_enc_one
  632. movups 0x10($inp),$inout1
  633. je .Lecb_enc_two
  634. movups 0x20($inp),$inout2
  635. cmp \$0x40,$len
  636. jb .Lecb_enc_three
  637. movups 0x30($inp),$inout3
  638. je .Lecb_enc_four
  639. movups 0x40($inp),$inout4
  640. cmp \$0x60,$len
  641. jb .Lecb_enc_five
  642. movups 0x50($inp),$inout5
  643. je .Lecb_enc_six
  644. movdqu 0x60($inp),$inout6
  645. xorps $inout7,$inout7
  646. call _aesni_encrypt8
  647. movups $inout0,($out) # store 7 output blocks
  648. movups $inout1,0x10($out)
  649. movups $inout2,0x20($out)
  650. movups $inout3,0x30($out)
  651. movups $inout4,0x40($out)
  652. movups $inout5,0x50($out)
  653. movups $inout6,0x60($out)
  654. jmp .Lecb_ret
  655. .align 16
  656. .Lecb_enc_one:
  657. ___
  658. &aesni_generate1("enc",$key,$rounds);
  659. $code.=<<___;
  660. movups $inout0,($out) # store one output block
  661. jmp .Lecb_ret
  662. .align 16
  663. .Lecb_enc_two:
  664. call _aesni_encrypt2
  665. movups $inout0,($out) # store 2 output blocks
  666. movups $inout1,0x10($out)
  667. jmp .Lecb_ret
  668. .align 16
  669. .Lecb_enc_three:
  670. call _aesni_encrypt3
  671. movups $inout0,($out) # store 3 output blocks
  672. movups $inout1,0x10($out)
  673. movups $inout2,0x20($out)
  674. jmp .Lecb_ret
  675. .align 16
  676. .Lecb_enc_four:
  677. call _aesni_encrypt4
  678. movups $inout0,($out) # store 4 output blocks
  679. movups $inout1,0x10($out)
  680. movups $inout2,0x20($out)
  681. movups $inout3,0x30($out)
  682. jmp .Lecb_ret
  683. .align 16
  684. .Lecb_enc_five:
  685. xorps $inout5,$inout5
  686. call _aesni_encrypt6
  687. movups $inout0,($out) # store 5 output blocks
  688. movups $inout1,0x10($out)
  689. movups $inout2,0x20($out)
  690. movups $inout3,0x30($out)
  691. movups $inout4,0x40($out)
  692. jmp .Lecb_ret
  693. .align 16
  694. .Lecb_enc_six:
  695. call _aesni_encrypt6
  696. movups $inout0,($out) # store 6 output blocks
  697. movups $inout1,0x10($out)
  698. movups $inout2,0x20($out)
  699. movups $inout3,0x30($out)
  700. movups $inout4,0x40($out)
  701. movups $inout5,0x50($out)
  702. jmp .Lecb_ret
  703. #--------------------------- ECB DECRYPT ------------------------------#
  704. .align 16
  705. .Lecb_decrypt:
  706. cmp \$0x80,$len # if ($len<8*16)
  707. jb .Lecb_dec_tail # short input
  708. movdqu ($inp),$inout0 # load 8 input blocks
  709. movdqu 0x10($inp),$inout1
  710. movdqu 0x20($inp),$inout2
  711. movdqu 0x30($inp),$inout3
  712. movdqu 0x40($inp),$inout4
  713. movdqu 0x50($inp),$inout5
  714. movdqu 0x60($inp),$inout6
  715. movdqu 0x70($inp),$inout7
  716. lea 0x80($inp),$inp # $inp+=8*16
  717. sub \$0x80,$len # $len-=8*16 (can be zero)
  718. jmp .Lecb_dec_loop8_enter
  719. .align 16
  720. .Lecb_dec_loop8:
  721. movups $inout0,($out) # store 8 output blocks
  722. mov $key_,$key # restore $key
  723. movdqu ($inp),$inout0 # load 8 input blocks
  724. mov $rnds_,$rounds # restore $rounds
  725. movups $inout1,0x10($out)
  726. movdqu 0x10($inp),$inout1
  727. movups $inout2,0x20($out)
  728. movdqu 0x20($inp),$inout2
  729. movups $inout3,0x30($out)
  730. movdqu 0x30($inp),$inout3
  731. movups $inout4,0x40($out)
  732. movdqu 0x40($inp),$inout4
  733. movups $inout5,0x50($out)
  734. movdqu 0x50($inp),$inout5
  735. movups $inout6,0x60($out)
  736. movdqu 0x60($inp),$inout6
  737. movups $inout7,0x70($out)
  738. lea 0x80($out),$out # $out+=8*16
  739. movdqu 0x70($inp),$inout7
  740. lea 0x80($inp),$inp # $inp+=8*16
  741. .Lecb_dec_loop8_enter:
  742. call _aesni_decrypt8
  743. $movkey ($key_),$rndkey0
  744. sub \$0x80,$len
  745. jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
  746. movups $inout0,($out) # store 8 output blocks
  747. pxor $inout0,$inout0 # clear register bank
  748. mov $key_,$key # restore $key
  749. movups $inout1,0x10($out)
  750. pxor $inout1,$inout1
  751. mov $rnds_,$rounds # restore $rounds
  752. movups $inout2,0x20($out)
  753. pxor $inout2,$inout2
  754. movups $inout3,0x30($out)
  755. pxor $inout3,$inout3
  756. movups $inout4,0x40($out)
  757. pxor $inout4,$inout4
  758. movups $inout5,0x50($out)
  759. pxor $inout5,$inout5
  760. movups $inout6,0x60($out)
  761. pxor $inout6,$inout6
  762. movups $inout7,0x70($out)
  763. pxor $inout7,$inout7
  764. lea 0x80($out),$out # $out+=8*16
  765. add \$0x80,$len # restore real remaining $len
  766. jz .Lecb_ret # done if ($len==0)
  767. .Lecb_dec_tail:
  768. movups ($inp),$inout0
  769. cmp \$0x20,$len
  770. jb .Lecb_dec_one
  771. movups 0x10($inp),$inout1
  772. je .Lecb_dec_two
  773. movups 0x20($inp),$inout2
  774. cmp \$0x40,$len
  775. jb .Lecb_dec_three
  776. movups 0x30($inp),$inout3
  777. je .Lecb_dec_four
  778. movups 0x40($inp),$inout4
  779. cmp \$0x60,$len
  780. jb .Lecb_dec_five
  781. movups 0x50($inp),$inout5
  782. je .Lecb_dec_six
  783. movups 0x60($inp),$inout6
  784. $movkey ($key),$rndkey0
  785. xorps $inout7,$inout7
  786. call _aesni_decrypt8
  787. movups $inout0,($out) # store 7 output blocks
  788. pxor $inout0,$inout0 # clear register bank
  789. movups $inout1,0x10($out)
  790. pxor $inout1,$inout1
  791. movups $inout2,0x20($out)
  792. pxor $inout2,$inout2
  793. movups $inout3,0x30($out)
  794. pxor $inout3,$inout3
  795. movups $inout4,0x40($out)
  796. pxor $inout4,$inout4
  797. movups $inout5,0x50($out)
  798. pxor $inout5,$inout5
  799. movups $inout6,0x60($out)
  800. pxor $inout6,$inout6
  801. pxor $inout7,$inout7
  802. jmp .Lecb_ret
  803. .align 16
  804. .Lecb_dec_one:
  805. ___
  806. &aesni_generate1("dec",$key,$rounds);
  807. $code.=<<___;
  808. movups $inout0,($out) # store one output block
  809. pxor $inout0,$inout0 # clear register bank
  810. jmp .Lecb_ret
  811. .align 16
  812. .Lecb_dec_two:
  813. call _aesni_decrypt2
  814. movups $inout0,($out) # store 2 output blocks
  815. pxor $inout0,$inout0 # clear register bank
  816. movups $inout1,0x10($out)
  817. pxor $inout1,$inout1
  818. jmp .Lecb_ret
  819. .align 16
  820. .Lecb_dec_three:
  821. call _aesni_decrypt3
  822. movups $inout0,($out) # store 3 output blocks
  823. pxor $inout0,$inout0 # clear register bank
  824. movups $inout1,0x10($out)
  825. pxor $inout1,$inout1
  826. movups $inout2,0x20($out)
  827. pxor $inout2,$inout2
  828. jmp .Lecb_ret
  829. .align 16
  830. .Lecb_dec_four:
  831. call _aesni_decrypt4
  832. movups $inout0,($out) # store 4 output blocks
  833. pxor $inout0,$inout0 # clear register bank
  834. movups $inout1,0x10($out)
  835. pxor $inout1,$inout1
  836. movups $inout2,0x20($out)
  837. pxor $inout2,$inout2
  838. movups $inout3,0x30($out)
  839. pxor $inout3,$inout3
  840. jmp .Lecb_ret
  841. .align 16
  842. .Lecb_dec_five:
  843. xorps $inout5,$inout5
  844. call _aesni_decrypt6
  845. movups $inout0,($out) # store 5 output blocks
  846. pxor $inout0,$inout0 # clear register bank
  847. movups $inout1,0x10($out)
  848. pxor $inout1,$inout1
  849. movups $inout2,0x20($out)
  850. pxor $inout2,$inout2
  851. movups $inout3,0x30($out)
  852. pxor $inout3,$inout3
  853. movups $inout4,0x40($out)
  854. pxor $inout4,$inout4
  855. pxor $inout5,$inout5
  856. jmp .Lecb_ret
  857. .align 16
  858. .Lecb_dec_six:
  859. call _aesni_decrypt6
  860. movups $inout0,($out) # store 6 output blocks
  861. pxor $inout0,$inout0 # clear register bank
  862. movups $inout1,0x10($out)
  863. pxor $inout1,$inout1
  864. movups $inout2,0x20($out)
  865. pxor $inout2,$inout2
  866. movups $inout3,0x30($out)
  867. pxor $inout3,$inout3
  868. movups $inout4,0x40($out)
  869. pxor $inout4,$inout4
  870. movups $inout5,0x50($out)
  871. pxor $inout5,$inout5
  872. .Lecb_ret:
  873. xorps $rndkey0,$rndkey0 # %xmm0
  874. pxor $rndkey1,$rndkey1
  875. ___
  876. $code.=<<___ if ($win64);
  877. movaps (%rsp),%xmm6
  878. movaps %xmm0,(%rsp) # clear stack
  879. movaps 0x10(%rsp),%xmm7
  880. movaps %xmm0,0x10(%rsp)
  881. movaps 0x20(%rsp),%xmm8
  882. movaps %xmm0,0x20(%rsp)
  883. movaps 0x30(%rsp),%xmm9
  884. movaps %xmm0,0x30(%rsp)
  885. lea 0x58(%rsp),%rsp
  886. .Lecb_enc_ret:
  887. ___
  888. $code.=<<___;
  889. ret
  890. .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
  891. ___
  892. {
  893. ######################################################################
  894. # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
  895. # size_t blocks, const AES_KEY *key,
  896. # const char *ivec,char *cmac);
  897. #
  898. # Handles only complete blocks, operates on 64-bit counter and
  899. # does not update *ivec! Nor does it finalize CMAC value
  900. # (see engine/eng_aesni.c for details)
  901. #
  902. {
  903. my $cmac="%r9"; # 6th argument
  904. my $increment="%xmm9";
  905. my $iv="%xmm6";
  906. my $bswap_mask="%xmm7";
  907. $code.=<<___;
  908. .globl aesni_ccm64_encrypt_blocks
  909. .type aesni_ccm64_encrypt_blocks,\@function,6
  910. .align 16
  911. aesni_ccm64_encrypt_blocks:
  912. ___
  913. $code.=<<___ if ($win64);
  914. lea -0x58(%rsp),%rsp
  915. movaps %xmm6,(%rsp) # $iv
  916. movaps %xmm7,0x10(%rsp) # $bswap_mask
  917. movaps %xmm8,0x20(%rsp) # $in0
  918. movaps %xmm9,0x30(%rsp) # $increment
  919. .Lccm64_enc_body:
  920. ___
  921. $code.=<<___;
  922. mov 240($key),$rounds # key->rounds
  923. movdqu ($ivp),$iv
  924. movdqa .Lincrement64(%rip),$increment
  925. movdqa .Lbswap_mask(%rip),$bswap_mask
  926. shl \$4,$rounds
  927. mov \$16,$rnds_
  928. lea 0($key),$key_
  929. movdqu ($cmac),$inout1
  930. movdqa $iv,$inout0
  931. lea 32($key,$rounds),$key # end of key schedule
  932. pshufb $bswap_mask,$iv
  933. sub %rax,%r10 # twisted $rounds
  934. jmp .Lccm64_enc_outer
  935. .align 16
  936. .Lccm64_enc_outer:
  937. $movkey ($key_),$rndkey0
  938. mov %r10,%rax
  939. movups ($inp),$in0 # load inp
  940. xorps $rndkey0,$inout0 # counter
  941. $movkey 16($key_),$rndkey1
  942. xorps $in0,$rndkey0
  943. xorps $rndkey0,$inout1 # cmac^=inp
  944. $movkey 32($key_),$rndkey0
  945. .Lccm64_enc2_loop:
  946. aesenc $rndkey1,$inout0
  947. aesenc $rndkey1,$inout1
  948. $movkey ($key,%rax),$rndkey1
  949. add \$32,%rax
  950. aesenc $rndkey0,$inout0
  951. aesenc $rndkey0,$inout1
  952. $movkey -16($key,%rax),$rndkey0
  953. jnz .Lccm64_enc2_loop
  954. aesenc $rndkey1,$inout0
  955. aesenc $rndkey1,$inout1
  956. paddq $increment,$iv
  957. dec $len # $len-- ($len is in blocks)
  958. aesenclast $rndkey0,$inout0
  959. aesenclast $rndkey0,$inout1
  960. lea 16($inp),$inp
  961. xorps $inout0,$in0 # inp ^= E(iv)
  962. movdqa $iv,$inout0
  963. movups $in0,($out) # save output
  964. pshufb $bswap_mask,$inout0
  965. lea 16($out),$out # $out+=16
  966. jnz .Lccm64_enc_outer # loop if ($len!=0)
  967. pxor $rndkey0,$rndkey0 # clear register bank
  968. pxor $rndkey1,$rndkey1
  969. pxor $inout0,$inout0
  970. movups $inout1,($cmac) # store resulting mac
  971. pxor $inout1,$inout1
  972. pxor $in0,$in0
  973. pxor $iv,$iv
  974. ___
  975. $code.=<<___ if ($win64);
  976. movaps (%rsp),%xmm6
  977. movaps %xmm0,(%rsp) # clear stack
  978. movaps 0x10(%rsp),%xmm7
  979. movaps %xmm0,0x10(%rsp)
  980. movaps 0x20(%rsp),%xmm8
  981. movaps %xmm0,0x20(%rsp)
  982. movaps 0x30(%rsp),%xmm9
  983. movaps %xmm0,0x30(%rsp)
  984. lea 0x58(%rsp),%rsp
  985. .Lccm64_enc_ret:
  986. ___
  987. $code.=<<___;
  988. ret
  989. .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
  990. ___
  991. ######################################################################
  992. $code.=<<___;
  993. .globl aesni_ccm64_decrypt_blocks
  994. .type aesni_ccm64_decrypt_blocks,\@function,6
  995. .align 16
  996. aesni_ccm64_decrypt_blocks:
  997. ___
  998. $code.=<<___ if ($win64);
  999. lea -0x58(%rsp),%rsp
  1000. movaps %xmm6,(%rsp) # $iv
  1001. movaps %xmm7,0x10(%rsp) # $bswap_mask
  1002. movaps %xmm8,0x20(%rsp) # $in8
  1003. movaps %xmm9,0x30(%rsp) # $increment
  1004. .Lccm64_dec_body:
  1005. ___
  1006. $code.=<<___;
  1007. mov 240($key),$rounds # key->rounds
  1008. movups ($ivp),$iv
  1009. movdqu ($cmac),$inout1
  1010. movdqa .Lincrement64(%rip),$increment
  1011. movdqa .Lbswap_mask(%rip),$bswap_mask
  1012. movaps $iv,$inout0
  1013. mov $rounds,$rnds_
  1014. mov $key,$key_
  1015. pshufb $bswap_mask,$iv
  1016. ___
  1017. &aesni_generate1("enc",$key,$rounds);
  1018. $code.=<<___;
  1019. shl \$4,$rnds_
  1020. mov \$16,$rounds
  1021. movups ($inp),$in0 # load inp
  1022. paddq $increment,$iv
  1023. lea 16($inp),$inp # $inp+=16
  1024. sub %r10,%rax # twisted $rounds
  1025. lea 32($key_,$rnds_),$key # end of key schedule
  1026. mov %rax,%r10
  1027. jmp .Lccm64_dec_outer
  1028. .align 16
  1029. .Lccm64_dec_outer:
  1030. xorps $inout0,$in0 # inp ^= E(iv)
  1031. movdqa $iv,$inout0
  1032. movups $in0,($out) # save output
  1033. lea 16($out),$out # $out+=16
  1034. pshufb $bswap_mask,$inout0
  1035. sub \$1,$len # $len-- ($len is in blocks)
  1036. jz .Lccm64_dec_break # if ($len==0) break
  1037. $movkey ($key_),$rndkey0
  1038. mov %r10,%rax
  1039. $movkey 16($key_),$rndkey1
  1040. xorps $rndkey0,$in0
  1041. xorps $rndkey0,$inout0
  1042. xorps $in0,$inout1 # cmac^=out
  1043. $movkey 32($key_),$rndkey0
  1044. jmp .Lccm64_dec2_loop
  1045. .align 16
  1046. .Lccm64_dec2_loop:
  1047. aesenc $rndkey1,$inout0
  1048. aesenc $rndkey1,$inout1
  1049. $movkey ($key,%rax),$rndkey1
  1050. add \$32,%rax
  1051. aesenc $rndkey0,$inout0
  1052. aesenc $rndkey0,$inout1
  1053. $movkey -16($key,%rax),$rndkey0
  1054. jnz .Lccm64_dec2_loop
  1055. movups ($inp),$in0 # load input
  1056. paddq $increment,$iv
  1057. aesenc $rndkey1,$inout0
  1058. aesenc $rndkey1,$inout1
  1059. aesenclast $rndkey0,$inout0
  1060. aesenclast $rndkey0,$inout1
  1061. lea 16($inp),$inp # $inp+=16
  1062. jmp .Lccm64_dec_outer
  1063. .align 16
  1064. .Lccm64_dec_break:
  1065. #xorps $in0,$inout1 # cmac^=out
  1066. mov 240($key_),$rounds
  1067. ___
  1068. &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
  1069. $code.=<<___;
  1070. pxor $rndkey0,$rndkey0 # clear register bank
  1071. pxor $rndkey1,$rndkey1
  1072. pxor $inout0,$inout0
  1073. movups $inout1,($cmac) # store resulting mac
  1074. pxor $inout1,$inout1
  1075. pxor $in0,$in0
  1076. pxor $iv,$iv
  1077. ___
  1078. $code.=<<___ if ($win64);
  1079. movaps (%rsp),%xmm6
  1080. movaps %xmm0,(%rsp) # clear stack
  1081. movaps 0x10(%rsp),%xmm7
  1082. movaps %xmm0,0x10(%rsp)
  1083. movaps 0x20(%rsp),%xmm8
  1084. movaps %xmm0,0x20(%rsp)
  1085. movaps 0x30(%rsp),%xmm9
  1086. movaps %xmm0,0x30(%rsp)
  1087. lea 0x58(%rsp),%rsp
  1088. .Lccm64_dec_ret:
  1089. ___
  1090. $code.=<<___;
  1091. ret
  1092. .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
  1093. ___
  1094. }
  1095. ######################################################################
  1096. # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
  1097. # size_t blocks, const AES_KEY *key,
  1098. # const char *ivec);
  1099. #
  1100. # Handles only complete blocks, operates on 32-bit counter and
  1101. # does not update *ivec! (see crypto/modes/ctr128.c for details)
  1102. #
  1103. # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
  1104. # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
  1105. # Keywords are full unroll and modulo-schedule counter calculations
  1106. # with zero-round key xor.
  1107. {
  1108. my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
  1109. my ($key0,$ctr)=("${key_}d","${ivp}d");
  1110. my $frame_size = 0x80 + ($win64?160:0);
  1111. $code.=<<___;
  1112. .globl aesni_ctr32_encrypt_blocks
  1113. .type aesni_ctr32_encrypt_blocks,\@function,5
  1114. .align 16
  1115. aesni_ctr32_encrypt_blocks:
  1116. cmp \$1,$len
  1117. jne .Lctr32_bulk
  1118. # handle single block without allocating stack frame,
  1119. # useful when handling edges
  1120. movups ($ivp),$inout0
  1121. movups ($inp),$inout1
  1122. mov 240($key),%edx # key->rounds
  1123. ___
  1124. &aesni_generate1("enc",$key,"%edx");
  1125. $code.=<<___;
  1126. pxor $rndkey0,$rndkey0 # clear register bank
  1127. pxor $rndkey1,$rndkey1
  1128. xorps $inout1,$inout0
  1129. pxor $inout1,$inout1
  1130. movups $inout0,($out)
  1131. xorps $inout0,$inout0
  1132. jmp .Lctr32_epilogue
  1133. .align 16
  1134. .Lctr32_bulk:
  1135. lea (%rsp),%rax
  1136. push %rbp
  1137. sub \$$frame_size,%rsp
  1138. and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
  1139. ___
  1140. $code.=<<___ if ($win64);
  1141. movaps %xmm6,-0xa8(%rax) # offload everything
  1142. movaps %xmm7,-0x98(%rax)
  1143. movaps %xmm8,-0x88(%rax)
  1144. movaps %xmm9,-0x78(%rax)
  1145. movaps %xmm10,-0x68(%rax)
  1146. movaps %xmm11,-0x58(%rax)
  1147. movaps %xmm12,-0x48(%rax)
  1148. movaps %xmm13,-0x38(%rax)
  1149. movaps %xmm14,-0x28(%rax)
  1150. movaps %xmm15,-0x18(%rax)
  1151. .Lctr32_body:
  1152. ___
  1153. $code.=<<___;
  1154. lea -8(%rax),%rbp
  1155. # 8 16-byte words on top of stack are counter values
  1156. # xor-ed with zero-round key
  1157. movdqu ($ivp),$inout0
  1158. movdqu ($key),$rndkey0
  1159. mov 12($ivp),$ctr # counter LSB
  1160. pxor $rndkey0,$inout0
  1161. mov 12($key),$key0 # 0-round key LSB
  1162. movdqa $inout0,0x00(%rsp) # populate counter block
  1163. bswap $ctr
  1164. movdqa $inout0,$inout1
  1165. movdqa $inout0,$inout2
  1166. movdqa $inout0,$inout3
  1167. movdqa $inout0,0x40(%rsp)
  1168. movdqa $inout0,0x50(%rsp)
  1169. movdqa $inout0,0x60(%rsp)
  1170. mov %rdx,%r10 # about to borrow %rdx
  1171. movdqa $inout0,0x70(%rsp)
  1172. lea 1($ctr),%rax
  1173. lea 2($ctr),%rdx
  1174. bswap %eax
  1175. bswap %edx
  1176. xor $key0,%eax
  1177. xor $key0,%edx
  1178. pinsrd \$3,%eax,$inout1
  1179. lea 3($ctr),%rax
  1180. movdqa $inout1,0x10(%rsp)
  1181. pinsrd \$3,%edx,$inout2
  1182. bswap %eax
  1183. mov %r10,%rdx # restore %rdx
  1184. lea 4($ctr),%r10
  1185. movdqa $inout2,0x20(%rsp)
  1186. xor $key0,%eax
  1187. bswap %r10d
  1188. pinsrd \$3,%eax,$inout3
  1189. xor $key0,%r10d
  1190. movdqa $inout3,0x30(%rsp)
  1191. lea 5($ctr),%r9
  1192. mov %r10d,0x40+12(%rsp)
  1193. bswap %r9d
  1194. lea 6($ctr),%r10
  1195. mov 240($key),$rounds # key->rounds
  1196. xor $key0,%r9d
  1197. bswap %r10d
  1198. mov %r9d,0x50+12(%rsp)
  1199. xor $key0,%r10d
  1200. lea 7($ctr),%r9
  1201. mov %r10d,0x60+12(%rsp)
  1202. bswap %r9d
  1203. mov OPENSSL_ia32cap_P+4(%rip),%r10d
  1204. xor $key0,%r9d
  1205. and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
  1206. mov %r9d,0x70+12(%rsp)
  1207. $movkey 0x10($key),$rndkey1
  1208. movdqa 0x40(%rsp),$inout4
  1209. movdqa 0x50(%rsp),$inout5
  1210. cmp \$8,$len # $len is in blocks
  1211. jb .Lctr32_tail # short input if ($len<8)
  1212. sub \$6,$len # $len is biased by -6
  1213. cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
  1214. je .Lctr32_6x # [which denotes Atom Silvermont]
  1215. lea 0x80($key),$key # size optimization
  1216. sub \$2,$len # $len is biased by -8
  1217. jmp .Lctr32_loop8
  1218. .align 16
  1219. .Lctr32_6x:
  1220. shl \$4,$rounds
  1221. mov \$48,$rnds_
  1222. bswap $key0
  1223. lea 32($key,$rounds),$key # end of key schedule
  1224. sub %rax,%r10 # twisted $rounds
  1225. jmp .Lctr32_loop6
  1226. .align 16
  1227. .Lctr32_loop6:
  1228. add \$6,$ctr # next counter value
  1229. $movkey -48($key,$rnds_),$rndkey0
  1230. aesenc $rndkey1,$inout0
  1231. mov $ctr,%eax
  1232. xor $key0,%eax
  1233. aesenc $rndkey1,$inout1
  1234. movbe %eax,`0x00+12`(%rsp) # store next counter value
  1235. lea 1($ctr),%eax
  1236. aesenc $rndkey1,$inout2
  1237. xor $key0,%eax
  1238. movbe %eax,`0x10+12`(%rsp)
  1239. aesenc $rndkey1,$inout3
  1240. lea 2($ctr),%eax
  1241. xor $key0,%eax
  1242. aesenc $rndkey1,$inout4
  1243. movbe %eax,`0x20+12`(%rsp)
  1244. lea 3($ctr),%eax
  1245. aesenc $rndkey1,$inout5
  1246. $movkey -32($key,$rnds_),$rndkey1
  1247. xor $key0,%eax
  1248. aesenc $rndkey0,$inout0
  1249. movbe %eax,`0x30+12`(%rsp)
  1250. lea 4($ctr),%eax
  1251. aesenc $rndkey0,$inout1
  1252. xor $key0,%eax
  1253. movbe %eax,`0x40+12`(%rsp)
  1254. aesenc $rndkey0,$inout2
  1255. lea 5($ctr),%eax
  1256. xor $key0,%eax
  1257. aesenc $rndkey0,$inout3
  1258. movbe %eax,`0x50+12`(%rsp)
  1259. mov %r10,%rax # mov $rnds_,$rounds
  1260. aesenc $rndkey0,$inout4
  1261. aesenc $rndkey0,$inout5
  1262. $movkey -16($key,$rnds_),$rndkey0
  1263. call .Lenc_loop6
  1264. movdqu ($inp),$inout6 # load 6 input blocks
  1265. movdqu 0x10($inp),$inout7
  1266. movdqu 0x20($inp),$in0
  1267. movdqu 0x30($inp),$in1
  1268. movdqu 0x40($inp),$in2
  1269. movdqu 0x50($inp),$in3
  1270. lea 0x60($inp),$inp # $inp+=6*16
  1271. $movkey -64($key,$rnds_),$rndkey1
  1272. pxor $inout0,$inout6 # inp^=E(ctr)
  1273. movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
  1274. pxor $inout1,$inout7
  1275. movaps 0x10(%rsp),$inout1
  1276. pxor $inout2,$in0
  1277. movaps 0x20(%rsp),$inout2
  1278. pxor $inout3,$in1
  1279. movaps 0x30(%rsp),$inout3
  1280. pxor $inout4,$in2
  1281. movaps 0x40(%rsp),$inout4
  1282. pxor $inout5,$in3
  1283. movaps 0x50(%rsp),$inout5
  1284. movdqu $inout6,($out) # store 6 output blocks
  1285. movdqu $inout7,0x10($out)
  1286. movdqu $in0,0x20($out)
  1287. movdqu $in1,0x30($out)
  1288. movdqu $in2,0x40($out)
  1289. movdqu $in3,0x50($out)
  1290. lea 0x60($out),$out # $out+=6*16
  1291. sub \$6,$len
  1292. jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
  1293. add \$6,$len # restore real remaining $len
  1294. jz .Lctr32_done # done if ($len==0)
  1295. lea -48($rnds_),$rounds
  1296. lea -80($key,$rnds_),$key # restore $key
  1297. neg $rounds
  1298. shr \$4,$rounds # restore $rounds
  1299. jmp .Lctr32_tail
  1300. .align 32
  1301. .Lctr32_loop8:
  1302. add \$8,$ctr # next counter value
  1303. movdqa 0x60(%rsp),$inout6
  1304. aesenc $rndkey1,$inout0
  1305. mov $ctr,%r9d
  1306. movdqa 0x70(%rsp),$inout7
  1307. aesenc $rndkey1,$inout1
  1308. bswap %r9d
  1309. $movkey 0x20-0x80($key),$rndkey0
  1310. aesenc $rndkey1,$inout2
  1311. xor $key0,%r9d
  1312. nop
  1313. aesenc $rndkey1,$inout3
  1314. mov %r9d,0x00+12(%rsp) # store next counter value
  1315. lea 1($ctr),%r9
  1316. aesenc $rndkey1,$inout4
  1317. aesenc $rndkey1,$inout5
  1318. aesenc $rndkey1,$inout6
  1319. aesenc $rndkey1,$inout7
  1320. $movkey 0x30-0x80($key),$rndkey1
  1321. ___
  1322. for($i=2;$i<8;$i++) {
  1323. my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
  1324. $code.=<<___;
  1325. bswap %r9d
  1326. aesenc $rndkeyx,$inout0
  1327. aesenc $rndkeyx,$inout1
  1328. xor $key0,%r9d
  1329. .byte 0x66,0x90
  1330. aesenc $rndkeyx,$inout2
  1331. aesenc $rndkeyx,$inout3
  1332. mov %r9d,`0x10*($i-1)`+12(%rsp)
  1333. lea $i($ctr),%r9
  1334. aesenc $rndkeyx,$inout4
  1335. aesenc $rndkeyx,$inout5
  1336. aesenc $rndkeyx,$inout6
  1337. aesenc $rndkeyx,$inout7
  1338. $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
  1339. ___
  1340. }
  1341. $code.=<<___;
  1342. bswap %r9d
  1343. aesenc $rndkey0,$inout0
  1344. aesenc $rndkey0,$inout1
  1345. aesenc $rndkey0,$inout2
  1346. xor $key0,%r9d
  1347. movdqu 0x00($inp),$in0 # start loading input
  1348. aesenc $rndkey0,$inout3
  1349. mov %r9d,0x70+12(%rsp)
  1350. cmp \$11,$rounds
  1351. aesenc $rndkey0,$inout4
  1352. aesenc $rndkey0,$inout5
  1353. aesenc $rndkey0,$inout6
  1354. aesenc $rndkey0,$inout7
  1355. $movkey 0xa0-0x80($key),$rndkey0
  1356. jb .Lctr32_enc_done
  1357. aesenc $rndkey1,$inout0
  1358. aesenc $rndkey1,$inout1
  1359. aesenc $rndkey1,$inout2
  1360. aesenc $rndkey1,$inout3
  1361. aesenc $rndkey1,$inout4
  1362. aesenc $rndkey1,$inout5
  1363. aesenc $rndkey1,$inout6
  1364. aesenc $rndkey1,$inout7
  1365. $movkey 0xb0-0x80($key),$rndkey1
  1366. aesenc $rndkey0,$inout0
  1367. aesenc $rndkey0,$inout1
  1368. aesenc $rndkey0,$inout2
  1369. aesenc $rndkey0,$inout3
  1370. aesenc $rndkey0,$inout4
  1371. aesenc $rndkey0,$inout5
  1372. aesenc $rndkey0,$inout6
  1373. aesenc $rndkey0,$inout7
  1374. $movkey 0xc0-0x80($key),$rndkey0
  1375. je .Lctr32_enc_done
  1376. aesenc $rndkey1,$inout0
  1377. aesenc $rndkey1,$inout1
  1378. aesenc $rndkey1,$inout2
  1379. aesenc $rndkey1,$inout3
  1380. aesenc $rndkey1,$inout4
  1381. aesenc $rndkey1,$inout5
  1382. aesenc $rndkey1,$inout6
  1383. aesenc $rndkey1,$inout7
  1384. $movkey 0xd0-0x80($key),$rndkey1
  1385. aesenc $rndkey0,$inout0
  1386. aesenc $rndkey0,$inout1
  1387. aesenc $rndkey0,$inout2
  1388. aesenc $rndkey0,$inout3
  1389. aesenc $rndkey0,$inout4
  1390. aesenc $rndkey0,$inout5
  1391. aesenc $rndkey0,$inout6
  1392. aesenc $rndkey0,$inout7
  1393. $movkey 0xe0-0x80($key),$rndkey0
  1394. jmp .Lctr32_enc_done
  1395. .align 16
  1396. .Lctr32_enc_done:
  1397. movdqu 0x10($inp),$in1
  1398. pxor $rndkey0,$in0 # input^=round[last]
  1399. movdqu 0x20($inp),$in2
  1400. pxor $rndkey0,$in1
  1401. movdqu 0x30($inp),$in3
  1402. pxor $rndkey0,$in2
  1403. movdqu 0x40($inp),$in4
  1404. pxor $rndkey0,$in3
  1405. movdqu 0x50($inp),$in5
  1406. pxor $rndkey0,$in4
  1407. pxor $rndkey0,$in5
  1408. aesenc $rndkey1,$inout0
  1409. aesenc $rndkey1,$inout1
  1410. aesenc $rndkey1,$inout2
  1411. aesenc $rndkey1,$inout3
  1412. aesenc $rndkey1,$inout4
  1413. aesenc $rndkey1,$inout5
  1414. aesenc $rndkey1,$inout6
  1415. aesenc $rndkey1,$inout7
  1416. movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
  1417. lea 0x80($inp),$inp # $inp+=8*16
  1418. aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
  1419. pxor $rndkey0,$rndkey1 # borrowed $rndkey
  1420. movdqu 0x70-0x80($inp),$in0
  1421. aesenclast $in1,$inout1
  1422. pxor $rndkey0,$in0
  1423. movdqa 0x00(%rsp),$in1 # load next counter block
  1424. aesenclast $in2,$inout2
  1425. aesenclast $in3,$inout3
  1426. movdqa 0x10(%rsp),$in2
  1427. movdqa 0x20(%rsp),$in3
  1428. aesenclast $in4,$inout4
  1429. aesenclast $in5,$inout5
  1430. movdqa 0x30(%rsp),$in4
  1431. movdqa 0x40(%rsp),$in5
  1432. aesenclast $rndkey1,$inout6
  1433. movdqa 0x50(%rsp),$rndkey0
  1434. $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
  1435. aesenclast $in0,$inout7
  1436. movups $inout0,($out) # store 8 output blocks
  1437. movdqa $in1,$inout0
  1438. movups $inout1,0x10($out)
  1439. movdqa $in2,$inout1
  1440. movups $inout2,0x20($out)
  1441. movdqa $in3,$inout2
  1442. movups $inout3,0x30($out)
  1443. movdqa $in4,$inout3
  1444. movups $inout4,0x40($out)
  1445. movdqa $in5,$inout4
  1446. movups $inout5,0x50($out)
  1447. movdqa $rndkey0,$inout5
  1448. movups $inout6,0x60($out)
  1449. movups $inout7,0x70($out)
  1450. lea 0x80($out),$out # $out+=8*16
  1451. sub \$8,$len
  1452. jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
  1453. add \$8,$len # restore real remainig $len
  1454. jz .Lctr32_done # done if ($len==0)
  1455. lea -0x80($key),$key
  1456. .Lctr32_tail:
  1457. # note that at this point $inout0..5 are populated with
  1458. # counter values xor-ed with 0-round key
  1459. lea 16($key),$key
  1460. cmp \$4,$len
  1461. jb .Lctr32_loop3
  1462. je .Lctr32_loop4
  1463. # if ($len>4) compute 7 E(counter)
  1464. shl \$4,$rounds
  1465. movdqa 0x60(%rsp),$inout6
  1466. pxor $inout7,$inout7
  1467. $movkey 16($key),$rndkey0
  1468. aesenc $rndkey1,$inout0
  1469. aesenc $rndkey1,$inout1
  1470. lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
  1471. neg %rax
  1472. aesenc $rndkey1,$inout2
  1473. add \$16,%rax # prepare for .Lenc_loop8_enter
  1474. movups ($inp),$in0
  1475. aesenc $rndkey1,$inout3
  1476. aesenc $rndkey1,$inout4
  1477. movups 0x10($inp),$in1 # pre-load input
  1478. movups 0x20($inp),$in2
  1479. aesenc $rndkey1,$inout5
  1480. aesenc $rndkey1,$inout6
  1481. call .Lenc_loop8_enter
  1482. movdqu 0x30($inp),$in3
  1483. pxor $in0,$inout0
  1484. movdqu 0x40($inp),$in0
  1485. pxor $in1,$inout1
  1486. movdqu $inout0,($out) # store output
  1487. pxor $in2,$inout2
  1488. movdqu $inout1,0x10($out)
  1489. pxor $in3,$inout3
  1490. movdqu $inout2,0x20($out)
  1491. pxor $in0,$inout4
  1492. movdqu $inout3,0x30($out)
  1493. movdqu $inout4,0x40($out)
  1494. cmp \$6,$len
  1495. jb .Lctr32_done # $len was 5, stop store
  1496. movups 0x50($inp),$in1
  1497. xorps $in1,$inout5
  1498. movups $inout5,0x50($out)
  1499. je .Lctr32_done # $len was 6, stop store
  1500. movups 0x60($inp),$in2
  1501. xorps $in2,$inout6
  1502. movups $inout6,0x60($out)
  1503. jmp .Lctr32_done # $len was 7, stop store
  1504. .align 32
  1505. .Lctr32_loop4:
  1506. aesenc $rndkey1,$inout0
  1507. lea 16($key),$key
  1508. dec $rounds
  1509. aesenc $rndkey1,$inout1
  1510. aesenc $rndkey1,$inout2
  1511. aesenc $rndkey1,$inout3
  1512. $movkey ($key),$rndkey1
  1513. jnz .Lctr32_loop4
  1514. aesenclast $rndkey1,$inout0
  1515. aesenclast $rndkey1,$inout1
  1516. movups ($inp),$in0 # load input
  1517. movups 0x10($inp),$in1
  1518. aesenclast $rndkey1,$inout2
  1519. aesenclast $rndkey1,$inout3
  1520. movups 0x20($inp),$in2
  1521. movups 0x30($inp),$in3
  1522. xorps $in0,$inout0
  1523. movups $inout0,($out) # store output
  1524. xorps $in1,$inout1
  1525. movups $inout1,0x10($out)
  1526. pxor $in2,$inout2
  1527. movdqu $inout2,0x20($out)
  1528. pxor $in3,$inout3
  1529. movdqu $inout3,0x30($out)
  1530. jmp .Lctr32_done # $len was 4, stop store
  1531. .align 32
  1532. .Lctr32_loop3:
  1533. aesenc $rndkey1,$inout0
  1534. lea 16($key),$key
  1535. dec $rounds
  1536. aesenc $rndkey1,$inout1
  1537. aesenc $rndkey1,$inout2
  1538. $movkey ($key),$rndkey1
  1539. jnz .Lctr32_loop3
  1540. aesenclast $rndkey1,$inout0
  1541. aesenclast $rndkey1,$inout1
  1542. aesenclast $rndkey1,$inout2
  1543. movups ($inp),$in0 # load input
  1544. xorps $in0,$inout0
  1545. movups $inout0,($out) # store output
  1546. cmp \$2,$len
  1547. jb .Lctr32_done # $len was 1, stop store
  1548. movups 0x10($inp),$in1
  1549. xorps $in1,$inout1
  1550. movups $inout1,0x10($out)
  1551. je .Lctr32_done # $len was 2, stop store
  1552. movups 0x20($inp),$in2
  1553. xorps $in2,$inout2
  1554. movups $inout2,0x20($out) # $len was 3, stop store
  1555. .Lctr32_done:
  1556. xorps %xmm0,%xmm0 # clear regiser bank
  1557. xor $key0,$key0
  1558. pxor %xmm1,%xmm1
  1559. pxor %xmm2,%xmm2
  1560. pxor %xmm3,%xmm3
  1561. pxor %xmm4,%xmm4
  1562. pxor %xmm5,%xmm5
  1563. ___
  1564. $code.=<<___ if (!$win64);
  1565. pxor %xmm6,%xmm6
  1566. pxor %xmm7,%xmm7
  1567. movaps %xmm0,0x00(%rsp) # clear stack
  1568. pxor %xmm8,%xmm8
  1569. movaps %xmm0,0x10(%rsp)
  1570. pxor %xmm9,%xmm9
  1571. movaps %xmm0,0x20(%rsp)
  1572. pxor %xmm10,%xmm10
  1573. movaps %xmm0,0x30(%rsp)
  1574. pxor %xmm11,%xmm11
  1575. movaps %xmm0,0x40(%rsp)
  1576. pxor %xmm12,%xmm12
  1577. movaps %xmm0,0x50(%rsp)
  1578. pxor %xmm13,%xmm13
  1579. movaps %xmm0,0x60(%rsp)
  1580. pxor %xmm14,%xmm14
  1581. movaps %xmm0,0x70(%rsp)
  1582. pxor %xmm15,%xmm15
  1583. ___
  1584. $code.=<<___ if ($win64);
  1585. movaps -0xa0(%rbp),%xmm6
  1586. movaps %xmm0,-0xa0(%rbp) # clear stack
  1587. movaps -0x90(%rbp),%xmm7
  1588. movaps %xmm0,-0x90(%rbp)
  1589. movaps -0x80(%rbp),%xmm8
  1590. movaps %xmm0,-0x80(%rbp)
  1591. movaps -0x70(%rbp),%xmm9
  1592. movaps %xmm0,-0x70(%rbp)
  1593. movaps -0x60(%rbp),%xmm10
  1594. movaps %xmm0,-0x60(%rbp)
  1595. movaps -0x50(%rbp),%xmm11
  1596. movaps %xmm0,-0x50(%rbp)
  1597. movaps -0x40(%rbp),%xmm12
  1598. movaps %xmm0,-0x40(%rbp)
  1599. movaps -0x30(%rbp),%xmm13
  1600. movaps %xmm0,-0x30(%rbp)
  1601. movaps -0x20(%rbp),%xmm14
  1602. movaps %xmm0,-0x20(%rbp)
  1603. movaps -0x10(%rbp),%xmm15
  1604. movaps %xmm0,-0x10(%rbp)
  1605. movaps %xmm0,0x00(%rsp)
  1606. movaps %xmm0,0x10(%rsp)
  1607. movaps %xmm0,0x20(%rsp)
  1608. movaps %xmm0,0x30(%rsp)
  1609. movaps %xmm0,0x40(%rsp)
  1610. movaps %xmm0,0x50(%rsp)
  1611. movaps %xmm0,0x60(%rsp)
  1612. movaps %xmm0,0x70(%rsp)
  1613. ___
  1614. $code.=<<___;
  1615. lea (%rbp),%rsp
  1616. pop %rbp
  1617. .Lctr32_epilogue:
  1618. ret
  1619. .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
  1620. ___
  1621. }
  1622. ######################################################################
  1623. # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
  1624. # const AES_KEY *key1, const AES_KEY *key2,
  1625. # const unsigned char iv[16]);
  1626. #
  1627. {
  1628. my @tweak=map("%xmm$_",(10..15));
  1629. my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
  1630. my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
  1631. my $frame_size = 0x70 + ($win64?160:0);
  1632. $code.=<<___;
  1633. .globl aesni_xts_encrypt
  1634. .type aesni_xts_encrypt,\@function,6
  1635. .align 16
  1636. aesni_xts_encrypt:
  1637. lea (%rsp),%rax
  1638. push %rbp
  1639. sub \$$frame_size,%rsp
  1640. and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
  1641. ___
  1642. $code.=<<___ if ($win64);
  1643. movaps %xmm6,-0xa8(%rax) # offload everything
  1644. movaps %xmm7,-0x98(%rax)
  1645. movaps %xmm8,-0x88(%rax)
  1646. movaps %xmm9,-0x78(%rax)
  1647. movaps %xmm10,-0x68(%rax)
  1648. movaps %xmm11,-0x58(%rax)
  1649. movaps %xmm12,-0x48(%rax)
  1650. movaps %xmm13,-0x38(%rax)
  1651. movaps %xmm14,-0x28(%rax)
  1652. movaps %xmm15,-0x18(%rax)
  1653. .Lxts_enc_body:
  1654. ___
  1655. $code.=<<___;
  1656. lea -8(%rax),%rbp
  1657. movups ($ivp),$inout0 # load clear-text tweak
  1658. mov 240(%r8),$rounds # key2->rounds
  1659. mov 240($key),$rnds_ # key1->rounds
  1660. ___
  1661. # generate the tweak
  1662. &aesni_generate1("enc",$key2,$rounds,$inout0);
  1663. $code.=<<___;
  1664. $movkey ($key),$rndkey0 # zero round key
  1665. mov $key,$key_ # backup $key
  1666. mov $rnds_,$rounds # backup $rounds
  1667. shl \$4,$rnds_
  1668. mov $len,$len_ # backup $len
  1669. and \$-16,$len
  1670. $movkey 16($key,$rnds_),$rndkey1 # last round key
  1671. movdqa .Lxts_magic(%rip),$twmask
  1672. movdqa $inout0,@tweak[5]
  1673. pshufd \$0x5f,$inout0,$twres
  1674. pxor $rndkey0,$rndkey1
  1675. ___
  1676. # alternative tweak calculation algorithm is based on suggestions
  1677. # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
  1678. # and should help in the future...
  1679. for ($i=0;$i<4;$i++) {
  1680. $code.=<<___;
  1681. movdqa $twres,$twtmp
  1682. paddd $twres,$twres
  1683. movdqa @tweak[5],@tweak[$i]
  1684. psrad \$31,$twtmp # broadcast upper bits
  1685. paddq @tweak[5],@tweak[5]
  1686. pand $twmask,$twtmp
  1687. pxor $rndkey0,@tweak[$i]
  1688. pxor $twtmp,@tweak[5]
  1689. ___
  1690. }
  1691. $code.=<<___;
  1692. movdqa @tweak[5],@tweak[4]
  1693. psrad \$31,$twres
  1694. paddq @tweak[5],@tweak[5]
  1695. pand $twmask,$twres
  1696. pxor $rndkey0,@tweak[4]
  1697. pxor $twres,@tweak[5]
  1698. movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
  1699. sub \$16*6,$len
  1700. jc .Lxts_enc_short # if $len-=6*16 borrowed
  1701. mov \$16+96,$rounds
  1702. lea 32($key_,$rnds_),$key # end of key schedule
  1703. sub %r10,%rax # twisted $rounds
  1704. $movkey 16($key_),$rndkey1
  1705. mov %rax,%r10 # backup twisted $rounds
  1706. lea .Lxts_magic(%rip),%r8
  1707. jmp .Lxts_enc_grandloop
  1708. .align 32
  1709. .Lxts_enc_grandloop:
  1710. movdqu `16*0`($inp),$inout0 # load input
  1711. movdqa $rndkey0,$twmask
  1712. movdqu `16*1`($inp),$inout1
  1713. pxor @tweak[0],$inout0 # input^=tweak^round[0]
  1714. movdqu `16*2`($inp),$inout2
  1715. pxor @tweak[1],$inout1
  1716. aesenc $rndkey1,$inout0
  1717. movdqu `16*3`($inp),$inout3
  1718. pxor @tweak[2],$inout2
  1719. aesenc $rndkey1,$inout1
  1720. movdqu `16*4`($inp),$inout4
  1721. pxor @tweak[3],$inout3
  1722. aesenc $rndkey1,$inout2
  1723. movdqu `16*5`($inp),$inout5
  1724. pxor @tweak[5],$twmask # round[0]^=tweak[5]
  1725. movdqa 0x60(%rsp),$twres # load round[0]^round[last]
  1726. pxor @tweak[4],$inout4
  1727. aesenc $rndkey1,$inout3
  1728. $movkey 32($key_),$rndkey0
  1729. lea `16*6`($inp),$inp
  1730. pxor $twmask,$inout5
  1731. pxor $twres,@tweak[0] # calclulate tweaks^round[last]
  1732. aesenc $rndkey1,$inout4
  1733. pxor $twres,@tweak[1]
  1734. movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
  1735. aesenc $rndkey1,$inout5
  1736. $movkey 48($key_),$rndkey1
  1737. pxor $twres,@tweak[2]
  1738. aesenc $rndkey0,$inout0
  1739. pxor $twres,@tweak[3]
  1740. movdqa @tweak[1],`16*1`(%rsp)
  1741. aesenc $rndkey0,$inout1
  1742. pxor $twres,@tweak[4]
  1743. movdqa @tweak[2],`16*2`(%rsp)
  1744. aesenc $rndkey0,$inout2
  1745. aesenc $rndkey0,$inout3
  1746. pxor $twres,$twmask
  1747. movdqa @tweak[4],`16*4`(%rsp)
  1748. aesenc $rndkey0,$inout4
  1749. aesenc $rndkey0,$inout5
  1750. $movkey 64($key_),$rndkey0
  1751. movdqa $twmask,`16*5`(%rsp)
  1752. pshufd \$0x5f,@tweak[5],$twres
  1753. jmp .Lxts_enc_loop6
  1754. .align 32
  1755. .Lxts_enc_loop6:
  1756. aesenc $rndkey1,$inout0
  1757. aesenc $rndkey1,$inout1
  1758. aesenc $rndkey1,$inout2
  1759. aesenc $rndkey1,$inout3
  1760. aesenc $rndkey1,$inout4
  1761. aesenc $rndkey1,$inout5
  1762. $movkey -64($key,%rax),$rndkey1
  1763. add \$32,%rax
  1764. aesenc $rndkey0,$inout0
  1765. aesenc $rndkey0,$inout1
  1766. aesenc $rndkey0,$inout2
  1767. aesenc $rndkey0,$inout3
  1768. aesenc $rndkey0,$inout4
  1769. aesenc $rndkey0,$inout5
  1770. $movkey -80($key,%rax),$rndkey0
  1771. jnz .Lxts_enc_loop6
  1772. movdqa (%r8),$twmask # start calculating next tweak
  1773. movdqa $twres,$twtmp
  1774. paddd $twres,$twres
  1775. aesenc $rndkey1,$inout0
  1776. paddq @tweak[5],@tweak[5]
  1777. psrad \$31,$twtmp
  1778. aesenc $rndkey1,$inout1
  1779. pand $twmask,$twtmp
  1780. $movkey ($key_),@tweak[0] # load round[0]
  1781. aesenc $rndkey1,$inout2
  1782. aesenc $rndkey1,$inout3
  1783. aesenc $rndkey1,$inout4
  1784. pxor $twtmp,@tweak[5]
  1785. movaps @tweak[0],@tweak[1] # copy round[0]
  1786. aesenc $rndkey1,$inout5
  1787. $movkey -64($key),$rndkey1
  1788. movdqa $twres,$twtmp
  1789. aesenc $rndkey0,$inout0
  1790. paddd $twres,$twres
  1791. pxor @tweak[5],@tweak[0]
  1792. aesenc $rndkey0,$inout1
  1793. psrad \$31,$twtmp
  1794. paddq @tweak[5],@tweak[5]
  1795. aesenc $rndkey0,$inout2
  1796. aesenc $rndkey0,$inout3
  1797. pand $twmask,$twtmp
  1798. movaps @tweak[1],@tweak[2]
  1799. aesenc $rndkey0,$inout4
  1800. pxor $twtmp,@tweak[5]
  1801. movdqa $twres,$twtmp
  1802. aesenc $rndkey0,$inout5
  1803. $movkey -48($key),$rndkey0
  1804. paddd $twres,$twres
  1805. aesenc $rndkey1,$inout0
  1806. pxor @tweak[5],@tweak[1]
  1807. psrad \$31,$twtmp
  1808. aesenc $rndkey1,$inout1
  1809. paddq @tweak[5],@tweak[5]
  1810. pand $twmask,$twtmp
  1811. aesenc $rndkey1,$inout2
  1812. aesenc $rndkey1,$inout3
  1813. movdqa @tweak[3],`16*3`(%rsp)
  1814. pxor $twtmp,@tweak[5]
  1815. aesenc $rndkey1,$inout4
  1816. movaps @tweak[2],@tweak[3]
  1817. movdqa $twres,$twtmp
  1818. aesenc $rndkey1,$inout5
  1819. $movkey -32($key),$rndkey1
  1820. paddd $twres,$twres
  1821. aesenc $rndkey0,$inout0
  1822. pxor @tweak[5],@tweak[2]
  1823. psrad \$31,$twtmp
  1824. aesenc $rndkey0,$inout1
  1825. paddq @tweak[5],@tweak[5]
  1826. pand $twmask,$twtmp
  1827. aesenc $rndkey0,$inout2
  1828. aesenc $rndkey0,$inout3
  1829. aesenc $rndkey0,$inout4
  1830. pxor $twtmp,@tweak[5]
  1831. movaps @tweak[3],@tweak[4]
  1832. aesenc $rndkey0,$inout5
  1833. movdqa $twres,$rndkey0
  1834. paddd $twres,$twres
  1835. aesenc $rndkey1,$inout0
  1836. pxor @tweak[5],@tweak[3]
  1837. psrad \$31,$rndkey0
  1838. aesenc $rndkey1,$inout1
  1839. paddq @tweak[5],@tweak[5]
  1840. pand $twmask,$rndkey0
  1841. aesenc $rndkey1,$inout2
  1842. aesenc $rndkey1,$inout3
  1843. pxor $rndkey0,@tweak[5]
  1844. $movkey ($key_),$rndkey0
  1845. aesenc $rndkey1,$inout4
  1846. aesenc $rndkey1,$inout5
  1847. $movkey 16($key_),$rndkey1
  1848. pxor @tweak[5],@tweak[4]
  1849. aesenclast `16*0`(%rsp),$inout0
  1850. psrad \$31,$twres
  1851. paddq @tweak[5],@tweak[5]
  1852. aesenclast `16*1`(%rsp),$inout1
  1853. aesenclast `16*2`(%rsp),$inout2
  1854. pand $twmask,$twres
  1855. mov %r10,%rax # restore $rounds
  1856. aesenclast `16*3`(%rsp),$inout3
  1857. aesenclast `16*4`(%rsp),$inout4
  1858. aesenclast `16*5`(%rsp),$inout5
  1859. pxor $twres,@tweak[5]
  1860. lea `16*6`($out),$out # $out+=6*16
  1861. movups $inout0,`-16*6`($out) # store 6 output blocks
  1862. movups $inout1,`-16*5`($out)
  1863. movups $inout2,`-16*4`($out)
  1864. movups $inout3,`-16*3`($out)
  1865. movups $inout4,`-16*2`($out)
  1866. movups $inout5,`-16*1`($out)
  1867. sub \$16*6,$len
  1868. jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
  1869. mov \$16+96,$rounds
  1870. sub $rnds_,$rounds
  1871. mov $key_,$key # restore $key
  1872. shr \$4,$rounds # restore original value
  1873. .Lxts_enc_short:
  1874. # at the point @tweak[0..5] are populated with tweak values
  1875. mov $rounds,$rnds_ # backup $rounds
  1876. pxor $rndkey0,@tweak[0]
  1877. add \$16*6,$len # restore real remaining $len
  1878. jz .Lxts_enc_done # done if ($len==0)
  1879. pxor $rndkey0,@tweak[1]
  1880. cmp \$0x20,$len
  1881. jb .Lxts_enc_one # $len is 1*16
  1882. pxor $rndkey0,@tweak[2]
  1883. je .Lxts_enc_two # $len is 2*16
  1884. pxor $rndkey0,@tweak[3]
  1885. cmp \$0x40,$len
  1886. jb .Lxts_enc_three # $len is 3*16
  1887. pxor $rndkey0,@tweak[4]
  1888. je .Lxts_enc_four # $len is 4*16
  1889. movdqu ($inp),$inout0 # $len is 5*16
  1890. movdqu 16*1($inp),$inout1
  1891. movdqu 16*2($inp),$inout2
  1892. pxor @tweak[0],$inout0
  1893. movdqu 16*3($inp),$inout3
  1894. pxor @tweak[1],$inout1
  1895. movdqu 16*4($inp),$inout4
  1896. lea 16*5($inp),$inp # $inp+=5*16
  1897. pxor @tweak[2],$inout2
  1898. pxor @tweak[3],$inout3
  1899. pxor @tweak[4],$inout4
  1900. pxor $inout5,$inout5
  1901. call _aesni_encrypt6
  1902. xorps @tweak[0],$inout0
  1903. movdqa @tweak[5],@tweak[0]
  1904. xorps @tweak[1],$inout1
  1905. xorps @tweak[2],$inout2
  1906. movdqu $inout0,($out) # store 5 output blocks
  1907. xorps @tweak[3],$inout3
  1908. movdqu $inout1,16*1($out)
  1909. xorps @tweak[4],$inout4
  1910. movdqu $inout2,16*2($out)
  1911. movdqu $inout3,16*3($out)
  1912. movdqu $inout4,16*4($out)
  1913. lea 16*5($out),$out # $out+=5*16
  1914. jmp .Lxts_enc_done
  1915. .align 16
  1916. .Lxts_enc_one:
  1917. movups ($inp),$inout0
  1918. lea 16*1($inp),$inp # inp+=1*16
  1919. xorps @tweak[0],$inout0
  1920. ___
  1921. &aesni_generate1("enc",$key,$rounds);
  1922. $code.=<<___;
  1923. xorps @tweak[0],$inout0
  1924. movdqa @tweak[1],@tweak[0]
  1925. movups $inout0,($out) # store one output block
  1926. lea 16*1($out),$out # $out+=1*16
  1927. jmp .Lxts_enc_done
  1928. .align 16
  1929. .Lxts_enc_two:
  1930. movups ($inp),$inout0
  1931. movups 16($inp),$inout1
  1932. lea 32($inp),$inp # $inp+=2*16
  1933. xorps @tweak[0],$inout0
  1934. xorps @tweak[1],$inout1
  1935. call _aesni_encrypt2
  1936. xorps @tweak[0],$inout0
  1937. movdqa @tweak[2],@tweak[0]
  1938. xorps @tweak[1],$inout1
  1939. movups $inout0,($out) # store 2 output blocks
  1940. movups $inout1,16*1($out)
  1941. lea 16*2($out),$out # $out+=2*16
  1942. jmp .Lxts_enc_done
  1943. .align 16
  1944. .Lxts_enc_three:
  1945. movups ($inp),$inout0
  1946. movups 16*1($inp),$inout1
  1947. movups 16*2($inp),$inout2
  1948. lea 16*3($inp),$inp # $inp+=3*16
  1949. xorps @tweak[0],$inout0
  1950. xorps @tweak[1],$inout1
  1951. xorps @tweak[2],$inout2
  1952. call _aesni_encrypt3
  1953. xorps @tweak[0],$inout0
  1954. movdqa @tweak[3],@tweak[0]
  1955. xorps @tweak[1],$inout1
  1956. xorps @tweak[2],$inout2
  1957. movups $inout0,($out) # store 3 output blocks
  1958. movups $inout1,16*1($out)
  1959. movups $inout2,16*2($out)
  1960. lea 16*3($out),$out # $out+=3*16
  1961. jmp .Lxts_enc_done
  1962. .align 16
  1963. .Lxts_enc_four:
  1964. movups ($inp),$inout0
  1965. movups 16*1($inp),$inout1
  1966. movups 16*2($inp),$inout2
  1967. xorps @tweak[0],$inout0
  1968. movups 16*3($inp),$inout3
  1969. lea 16*4($inp),$inp # $inp+=4*16
  1970. xorps @tweak[1],$inout1
  1971. xorps @tweak[2],$inout2
  1972. xorps @tweak[3],$inout3
  1973. call _aesni_encrypt4
  1974. pxor @tweak[0],$inout0
  1975. movdqa @tweak[4],@tweak[0]
  1976. pxor @tweak[1],$inout1
  1977. pxor @tweak[2],$inout2
  1978. movdqu $inout0,($out) # store 4 output blocks
  1979. pxor @tweak[3],$inout3
  1980. movdqu $inout1,16*1($out)
  1981. movdqu $inout2,16*2($out)
  1982. movdqu $inout3,16*3($out)
  1983. lea 16*4($out),$out # $out+=4*16
  1984. jmp .Lxts_enc_done
  1985. .align 16
  1986. .Lxts_enc_done:
  1987. and \$15,$len_ # see if $len%16 is 0
  1988. jz .Lxts_enc_ret
  1989. mov $len_,$len
  1990. .Lxts_enc_steal:
  1991. movzb ($inp),%eax # borrow $rounds ...
  1992. movzb -16($out),%ecx # ... and $key
  1993. lea 1($inp),$inp
  1994. mov %al,-16($out)
  1995. mov %cl,0($out)
  1996. lea 1($out),$out
  1997. sub \$1,$len
  1998. jnz .Lxts_enc_steal
  1999. sub $len_,$out # rewind $out
  2000. mov $key_,$key # restore $key
  2001. mov $rnds_,$rounds # restore $rounds
  2002. movups -16($out),$inout0
  2003. xorps @tweak[0],$inout0
  2004. ___
  2005. &aesni_generate1("enc",$key,$rounds);
  2006. $code.=<<___;
  2007. xorps @tweak[0],$inout0
  2008. movups $inout0,-16($out)
  2009. .Lxts_enc_ret:
  2010. xorps %xmm0,%xmm0 # clear register bank
  2011. pxor %xmm1,%xmm1
  2012. pxor %xmm2,%xmm2
  2013. pxor %xmm3,%xmm3
  2014. pxor %xmm4,%xmm4
  2015. pxor %xmm5,%xmm5
  2016. ___
  2017. $code.=<<___ if (!$win64);
  2018. pxor %xmm6,%xmm6
  2019. pxor %xmm7,%xmm7
  2020. movaps %xmm0,0x00(%rsp) # clear stack
  2021. pxor %xmm8,%xmm8
  2022. movaps %xmm0,0x10(%rsp)
  2023. pxor %xmm9,%xmm9
  2024. movaps %xmm0,0x20(%rsp)
  2025. pxor %xmm10,%xmm10
  2026. movaps %xmm0,0x30(%rsp)
  2027. pxor %xmm11,%xmm11
  2028. movaps %xmm0,0x40(%rsp)
  2029. pxor %xmm12,%xmm12
  2030. movaps %xmm0,0x50(%rsp)
  2031. pxor %xmm13,%xmm13
  2032. movaps %xmm0,0x60(%rsp)
  2033. pxor %xmm14,%xmm14
  2034. pxor %xmm15,%xmm15
  2035. ___
  2036. $code.=<<___ if ($win64);
  2037. movaps -0xa0(%rbp),%xmm6
  2038. movaps %xmm0,-0xa0(%rbp) # clear stack
  2039. movaps -0x90(%rbp),%xmm7
  2040. movaps %xmm0,-0x90(%rbp)
  2041. movaps -0x80(%rbp),%xmm8
  2042. movaps %xmm0,-0x80(%rbp)
  2043. movaps -0x70(%rbp),%xmm9
  2044. movaps %xmm0,-0x70(%rbp)
  2045. movaps -0x60(%rbp),%xmm10
  2046. movaps %xmm0,-0x60(%rbp)
  2047. movaps -0x50(%rbp),%xmm11
  2048. movaps %xmm0,-0x50(%rbp)
  2049. movaps -0x40(%rbp),%xmm12
  2050. movaps %xmm0,-0x40(%rbp)
  2051. movaps -0x30(%rbp),%xmm13
  2052. movaps %xmm0,-0x30(%rbp)
  2053. movaps -0x20(%rbp),%xmm14
  2054. movaps %xmm0,-0x20(%rbp)
  2055. movaps -0x10(%rbp),%xmm15
  2056. movaps %xmm0,-0x10(%rbp)
  2057. movaps %xmm0,0x00(%rsp)
  2058. movaps %xmm0,0x10(%rsp)
  2059. movaps %xmm0,0x20(%rsp)
  2060. movaps %xmm0,0x30(%rsp)
  2061. movaps %xmm0,0x40(%rsp)
  2062. movaps %xmm0,0x50(%rsp)
  2063. movaps %xmm0,0x60(%rsp)
  2064. ___
  2065. $code.=<<___;
  2066. lea (%rbp),%rsp
  2067. pop %rbp
  2068. .Lxts_enc_epilogue:
  2069. ret
  2070. .size aesni_xts_encrypt,.-aesni_xts_encrypt
  2071. ___
# Generator for aesni_xts_decrypt (6 arguments per \@function,6; usage below
# shows: input ptr, output ptr, byte length, data key $key/key1, tweak key
# $key2, and $ivp pointing at the 16-byte clear-text tweak).
#
# Layout of the code emitted here:
#   * prologue: align stack, (Win64) offload xmm6-15 to the red zone above
#     the frame;
#   * encrypt the clear-text tweak with $key2, then derive tweak[0..4] by
#     repeated doubling (psrad/pand/paddq against .Lxts_magic isolates the
#     carry into the low byte — the XTS multiply-by-x step);
#   * tweak values are pre-XORed with round[0] and the last-round key so the
#     whitening XOR folds into the first aesdec / final aesdeclast;
#   * .Lxts_dec_grandloop processes 6 blocks per iteration, interleaving the
#     next-iteration tweak computation with the AES rounds;
#   * .Lxts_dec_short handles the 1..5 remaining whole blocks;
#   * .Lxts_dec_steal implements ciphertext stealing for $len%16 != 0: the
#     last full block is decrypted with the *next* tweak (@tweak[1]) first,
#     bytes are swapped with the partial tail, then the reassembled block is
#     decrypted with @tweak[0];
#   * epilogue clears every xmm register and the stack copies of the tweak
#     material before returning (avoid leaking key/tweak data).
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
	lea	(%rsp),%rax
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%rax)	# offload everything
	movaps	%xmm7,-0x98(%rax)
	movaps	%xmm8,-0x88(%rax)
	movaps	%xmm9,-0x78(%rax)
	movaps	%xmm10,-0x68(%rax)
	movaps	%xmm11,-0x58(%rax)
	movaps	%xmm12,-0x48(%rax)
	movaps	%xmm13,-0x38(%rax)
	movaps	%xmm14,-0x28(%rax)
	movaps	%xmm15,-0x18(%rax)
.Lxts_dec_body:
___
$code.=<<___;
	lea	-8(%rax),%rbp		# frame pointer = original %rsp-8
	movups	($ivp),$inout0		# load clear-text tweak
	mov	240($key2),$rounds	# key2->rounds
	mov	240($key),$rnds_	# key1->rounds
___
# generate the tweak: ECB-encrypt the IV with the tweak key $key2
&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	xor	%eax,%eax		# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len		# reserve one block for ciphertext stealing

	$movkey	($key),$rndkey0		# zero round key
	mov	$key,$key_		# backup $key
	mov	$rnds_,$rounds		# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_		# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1	# round[0]^round[last], for aesdeclast folding
___
# Derive tweak[0..3] (each pre-XORed with round[0]) while advancing
# @tweak[5] by one multiply-by-x per iteration.
for ($i=0;$i<4;$i++) {
$code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp		# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
}
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)	# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short		# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key	# end of key schedule
	sub	%r10,%rax		# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10		# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	# 6-block bulk path: load+whiten inputs while the first two AES
	# rounds are already in flight.
	movdqu	`16*0`($inp),$inout0	# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0	# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	 aesdec		$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	 aesdec		$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	 aesdec		$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask	# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres	# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	 aesdec		$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
	 aesdec		$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)	# put aside tweaks^last round key
	 aesdec		$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	 aesdec		$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	 aesdec		$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	 aesdec		$rndkey0,$inout2
	 aesdec		$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	 aesdec		$rndkey0,$inout4
	 aesdec		$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	# two AES rounds per iteration; %rax counts up toward 0 (twisted
	# $rounds), so jnz exits after the key schedule is exhausted
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	# final rounds, interleaved with computation of the next iteration's
	# six tweak values (@tweak[0..4] and the advanced @tweak[5])
	movdqa	(%r8),$twmask		# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	 aesdec		$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	 aesdec		$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]	# load round[0]
	 aesdec		$rndkey1,$inout2
	 aesdec		$rndkey1,$inout3
	 aesdec		$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]	# copy round[0]
	 aesdec		$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	 aesdec		$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	 aesdec		$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	 aesdec		$rndkey0,$inout2
	 aesdec		$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	 aesdec		$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	 aesdec		$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	 aesdec		$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	 aesdec		$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	 aesdec		$rndkey1,$inout2
	 aesdec		$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	 aesdec		$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	 aesdec		$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	 aesdec		$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	 aesdec		$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	 aesdec		$rndkey0,$inout2
	 aesdec		$rndkey0,$inout3
	 aesdec		$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	 aesdec		$rndkey0,$inout5

	movdqa	$twres,$rndkey0		# $rndkey0 is reloaded below; reuse as scratch
	paddd	$twres,$twres
	 aesdec		$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	 aesdec		$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	 aesdec		$rndkey1,$inout2
	 aesdec		$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	 aesdec		$rndkey1,$inout4
	 aesdec		$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	pxor	@tweak[5],@tweak[4]
	 aesdeclast	`16*0`(%rsp),$inout0	# last round keys are tweak^round[last]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	 aesdeclast	`16*1`(%rsp),$inout1
	 aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax		# restore $rounds
	 aesdeclast	`16*3`(%rsp),$inout3
	 aesdeclast	`16*4`(%rsp),$inout4
	 aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out	# $out+=6*16
	movups	$inout0,`-16*6`($out)	# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop	# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key		# restore $key
	shr	\$4,$rounds		# restore original value

.Lxts_dec_short:
	# at the point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_		# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len		# restore real remaining $len
	jz	.Lxts_dec_done		# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one		# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two		# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three		# $len is 3*16
	je	.Lxts_dec_four		# $len is 4*16

	movdqu	($inp),$inout0		# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp		# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)		# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	 pxor		$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	 pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out		# $out+=5*16
	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	# partial tail remains: advance @tweak[5] once more into @tweak[1]
	# for the stealing path
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]	# psllq 1,$tweak
	pand	$twmask,@tweak[1]	# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp		# $inp+=1*16
	xorps	@tweak[0],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)		# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out		# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp		# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)		# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out		# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp		# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)		# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out		# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp		# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)		# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out		# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_		# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	# ciphertext stealing: decrypt the last full block with @tweak[1]
	# (the next tweak), then swap its tail bytes with the partial block
	mov	$len_,$len
	mov	$key_,$key		# restore $key
	mov	$rnds_,$rounds		# restore $rounds
	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)
.Lxts_dec_steal:
	movzb	16($inp),%eax		# borrow $rounds ...
	movzb	($out),%ecx		# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out		# rewind $out
	mov	$key_,$key		# restore $key
	mov	$rnds_,$rounds		# restore $rounds
	movups	($out),$inout0
	xorps	@tweak[0],$inout0	# reassembled block uses @tweak[0]
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)
.Lxts_dec_ret:
	# scrub all xmm registers and the stack area that held tweak/key
	# material before returning
	xorps	%xmm0,%xmm0		# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	# restore the nonvolatile xmm registers, clearing each save slot
	movaps	-0xa0(%rbp),%xmm6
	movaps	%xmm0,-0xa0(%rbp)	# clear stack
	movaps	-0x90(%rbp),%xmm7
	movaps	%xmm0,-0x90(%rbp)
	movaps	-0x80(%rbp),%xmm8
	movaps	%xmm0,-0x80(%rbp)
	movaps	-0x70(%rbp),%xmm9
	movaps	%xmm0,-0x70(%rbp)
	movaps	-0x60(%rbp),%xmm10
	movaps	%xmm0,-0x60(%rbp)
	movaps	-0x50(%rbp),%xmm11
	movaps	%xmm0,-0x50(%rbp)
	movaps	-0x40(%rbp),%xmm12
	movaps	%xmm0,-0x40(%rbp)
	movaps	-0x30(%rbp),%xmm13
	movaps	%xmm0,-0x30(%rbp)
	movaps	-0x20(%rbp),%xmm14
	movaps	%xmm0,-0x20(%rbp)
	movaps	-0x10(%rbp),%xmm15
	movaps	%xmm0,-0x10(%rbp)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
  2535. } }}
########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp, const int enc);
  2540. {
  2541. my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
  2542. my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
  2543. my $inp_=$key_;
  2544. $code.=<<___;
  2545. .globl ${PREFIX}_cbc_encrypt
  2546. .type ${PREFIX}_cbc_encrypt,\@function,6
  2547. .align 16
  2548. ${PREFIX}_cbc_encrypt:
  2549. test $len,$len # check length
  2550. jz .Lcbc_ret
  2551. mov 240($key),$rnds_ # key->rounds
  2552. mov $key,$key_ # backup $key
  2553. test %r9d,%r9d # 6th argument
  2554. jz .Lcbc_decrypt
  2555. #--------------------------- CBC ENCRYPT ------------------------------#
  2556. movups ($ivp),$inout0 # load iv as initial state
  2557. mov $rnds_,$rounds
  2558. cmp \$16,$len
  2559. jb .Lcbc_enc_tail
  2560. sub \$16,$len
  2561. jmp .Lcbc_enc_loop
  2562. .align 16
  2563. .Lcbc_enc_loop:
  2564. movups ($inp),$inout1 # load input
  2565. lea 16($inp),$inp
  2566. #xorps $inout1,$inout0
  2567. ___
  2568. &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
  2569. $code.=<<___;
  2570. mov $rnds_,$rounds # restore $rounds
  2571. mov $key_,$key # restore $key
  2572. movups $inout0,0($out) # store output
  2573. lea 16($out),$out
  2574. sub \$16,$len
  2575. jnc .Lcbc_enc_loop
  2576. add \$16,$len
  2577. jnz .Lcbc_enc_tail
  2578. pxor $rndkey0,$rndkey0 # clear register bank
  2579. pxor $rndkey1,$rndkey1
  2580. movups $inout0,($ivp)
  2581. pxor $inout0,$inout0
  2582. pxor $inout1,$inout1
  2583. jmp .Lcbc_ret
  2584. .Lcbc_enc_tail:
  2585. mov $len,%rcx # zaps $key
  2586. xchg $inp,$out # $inp is %rsi and $out is %rdi now
  2587. .long 0x9066A4F3 # rep movsb
  2588. mov \$16,%ecx # zero tail
  2589. sub $len,%rcx
  2590. xor %eax,%eax
  2591. .long 0x9066AAF3 # rep stosb
  2592. lea -16(%rdi),%rdi # rewind $out by 1 block
  2593. mov $rnds_,$rounds # restore $rounds
  2594. mov %rdi,%rsi # $inp and $out are the same
  2595. mov $key_,$key # restore $key
  2596. xor $len,$len # len=16
  2597. jmp .Lcbc_enc_loop # one more spin
  2598. #--------------------------- CBC DECRYPT ------------------------------#
  2599. .align 16
  2600. .Lcbc_decrypt:
  2601. cmp \$16,$len
  2602. jne .Lcbc_decrypt_bulk
  2603. # handle single block without allocating stack frame,
  2604. # useful in ciphertext stealing mode
  2605. movdqu ($inp),$inout0 # load input
  2606. movdqu ($ivp),$inout1 # load iv
  2607. movdqa $inout0,$inout2 # future iv
  2608. ___
  2609. &aesni_generate1("dec",$key,$rnds_);
  2610. $code.=<<___;
  2611. pxor $rndkey0,$rndkey0 # clear register bank
  2612. pxor $rndkey1,$rndkey1
  2613. movdqu $inout2,($ivp) # store iv
  2614. xorps $inout1,$inout0 # ^=iv
  2615. pxor $inout1,$inout1
  2616. movups $inout0,($out) # store output
  2617. pxor $inout0,$inout0
  2618. jmp .Lcbc_ret
  2619. .align 16
  2620. .Lcbc_decrypt_bulk:
  2621. lea (%rsp),%rax
  2622. push %rbp
  2623. sub \$$frame_size,%rsp
  2624. and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
  2625. ___
  2626. $code.=<<___ if ($win64);
  2627. movaps %xmm6,0x10(%rsp)
  2628. movaps %xmm7,0x20(%rsp)
  2629. movaps %xmm8,0x30(%rsp)
  2630. movaps %xmm9,0x40(%rsp)
  2631. movaps %xmm10,0x50(%rsp)
  2632. movaps %xmm11,0x60(%rsp)
  2633. movaps %xmm12,0x70(%rsp)
  2634. movaps %xmm13,0x80(%rsp)
  2635. movaps %xmm14,0x90(%rsp)
  2636. movaps %xmm15,0xa0(%rsp)
  2637. .Lcbc_decrypt_body:
  2638. ___
  2639. $code.=<<___;
  2640. lea -8(%rax),%rbp
  2641. movups ($ivp),$iv
  2642. mov $rnds_,$rounds
  2643. cmp \$0x50,$len
  2644. jbe .Lcbc_dec_tail
  2645. $movkey ($key),$rndkey0
  2646. movdqu 0x00($inp),$inout0 # load input
  2647. movdqu 0x10($inp),$inout1
  2648. movdqa $inout0,$in0
  2649. movdqu 0x20($inp),$inout2
  2650. movdqa $inout1,$in1
  2651. movdqu 0x30($inp),$inout3
  2652. movdqa $inout2,$in2
  2653. movdqu 0x40($inp),$inout4
  2654. movdqa $inout3,$in3
  2655. movdqu 0x50($inp),$inout5
  2656. movdqa $inout4,$in4
  2657. mov OPENSSL_ia32cap_P+4(%rip),%r9d
  2658. cmp \$0x70,$len
  2659. jbe .Lcbc_dec_six_or_seven
  2660. and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
  2661. sub \$0x50,$len # $len is biased by -5*16
  2662. cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
  2663. je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
  2664. sub \$0x20,$len # $len is biased by -7*16
  2665. lea 0x70($key),$key # size optimization
  2666. jmp .Lcbc_dec_loop8_enter
  2667. .align 16
  2668. .Lcbc_dec_loop8:
  2669. movups $inout7,($out)
  2670. lea 0x10($out),$out
  2671. .Lcbc_dec_loop8_enter:
  2672. movdqu 0x60($inp),$inout6
  2673. pxor $rndkey0,$inout0
  2674. movdqu 0x70($inp),$inout7
  2675. pxor $rndkey0,$inout1
  2676. $movkey 0x10-0x70($key),$rndkey1
  2677. pxor $rndkey0,$inout2
  2678. xor $inp_,$inp_
  2679. cmp \$0x70,$len # is there at least 0x60 bytes ahead?
  2680. pxor $rndkey0,$inout3
  2681. pxor $rndkey0,$inout4
  2682. pxor $rndkey0,$inout5
  2683. pxor $rndkey0,$inout6
  2684. aesdec $rndkey1,$inout0
  2685. pxor $rndkey0,$inout7
  2686. $movkey 0x20-0x70($key),$rndkey0
  2687. aesdec $rndkey1,$inout1
  2688. aesdec $rndkey1,$inout2
  2689. aesdec $rndkey1,$inout3
  2690. aesdec $rndkey1,$inout4
  2691. aesdec $rndkey1,$inout5
  2692. aesdec $rndkey1,$inout6
  2693. setnc ${inp_}b
  2694. shl \$7,$inp_
  2695. aesdec $rndkey1,$inout7
  2696. add $inp,$inp_
  2697. $movkey 0x30-0x70($key),$rndkey1
  2698. ___
# Emit rounds 1..11 of the 8-way interleaved CBC-decrypt pipeline.  Round
# keys alternate between $rndkey0 and $rndkey1 so the next key load overlaps
# with aesdec latency.  At $i==7/9/11 the 128-/192-/256-bit schedules run
# out and control transfers to .Lcbc_dec_done (conditionally for 7 and 9,
# unconditionally after the last emitted round).
for($i=1;$i<12;$i++) {
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
$code.=<<___ if ($i==7);
	cmp \$11,$rounds
___
$code.=<<___;
	aesdec $rndkeyx,$inout0
	aesdec $rndkeyx,$inout1
	aesdec $rndkeyx,$inout2
	aesdec $rndkeyx,$inout3
	aesdec $rndkeyx,$inout4
	aesdec $rndkeyx,$inout5
	aesdec $rndkeyx,$inout6
	aesdec $rndkeyx,$inout7
	$movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
___
# nop padding keeps instruction alignment of the round bodies comparable
# across the different key sizes.
$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
	nop
___
$code.=<<___ if ($i==7);
	jb .Lcbc_dec_done
___
$code.=<<___ if ($i==9);
	je .Lcbc_dec_done
___
$code.=<<___ if ($i==11);
	jmp .Lcbc_dec_done
___
}
# Final round: aesdeclast is fed the previous ciphertext block (pre-xored
# with round key 0), which performs the CBC chaining xor for free.  Output
# of eight blocks is interleaved with loading the next eight inputs via the
# speculation-safe pointer $inp_ computed earlier.
$code.=<<___;
.align 16
.Lcbc_dec_done:
	aesdec $rndkey1,$inout0
	aesdec $rndkey1,$inout1
	pxor $rndkey0,$iv
	pxor $rndkey0,$in0
	aesdec $rndkey1,$inout2
	aesdec $rndkey1,$inout3
	pxor $rndkey0,$in1
	pxor $rndkey0,$in2
	aesdec $rndkey1,$inout4
	aesdec $rndkey1,$inout5
	pxor $rndkey0,$in3
	pxor $rndkey0,$in4
	aesdec $rndkey1,$inout6
	aesdec $rndkey1,$inout7
	movdqu 0x50($inp),$rndkey1
	aesdeclast $iv,$inout0
	movdqu 0x60($inp),$iv # borrow $iv
	pxor $rndkey0,$rndkey1
	aesdeclast $in0,$inout1
	pxor $rndkey0,$iv
	movdqu 0x70($inp),$rndkey0 # next IV
	aesdeclast $in1,$inout2
	lea 0x80($inp),$inp
	movdqu 0x00($inp_),$in0
	aesdeclast $in2,$inout3
	aesdeclast $in3,$inout4
	movdqu 0x10($inp_),$in1
	movdqu 0x20($inp_),$in2
	aesdeclast $in4,$inout5
	aesdeclast $rndkey1,$inout6
	movdqu 0x30($inp_),$in3
	movdqu 0x40($inp_),$in4
	aesdeclast $iv,$inout7
	movdqa $rndkey0,$iv # return $iv
	movdqu 0x50($inp_),$rndkey1
	$movkey -0x70($key),$rndkey0
	movups $inout0,($out) # store output
	movdqa $in0,$inout0
	movups $inout1,0x10($out)
	movdqa $in1,$inout1
	movups $inout2,0x20($out)
	movdqa $in2,$inout2
	movups $inout3,0x30($out)
	movdqa $in3,$inout3
	movups $inout4,0x40($out)
	movdqa $in4,$inout4
	movups $inout5,0x50($out)
	movdqa $rndkey1,$inout5
	movups $inout6,0x60($out)
	lea 0x70($out),$out
	sub \$0x80,$len
	ja .Lcbc_dec_loop8
	movaps $inout7,$inout0
	lea -0x70($key),$key
	add \$0x70,$len
	jle .Lcbc_dec_clear_tail_collected
	movups $inout7,($out)
	lea 0x10($out),$out
	cmp \$0x50,$len
	jbe .Lcbc_dec_tail
	movaps $in0,$inout0
.Lcbc_dec_six_or_seven:
	cmp \$0x60,$len
	ja .Lcbc_dec_seven
	movaps $inout5,$inout6
	call _aesni_decrypt6
	pxor $iv,$inout0 # ^= IV
	movaps $inout6,$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	pxor $in1,$inout2
	movdqu $inout1,0x10($out)
	pxor $inout1,$inout1 # clear register bank
	pxor $in2,$inout3
	movdqu $inout2,0x20($out)
	pxor $inout2,$inout2
	pxor $in3,$inout4
	movdqu $inout3,0x30($out)
	pxor $inout3,$inout3
	pxor $in4,$inout5
	movdqu $inout4,0x40($out)
	pxor $inout4,$inout4
	lea 0x50($out),$out
	movdqa $inout5,$inout0
	pxor $inout5,$inout5
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_seven:
	movups 0x60($inp),$inout6
	xorps $inout7,$inout7
	call _aesni_decrypt8
	movups 0x50($inp),$inout7
	pxor $iv,$inout0 # ^= IV
	movups 0x60($inp),$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	pxor $in1,$inout2
	movdqu $inout1,0x10($out)
	pxor $inout1,$inout1 # clear register bank
	pxor $in2,$inout3
	movdqu $inout2,0x20($out)
	pxor $inout2,$inout2
	pxor $in3,$inout4
	movdqu $inout3,0x30($out)
	pxor $inout3,$inout3
	pxor $in4,$inout5
	movdqu $inout4,0x40($out)
	pxor $inout4,$inout4
	pxor $inout7,$inout6
	movdqu $inout5,0x50($out)
	pxor $inout5,$inout5
	lea 0x60($out),$out
	movdqa $inout6,$inout0
	pxor $inout6,$inout6
	pxor $inout7,$inout7
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_loop6:
	movups $inout5,($out)
	lea 0x10($out),$out
	movdqu 0x00($inp),$inout0 # load input
	movdqu 0x10($inp),$inout1
	movdqa $inout0,$in0
	movdqu 0x20($inp),$inout2
	movdqa $inout1,$in1
	movdqu 0x30($inp),$inout3
	movdqa $inout2,$in2
	movdqu 0x40($inp),$inout4
	movdqa $inout3,$in3
	movdqu 0x50($inp),$inout5
	movdqa $inout4,$in4
.Lcbc_dec_loop6_enter:
	lea 0x60($inp),$inp
	movdqa $inout5,$inout6
	call _aesni_decrypt6
	pxor $iv,$inout0 # ^= IV
	movdqa $inout6,$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	pxor $in1,$inout2
	movdqu $inout1,0x10($out)
	pxor $in2,$inout3
	movdqu $inout2,0x20($out)
	pxor $in3,$inout4
	mov $key_,$key
	movdqu $inout3,0x30($out)
	pxor $in4,$inout5
	mov $rnds_,$rounds
	movdqu $inout4,0x40($out)
	lea 0x50($out),$out
	sub \$0x60,$len
	ja .Lcbc_dec_loop6
	movdqa $inout5,$inout0
	add \$0x50,$len
	jle .Lcbc_dec_clear_tail_collected
	movups $inout5,($out)
	lea 0x10($out),$out
.Lcbc_dec_tail:
	movups ($inp),$inout0
	sub \$0x10,$len
	jbe .Lcbc_dec_one # $len is 1*16 or less
	movups 0x10($inp),$inout1
	movaps $inout0,$in0
	sub \$0x10,$len
	jbe .Lcbc_dec_two # $len is 2*16 or less
	movups 0x20($inp),$inout2
	movaps $inout1,$in1
	sub \$0x10,$len
	jbe .Lcbc_dec_three # $len is 3*16 or less
	movups 0x30($inp),$inout3
	movaps $inout2,$in2
	sub \$0x10,$len
	jbe .Lcbc_dec_four # $len is 4*16 or less
	movups 0x40($inp),$inout4 # $len is 5*16 or less
	movaps $inout3,$in3
	movaps $inout4,$in4
	xorps $inout5,$inout5
	call _aesni_decrypt6
	pxor $iv,$inout0
	movaps $in4,$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	pxor $in1,$inout2
	movdqu $inout1,0x10($out)
	pxor $inout1,$inout1 # clear register bank
	pxor $in2,$inout3
	movdqu $inout2,0x20($out)
	pxor $inout2,$inout2
	pxor $in3,$inout4
	movdqu $inout3,0x30($out)
	pxor $inout3,$inout3
	lea 0x40($out),$out
	movdqa $inout4,$inout0
	pxor $inout4,$inout4
	pxor $inout5,$inout5
	sub \$0x10,$len
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_one:
	movaps $inout0,$in0
___
# Single-block path reuses the generic one-block decrypt generator.
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps $iv,$inout0
	movaps $in0,$iv
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
	movaps $inout1,$in1
	call _aesni_decrypt2
	pxor $iv,$inout0
	movaps $in1,$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	movdqa $inout1,$inout0
	pxor $inout1,$inout1 # clear register bank
	lea 0x10($out),$out
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
	movaps $inout2,$in2
	call _aesni_decrypt3
	pxor $iv,$inout0
	movaps $in2,$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	pxor $in1,$inout2
	movdqu $inout1,0x10($out)
	pxor $inout1,$inout1 # clear register bank
	movdqa $inout2,$inout0
	pxor $inout2,$inout2
	lea 0x20($out),$out
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
	movaps $inout3,$in3
	call _aesni_decrypt4
	pxor $iv,$inout0
	movaps $in3,$iv
	pxor $in0,$inout1
	movdqu $inout0,($out)
	pxor $in1,$inout2
	movdqu $inout1,0x10($out)
	pxor $inout1,$inout1 # clear register bank
	pxor $in2,$inout3
	movdqu $inout2,0x20($out)
	pxor $inout2,$inout2
	movdqa $inout3,$inout0
	pxor $inout3,$inout3
	lea 0x30($out),$out
	jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_clear_tail_collected:
	pxor $inout1,$inout1 # clear register bank
	pxor $inout2,$inout2
	pxor $inout3,$inout3
___
# On Win64 %xmm6..15 are restored from the stack below, which overwrites
# them anyway; elsewhere they must be scrubbed explicitly.
$code.=<<___ if (!$win64);
	pxor $inout4,$inout4 # %xmm6..9
	pxor $inout5,$inout5
	pxor $inout6,$inout6
	pxor $inout7,$inout7
___
$code.=<<___;
.Lcbc_dec_tail_collected:
	movups $iv,($ivp)
	and \$15,$len
	jnz .Lcbc_dec_tail_partial
	movups $inout0,($out)
	pxor $inout0,$inout0
	jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
	movaps $inout0,(%rsp)
	pxor $inout0,$inout0
	mov \$16,%rcx
	mov $out,%rdi
	sub $len,%rcx
	lea (%rsp),%rsi
	.long 0x9066A4F3 # rep movsb
	movdqa $inout0,(%rsp)
.Lcbc_dec_ret:
	xorps $rndkey0,$rndkey0 # %xmm0
	pxor $rndkey1,$rndkey1
___
# Win64 epilogue: restore the callee-saved %xmm registers and scrub the
# save area so no key/plaintext material is left on the stack.
$code.=<<___ if ($win64);
	movaps 0x10(%rsp),%xmm6
	movaps %xmm0,0x10(%rsp) # clear stack
	movaps 0x20(%rsp),%xmm7
	movaps %xmm0,0x20(%rsp)
	movaps 0x30(%rsp),%xmm8
	movaps %xmm0,0x30(%rsp)
	movaps 0x40(%rsp),%xmm9
	movaps %xmm0,0x40(%rsp)
	movaps 0x50(%rsp),%xmm10
	movaps %xmm0,0x50(%rsp)
	movaps 0x60(%rsp),%xmm11
	movaps %xmm0,0x60(%rsp)
	movaps 0x70(%rsp),%xmm12
	movaps %xmm0,0x70(%rsp)
	movaps 0x80(%rsp),%xmm13
	movaps %xmm0,0x80(%rsp)
	movaps 0x90(%rsp),%xmm14
	movaps %xmm0,0x90(%rsp)
	movaps 0xa0(%rsp),%xmm15
	movaps %xmm0,0xa0(%rsp)
___
$code.=<<___;
	lea (%rbp),%rsp
	pop %rbp
.Lcbc_ret:
	ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
  3045. }
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
#				int bits, AES_KEY *key)
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
# Builds the encryption schedule first, then reverses the round-key order
# in place and applies aesimc (InvMixColumns) to the inner keys, as
# required by the aesdec "equivalent inverse cipher" data flow.
{ my ($inp,$bits,$key) = @_4args;
$bits =~ s/%r/%e/;
$code.=<<___;
.globl ${PREFIX}_set_decrypt_key
.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_decrypt_key:
	.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
	call __aesni_set_encrypt_key
	shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
	test %eax,%eax
	jnz .Ldec_key_ret
	lea 16($key,$bits),$inp # points at the end of key schedule
	$movkey ($key),%xmm0 # just swap
	$movkey ($inp),%xmm1
	$movkey %xmm0,($inp)
	$movkey %xmm1,($key)
	lea 16($key),$key
	lea -16($inp),$inp
.Ldec_key_inverse:
	$movkey ($key),%xmm0 # swap and inverse
	$movkey ($inp),%xmm1
	aesimc %xmm0,%xmm0
	aesimc %xmm1,%xmm1
	lea 16($key),$key
	lea -16($inp),$inp
	$movkey %xmm0,16($inp)
	$movkey %xmm1,-16($key)
	cmp $key,$inp
	ja .Ldec_key_inverse
	$movkey ($key),%xmm0 # inverse middle
	aesimc %xmm0,%xmm0
	pxor %xmm1,%xmm1
	$movkey %xmm0,($inp)
	pxor %xmm0,%xmm0
.Ldec_key_ret:
	add \$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___
# This is based on submission by
#
# Huang Ying <ying.huang@intel.com>
# Vinodh Gopal <vinodh.gopal@intel.com>
# Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
# For each key size there are two code paths: the classic
# aeskeygenassist-based expansion, and an "_alt" constant-time variant
# (built from pshufb+aesenclast) selected when the CPU reports AVX but
# not XOP, where aeskeygenassist is comparatively slow.
$code.=<<___;
.globl ${PREFIX}_set_encrypt_key
.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
	.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
	mov \$-1,%rax
	test $inp,$inp
	jz .Lenc_key_ret
	test $key,$key
	jz .Lenc_key_ret
	mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
	movups ($inp),%xmm0 # pull first 128 bits of *userKey
	xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
	and OPENSSL_ia32cap_P+4(%rip),%r10d
	lea 16($key),%rax # %rax is used as modifiable copy of $key
	cmp \$256,$bits
	je .L14rounds
	cmp \$192,$bits
	je .L12rounds
	cmp \$128,$bits
	jne .Lbad_keybits
.L10rounds:
	mov \$9,$bits # 10 rounds for 128-bit key
	cmp \$`1<<28`,%r10d # AVX, bit no XOP
	je .L10rounds_alt
	$movkey %xmm0,($key) # round 0
	aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
	call .Lkey_expansion_128_cold
	aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
	call .Lkey_expansion_128
	aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
	call .Lkey_expansion_128
	aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
	call .Lkey_expansion_128
	aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
	call .Lkey_expansion_128
	aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
	call .Lkey_expansion_128
	aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
	call .Lkey_expansion_128
	aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
	call .Lkey_expansion_128
	aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
	call .Lkey_expansion_128
	aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
	call .Lkey_expansion_128
	$movkey %xmm0,(%rax)
	mov $bits,80(%rax) # 240(%rdx)
	xor %eax,%eax
	jmp .Lenc_key_ret
.align 16
.L10rounds_alt:
	movdqa .Lkey_rotate(%rip),%xmm5
	mov \$8,%r10d
	movdqa .Lkey_rcon1(%rip),%xmm4
	movdqa %xmm0,%xmm2
	movdqu %xmm0,($key)
	jmp .Loop_key128
.align 16
.Loop_key128:
	pshufb %xmm5,%xmm0
	aesenclast %xmm4,%xmm0
	pslld \$1,%xmm4
	lea 16(%rax),%rax
	movdqa %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm3,%xmm2
	pxor %xmm2,%xmm0
	movdqu %xmm0,-16(%rax)
	movdqa %xmm0,%xmm2
	dec %r10d
	jnz .Loop_key128
	movdqa .Lkey_rcon1b(%rip),%xmm4
	pshufb %xmm5,%xmm0
	aesenclast %xmm4,%xmm0
	pslld \$1,%xmm4
	movdqa %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm3,%xmm2
	pxor %xmm2,%xmm0
	movdqu %xmm0,(%rax)
	movdqa %xmm0,%xmm2
	pshufb %xmm5,%xmm0
	aesenclast %xmm4,%xmm0
	movdqa %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm2,%xmm3
	pslldq \$4,%xmm2
	pxor %xmm3,%xmm2
	pxor %xmm2,%xmm0
	movdqu %xmm0,16(%rax)
	mov $bits,96(%rax) # 240($key)
	xor %eax,%eax
	jmp .Lenc_key_ret
.align 16
.L12rounds:
	movq 16($inp),%xmm2 # remaining 1/3 of *userKey
	mov \$11,$bits # 12 rounds for 192
	cmp \$`1<<28`,%r10d # AVX, but no XOP
	je .L12rounds_alt
	$movkey %xmm0,($key) # round 0
	aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
	call .Lkey_expansion_192a_cold
	aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
	call .Lkey_expansion_192b
	aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
	call .Lkey_expansion_192a
	aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
	call .Lkey_expansion_192b
	aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
	call .Lkey_expansion_192a
	aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
	call .Lkey_expansion_192b
	aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
	call .Lkey_expansion_192a
	aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
	call .Lkey_expansion_192b
	$movkey %xmm0,(%rax)
	mov $bits,48(%rax) # 240(%rdx)
	xor %rax, %rax
	jmp .Lenc_key_ret
.align 16
.L12rounds_alt:
	movdqa .Lkey_rotate192(%rip),%xmm5
	movdqa .Lkey_rcon1(%rip),%xmm4
	mov \$8,%r10d
	movdqu %xmm0,($key)
	jmp .Loop_key192
.align 16
.Loop_key192:
	movq %xmm2,0(%rax)
	movdqa %xmm2,%xmm1
	pshufb %xmm5,%xmm2
	aesenclast %xmm4,%xmm2
	pslld \$1, %xmm4
	lea 24(%rax),%rax
	movdqa %xmm0,%xmm3
	pslldq \$4,%xmm0
	pxor %xmm0,%xmm3
	pslldq \$4,%xmm0
	pxor %xmm0,%xmm3
	pslldq \$4,%xmm0
	pxor %xmm3,%xmm0
	pshufd \$0xff,%xmm0,%xmm3
	pxor %xmm1,%xmm3
	pslldq \$4,%xmm1
	pxor %xmm1,%xmm3
	pxor %xmm2,%xmm0
	pxor %xmm3,%xmm2
	movdqu %xmm0,-16(%rax)
	dec %r10d
	jnz .Loop_key192
	mov $bits,32(%rax) # 240($key)
	xor %eax,%eax
	jmp .Lenc_key_ret
.align 16
.L14rounds:
	movups 16($inp),%xmm2 # remaning half of *userKey
	mov \$13,$bits # 14 rounds for 256
	lea 16(%rax),%rax
	cmp \$`1<<28`,%r10d # AVX, but no XOP
	je .L14rounds_alt
	$movkey %xmm0,($key) # round 0
	$movkey %xmm2,16($key) # round 1
	aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
	call .Lkey_expansion_256a_cold
	aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
	call .Lkey_expansion_256b
	aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
	call .Lkey_expansion_256a
	aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
	call .Lkey_expansion_256b
	aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
	call .Lkey_expansion_256a
	aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
	call .Lkey_expansion_256b
	aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
	call .Lkey_expansion_256a
	aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
	call .Lkey_expansion_256b
	aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
	call .Lkey_expansion_256a
	aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
	call .Lkey_expansion_256b
	aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
	call .Lkey_expansion_256a
	aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
	call .Lkey_expansion_256b
	aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
	call .Lkey_expansion_256a
	$movkey %xmm0,(%rax)
	mov $bits,16(%rax) # 240(%rdx)
	xor %rax,%rax
	jmp .Lenc_key_ret
.align 16
.L14rounds_alt:
	movdqa .Lkey_rotate(%rip),%xmm5
	movdqa .Lkey_rcon1(%rip),%xmm4
	mov \$7,%r10d
	movdqu %xmm0,0($key)
	movdqa %xmm2,%xmm1
	movdqu %xmm2,16($key)
	jmp .Loop_key256
.align 16
.Loop_key256:
	pshufb %xmm5,%xmm2
	aesenclast %xmm4,%xmm2
	movdqa %xmm0,%xmm3
	pslldq \$4,%xmm0
	pxor %xmm0,%xmm3
	pslldq \$4,%xmm0
	pxor %xmm0,%xmm3
	pslldq \$4,%xmm0
	pxor %xmm3,%xmm0
	pslld \$1,%xmm4
	pxor %xmm2,%xmm0
	movdqu %xmm0,(%rax)
	dec %r10d
	jz .Ldone_key256
	pshufd \$0xff,%xmm0,%xmm2
	pxor %xmm3,%xmm3
	aesenclast %xmm3,%xmm2
	movdqa %xmm1,%xmm3
	pslldq \$4,%xmm1
	pxor %xmm1,%xmm3
	pslldq \$4,%xmm1
	pxor %xmm1,%xmm3
	pslldq \$4,%xmm1
	pxor %xmm3,%xmm1
	pxor %xmm1,%xmm2
	movdqu %xmm2,16(%rax)
	lea 32(%rax),%rax
	movdqa %xmm2,%xmm1
	jmp .Loop_key256
.Ldone_key256:
	mov $bits,16(%rax) # 240($key)
	xor %eax,%eax
	jmp .Lenc_key_ret
.align 16
.Lbad_keybits:
	mov \$-2,%rax
.Lenc_key_ret:
	pxor %xmm0,%xmm0
	pxor %xmm1,%xmm1
	pxor %xmm2,%xmm2
	pxor %xmm3,%xmm3
	pxor %xmm4,%xmm4
	pxor %xmm5,%xmm5
	add \$8,%rsp
	ret
.LSEH_end_set_encrypt_key:
.align 16
.Lkey_expansion_128:
	$movkey %xmm0,(%rax)
	lea 16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps \$0b00010000,%xmm0,%xmm4
	xorps %xmm4, %xmm0
	shufps \$0b10001100,%xmm0,%xmm4
	xorps %xmm4, %xmm0
	shufps \$0b11111111,%xmm1,%xmm1 # critical path
	xorps %xmm1,%xmm0
	ret
.align 16
.Lkey_expansion_192a:
	$movkey %xmm0,(%rax)
	lea 16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps %xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps \$0b00010000,%xmm0,%xmm4
	movdqa %xmm2,%xmm3
	xorps %xmm4,%xmm0
	shufps \$0b10001100,%xmm0,%xmm4
	pslldq \$4,%xmm3
	xorps %xmm4,%xmm0
	pshufd \$0b01010101,%xmm1,%xmm1 # critical path
	pxor %xmm3,%xmm2
	pxor %xmm1,%xmm0
	pshufd \$0b11111111,%xmm0,%xmm3
	pxor %xmm3,%xmm2
	ret
.align 16
.Lkey_expansion_192b:
	movaps %xmm0,%xmm3
	shufps \$0b01000100,%xmm0,%xmm5
	$movkey %xmm5,(%rax)
	shufps \$0b01001110,%xmm2,%xmm3
	$movkey %xmm3,16(%rax)
	lea 32(%rax),%rax
	jmp .Lkey_expansion_192b_warm
.align 16
.Lkey_expansion_256a:
	$movkey %xmm2,(%rax)
	lea 16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps \$0b00010000,%xmm0,%xmm4
	xorps %xmm4,%xmm0
	shufps \$0b10001100,%xmm0,%xmm4
	xorps %xmm4,%xmm0
	shufps \$0b11111111,%xmm1,%xmm1 # critical path
	xorps %xmm1,%xmm0
	ret
.align 16
.Lkey_expansion_256b:
	$movkey %xmm0,(%rax)
	lea 16(%rax),%rax
	shufps \$0b00010000,%xmm2,%xmm4
	xorps %xmm4,%xmm2
	shufps \$0b10001100,%xmm2,%xmm4
	xorps %xmm4,%xmm2
	shufps \$0b10101010,%xmm1,%xmm1 # critical path
	xorps %xmm1,%xmm2
	ret
.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}
# Read-only constant tables shared by all modes: byte-swap mask and counter
# increments (CTR), the XTS tweak polynomial, and the pshufb rotate masks /
# round constants used by the "_alt" key-schedule paths above.
$code.=<<___;
.align 64
.Lbswap_mask:
	.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long 6,6,6,0
.Lincrement64:
	.long 1,0,0,0
.Lxts_magic:
	.long 0x87,0,1,0
.Lincrement1:
	.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long 0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long 1,1,1,1
.Lkey_rcon1b:
	.long 0x1b,0x1b,0x1b,0x1b
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception-handling support: custom unwind handlers that
# restore the non-volatile %xmm registers and stack pointer if an exception
# hits inside one of the routines, plus the .pdata/.xdata tables binding
# each routine's address range to its handler.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
___
# Handler for the frame layout used by ECB and CCM64 (xmm save area at the
# bottom of the frame, fixed 0x58-byte adjustment).
$code.=<<___ if ($PREFIX eq "aesni");
.type ecb_ccm64_se_handler,\@abi-omnipotent
.align 16
ecb_ccm64_se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail
	lea 0(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # &context.Xmm6
	mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq
	lea 0x58(%rax),%rax # adjust stack pointer
	jmp .Lcommon_seh_tail
.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
ctr_xts_se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue lable
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail
	mov 160($context),%rax # pull context->Rbp
	lea -0xa0(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # & context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq
	jmp .Lcommon_rbp_tail
.size ctr_xts_se_handler,.-ctr_xts_se_handler
___
# CBC handler is emitted unconditionally; it also hosts the shared
# .Lcommon_rbp_tail/.Lcommon_seh_tail code the other handlers jump into.
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
.align 16
cbc_se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 152($context),%rax # pull context->Rsp
	mov 248($context),%rbx # pull context->Rip
	lea .Lcbc_decrypt_bulk(%rip),%r10
	cmp %r10,%rbx # context->Rip<"prologue" label
	jb .Lcommon_seh_tail
	lea .Lcbc_decrypt_body(%rip),%r10
	cmp %r10,%rbx # context->Rip<cbc_decrypt_body
	jb .Lrestore_cbc_rax
	lea .Lcbc_ret(%rip),%r10
	cmp %r10,%rbx # context->Rip>="epilogue" label
	jae .Lcommon_seh_tail
	lea 16(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq
.Lcommon_rbp_tail:
	mov 160($context),%rax # pull context->Rbp
	mov (%rax),%rbp # restore saved %rbp
	lea 8(%rax),%rax # adjust stack pointer
	mov %rbp,160($context) # restore context->Rbp
	jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
	mov 120($context),%rax
.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size cbc_se_handler,.-cbc_se_handler
.section .pdata
.align 4
___
# Function tables: begin/end/unwind-info triplets for every exported
# routine (the bulk-mode entries only exist for the "aesni" build).
$code.=<<___ if ($PREFIX eq "aesni");
	.rva .LSEH_begin_aesni_ecb_encrypt
	.rva .LSEH_end_aesni_ecb_encrypt
	.rva .LSEH_info_ecb
	.rva .LSEH_begin_aesni_ccm64_encrypt_blocks
	.rva .LSEH_end_aesni_ccm64_encrypt_blocks
	.rva .LSEH_info_ccm64_enc
	.rva .LSEH_begin_aesni_ccm64_decrypt_blocks
	.rva .LSEH_end_aesni_ccm64_decrypt_blocks
	.rva .LSEH_info_ccm64_dec
	.rva .LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva .LSEH_end_aesni_ctr32_encrypt_blocks
	.rva .LSEH_info_ctr32
	.rva .LSEH_begin_aesni_xts_encrypt
	.rva .LSEH_end_aesni_xts_encrypt
	.rva .LSEH_info_xts_enc
	.rva .LSEH_begin_aesni_xts_decrypt
	.rva .LSEH_end_aesni_xts_decrypt
	.rva .LSEH_info_xts_dec
___
$code.=<<___;
	.rva .LSEH_begin_${PREFIX}_cbc_encrypt
	.rva .LSEH_end_${PREFIX}_cbc_encrypt
	.rva .LSEH_info_cbc
	.rva ${PREFIX}_set_decrypt_key
	.rva .LSEH_end_set_decrypt_key
	.rva .LSEH_info_key
	.rva ${PREFIX}_set_encrypt_key
	.rva .LSEH_end_set_encrypt_key
	.rva .LSEH_info_key
.section .xdata
.align 8
___
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte 9,0,0,0
	.rva ecb_ccm64_se_handler
	.rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
.LSEH_info_ccm64_enc:
	.byte 9,0,0,0
	.rva ecb_ccm64_se_handler
	.rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
.LSEH_info_ccm64_dec:
	.byte 9,0,0,0
	.rva ecb_ccm64_se_handler
	.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
	.byte 9,0,0,0
	.rva ctr_xts_se_handler
	.rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
	.byte 9,0,0,0
	.rva ctr_xts_se_handler
	.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
	.byte 9,0,0,0
	.rva ctr_xts_se_handler
	.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
	.byte 9,0,0,0
	.rva cbc_se_handler
.LSEH_info_key:
	.byte 0x01,0x04,0x01,0x00
	.byte 0x04,0x02,0x00,0x00 # sub rsp,8
___
}
  3696. sub rex {
  3697. local *opcode=shift;
  3698. my ($dst,$src)=@_;
  3699. my $rex=0;
  3700. $rex|=0x04 if($dst>=8);
  3701. $rex|=0x01 if($src>=8);
  3702. push @opcode,$rex|0x40 if($rex);
  3703. }
  3704. sub aesni {
  3705. my $line=shift;
  3706. my @opcode=(0x66);
  3707. if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  3708. rex(\@opcode,$4,$3);
  3709. push @opcode,0x0f,0x3a,0xdf;
  3710. push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
  3711. my $c=$2;
  3712. push @opcode,$c=~/^0/?oct($c):$c;
  3713. return ".byte\t".join(',',@opcode);
  3714. }
  3715. elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  3716. my %opcodelet = (
  3717. "aesimc" => 0xdb,
  3718. "aesenc" => 0xdc, "aesenclast" => 0xdd,
  3719. "aesdec" => 0xde, "aesdeclast" => 0xdf
  3720. );
  3721. return undef if (!defined($opcodelet{$1}));
  3722. rex(\@opcode,$3,$2);
  3723. push @opcode,0x0f,0x38,$opcodelet{$1};
  3724. push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
  3725. return ".byte\t".join(',',@opcode);
  3726. }
  3727. elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
  3728. my %opcodelet = (
  3729. "aesenc" => 0xdc, "aesenclast" => 0xdd,
  3730. "aesdec" => 0xde, "aesdeclast" => 0xdf
  3731. );
  3732. return undef if (!defined($opcodelet{$1}));
  3733. my $off = $2;
  3734. push @opcode,0x44 if ($3>=8);
  3735. push @opcode,0x0f,0x38,$opcodelet{$1};
  3736. push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
  3737. push @opcode,($off=~/^0/?oct($off):$off)&0xff;
  3738. return ".byte\t".join(',',@opcode);
  3739. }
  3740. return $line;
  3741. }
  3742. sub movbe {
  3743. ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
  3744. }
# Post-process the accumulated listing before emitting it:
# 1) constant-fold the `...` compile-time arithmetic,
# 2) rewrite AES-NI mnemonics as .byte sequences via aesni() above,
# 3) likewise encode movbe stores for old assemblers.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
print $code;
close STDOUT;