#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL's GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitched
# implementation had been observed to perform significantly better
# than the sum of the components on contemporary CPUs, the effort was
# deemed impossible to justify. This module is based on a combination
# of Intel submissions, [1] and [2], with a MOVBE twist suggested by
# Ilya Albrekht and Max Locktyukhin of Intel Corp., who verified that
# it reduces shuffle pressure with a notable relative improvement,
# achieving 1.0 cycle per byte processed with a 128-bit key on
# Haswell processors.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
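#
# Conceptually the stitch interleaves the two primitives roughly as
# sketched below (an illustrative outline of the schedule, not the
# exact instruction order emitted by this module):
#
#	for each 6-block chunk:
#		for each AES round key:
#			vaesenc the round key into all six counter blocks
#			overlap a vpclmulqdq/vpxor step of GHASH over the
#			previous chunk's six blocks
#		xor the resulting keystream with the input; the ciphertext
#		becomes the GHASH input for the next iteration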
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
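
# The script is driven the usual perlasm way: the output "flavour"
# (e.g. elf, macosx, mingw64, nasm, masm) comes first and the output
# file last, or the output file alone. A typical invocation
# (illustrative):
#
#	perl aesni-gcm-x86_64.pl elf aesni-gcm-x86_64.S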
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler so that AVX code is emitted only when the
# toolchain can actually assemble it.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe the generated code through the perlasm translator for the
# requested flavour.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
if ($avx>1) {{{

($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

$code=<<___;
.text

.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_ghash_6x:
	vmovdqu 0x20($const),$T2	# borrow $T2, .Lone_msb
	sub \$6,$len
	vpxor $Z0,$Z0,$Z0		# $Z0 = 0
	vmovdqu 0x00-0x80($key),$rndkey
	vpaddb $T2,$T1,$inout1
	vpaddb $T2,$inout1,$inout2
	vpaddb $T2,$inout2,$inout3
	vpaddb $T2,$inout3,$inout4
	vpaddb $T2,$inout4,$inout5
	vpxor $rndkey,$T1,$inout0
	vmovdqu $Z0,16+8(%rsp)		# "$Z3" = 0
	jmp .Loop6x

.align 32
.Loop6x:
	add \$`6<<24`,$counter
	jc .Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu 0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpaddb $T2,$inout5,$T1		# next counter value
	vpxor $rndkey,$inout1,$inout1
	vpxor $rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu $T1,($ivp)		# save next counter value
	vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
	vpxor $rndkey,$inout3,$inout3
	vmovups 0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
	xor %r12,%r12
	cmp $in0,$end0

	vaesenc $T2,$inout0,$inout0
	vmovdqu 0x30+8(%rsp),$Ii	# I[4]
	vpxor $rndkey,$inout4,$inout4
	vpclmulqdq \$0x00,$Hkey,$Z3,$T1
	vaesenc $T2,$inout1,$inout1
	vpxor $rndkey,$inout5,$inout5
	setnc %r12b
	vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
	vaesenc $T2,$inout2,$inout2
	vmovdqu 0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg %r12
	vaesenc $T2,$inout3,$inout3
	vpxor $Z1,$Z2,$Z2
	vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
	vpxor $Z0,$Xi,$Xi		# modulo-scheduled
	vaesenc $T2,$inout4,$inout4
	vpxor $Z1,$T1,$Z0
	and \$0x60,%r12
	vmovups 0x20-0x80($key),$rndkey
	vpclmulqdq \$0x10,$Hkey,$Ii,$T1
	vaesenc $T2,$inout5,$inout5

	vpclmulqdq \$0x01,$Hkey,$Ii,$T2
	lea ($in0,%r12),$in0
	vaesenc $rndkey,$inout0,$inout0
	vpxor 16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
	vmovdqu 0x40+8(%rsp),$Ii	# I[3]
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x58($in0),%r13
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x50($in0),%r12
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x20+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x28+8(%rsp)
	vmovdqu 0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x30-0x80($key),$rndkey
	vpxor $T1,$Z2,$Z2
	vpclmulqdq \$0x00,$Z1,$Ii,$T1
	vaesenc $rndkey,$inout0,$inout0
	vpxor $T2,$Z2,$Z2
	vpclmulqdq \$0x10,$Z1,$Ii,$T2
	vaesenc $rndkey,$inout1,$inout1
	vpxor $Hkey,$Z3,$Z3
	vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
	vaesenc $rndkey,$inout2,$inout2
	vpclmulqdq \$0x11,$Z1,$Ii,$Z1
	vmovdqu 0x50+8(%rsp),$Ii	# I[2]
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vpxor $T1,$Z0,$Z0
	vmovdqu 0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x40-0x80($key),$rndkey
	vpxor $T2,$Z2,$Z2
	vpclmulqdq \$0x00,$T1,$Ii,$T2
	vaesenc $rndkey,$inout0,$inout0
	vpxor $Hkey,$Z2,$Z2
	vpclmulqdq \$0x10,$T1,$Ii,$Hkey
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x48($in0),%r13
	vpxor $Z1,$Z3,$Z3
	vpclmulqdq \$0x01,$T1,$Ii,$Z1
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x40($in0),%r12
	vpclmulqdq \$0x11,$T1,$Ii,$T1
	vmovdqu 0x60+8(%rsp),$Ii	# I[1]
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x30+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x38+8(%rsp)
	vpxor $T2,$Z0,$Z0
	vmovdqu 0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x50-0x80($key),$rndkey
	vpxor $Hkey,$Z2,$Z2
	vpclmulqdq \$0x00,$T2,$Ii,$Hkey
	vaesenc $rndkey,$inout0,$inout0
	vpxor $Z1,$Z2,$Z2
	vpclmulqdq \$0x10,$T2,$Ii,$Z1
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x38($in0),%r13
	vpxor $T1,$Z3,$Z3
	vpclmulqdq \$0x01,$T2,$Ii,$T1
	vpxor 0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x30($in0),%r12
	vpclmulqdq \$0x11,$T2,$Ii,$T2
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x40+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x48+8(%rsp)
	vpxor $Hkey,$Z0,$Z0
	vmovdqu 0x70-0x20($Xip),$Hkey	# $Hkey^6
	vaesenc $rndkey,$inout5,$inout5

	vmovups 0x60-0x80($key),$rndkey
	vpxor $Z1,$Z2,$Z2
	vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
	vaesenc $rndkey,$inout0,$inout0
	vpxor $T1,$Z2,$Z2
	vpclmulqdq \$0x01,$Hkey,$Xi,$T1
	vaesenc $rndkey,$inout1,$inout1
	movbe 0x28($in0),%r13
	vpxor $T2,$Z3,$Z3
	vpclmulqdq \$0x00,$Hkey,$Xi,$T2
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x20($in0),%r12
	vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
	vaesenc $rndkey,$inout3,$inout3
	mov %r13,0x50+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	mov %r12,0x58+8(%rsp)
	vpxor $Z1,$Z2,$Z2
	vaesenc $rndkey,$inout5,$inout5
	vpxor $T1,$Z2,$Z2

	vmovups 0x70-0x80($key),$rndkey
	vpslldq \$8,$Z2,$Z1
	vpxor $T2,$Z0,$Z0
	vmovdqu 0x10($const),$Hkey	# .Lpoly
	vaesenc $rndkey,$inout0,$inout0
	vpxor $Xi,$Z3,$Z3
	vaesenc $rndkey,$inout1,$inout1
	vpxor $Z1,$Z0,$Z0
	movbe 0x18($in0),%r13
	vaesenc $rndkey,$inout2,$inout2
	movbe 0x10($in0),%r12
	vpalignr \$8,$Z0,$Z0,$Ii	# 1st phase
	vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
	mov %r13,0x60+8(%rsp)
	vaesenc $rndkey,$inout3,$inout3
	mov %r12,0x68+8(%rsp)
	vaesenc $rndkey,$inout4,$inout4
	vmovups 0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	vaesenc $rndkey,$inout5,$inout5

	vaesenc $T1,$inout0,$inout0
	vmovups 0x90-0x80($key),$rndkey
	vaesenc $T1,$inout1,$inout1
	vpsrldq \$8,$Z2,$Z2
	vaesenc $T1,$inout2,$inout2
	vpxor $Z2,$Z3,$Z3
	vaesenc $T1,$inout3,$inout3
	vpxor $Ii,$Z0,$Z0
	movbe 0x08($in0),%r13
	vaesenc $T1,$inout4,$inout4
	movbe 0x00($in0),%r12
	vaesenc $T1,$inout5,$inout5
	vmovups 0xa0-0x80($key),$T1
	cmp \$11,$rounds
	jb .Lenc_tail			# 128-bit key

	vaesenc $rndkey,$inout0,$inout0
	vaesenc $rndkey,$inout1,$inout1
	vaesenc $rndkey,$inout2,$inout2
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vaesenc $rndkey,$inout5,$inout5

	vaesenc $T1,$inout0,$inout0
	vaesenc $T1,$inout1,$inout1
	vaesenc $T1,$inout2,$inout2
	vaesenc $T1,$inout3,$inout3
	vaesenc $T1,$inout4,$inout4
	vmovups 0xb0-0x80($key),$rndkey
	vaesenc $T1,$inout5,$inout5
	vmovups 0xc0-0x80($key),$T1
	je .Lenc_tail			# 192-bit key

	vaesenc $rndkey,$inout0,$inout0
	vaesenc $rndkey,$inout1,$inout1
	vaesenc $rndkey,$inout2,$inout2
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vaesenc $rndkey,$inout5,$inout5

	vaesenc $T1,$inout0,$inout0
	vaesenc $T1,$inout1,$inout1
	vaesenc $T1,$inout2,$inout2
	vaesenc $T1,$inout3,$inout3
	vaesenc $T1,$inout4,$inout4
	vmovups 0xd0-0x80($key),$rndkey
	vaesenc $T1,$inout5,$inout5
	vmovups 0xe0-0x80($key),$T1
	jmp .Lenc_tail			# 256-bit key

.align 32
.Lhandle_ctr32:
	vmovdqu ($const),$Ii		# borrow $Ii for .Lbswap_mask
	vpshufb $Ii,$T1,$Z2		# byte-swap counter
	vmovdqu 0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd 0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd $Z1,$Z2,$inout2
	vmovdqu 0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpaddd $Z1,$inout1,$inout3
	vpshufb $Ii,$inout1,$inout1
	vpaddd $Z1,$inout2,$inout4
	vpshufb $Ii,$inout2,$inout2
	vpxor $rndkey,$inout1,$inout1
	vpaddd $Z1,$inout3,$inout5
	vpshufb $Ii,$inout3,$inout3
	vpxor $rndkey,$inout2,$inout2
	vpaddd $Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb $Ii,$inout4,$inout4
	vpshufb $Ii,$inout5,$inout5
	vpshufb $Ii,$T1,$T1		# next counter value
	jmp .Lresume_ctr32

.align 32
.Lenc_tail:
	vaesenc $rndkey,$inout0,$inout0
	vmovdqu $Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr \$8,$Z0,$Z0,$Xi	# 2nd phase
	vaesenc $rndkey,$inout1,$inout1
	vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
	vpxor 0x00($inp),$T1,$T2
	vaesenc $rndkey,$inout2,$inout2
	vpxor 0x10($inp),$T1,$Ii
	vaesenc $rndkey,$inout3,$inout3
	vpxor 0x20($inp),$T1,$Z1
	vaesenc $rndkey,$inout4,$inout4
	vpxor 0x30($inp),$T1,$Z2
	vaesenc $rndkey,$inout5,$inout5
	vpxor 0x40($inp),$T1,$Z3
	vpxor 0x50($inp),$T1,$Hkey
	vmovdqu ($ivp),$T1		# load next counter value

	vaesenclast $T2,$inout0,$inout0
	vmovdqu 0x20($const),$T2	# borrow $T2, .Lone_msb
	vaesenclast $Ii,$inout1,$inout1
	vpaddb $T2,$T1,$Ii
	mov %r13,0x70+8(%rsp)
	lea 0x60($inp),$inp
	vaesenclast $Z1,$inout2,$inout2
	vpaddb $T2,$Ii,$Z1
	mov %r12,0x78+8(%rsp)
	lea 0x60($out),$out
	vmovdqu 0x00-0x80($key),$rndkey
	vaesenclast $Z2,$inout3,$inout3
	vpaddb $T2,$Z1,$Z2
	vaesenclast $Z3,$inout4,$inout4
	vpaddb $T2,$Z2,$Z3
	vaesenclast $Hkey,$inout5,$inout5
	vpaddb $T2,$Z3,$Hkey

	add \$0x60,$ret
	sub \$0x6,$len
	jc .L6x_done

	vmovups $inout0,-0x60($out)	# save output
	vpxor $rndkey,$T1,$inout0
	vmovups $inout1,-0x50($out)
	vmovdqa $Ii,$inout1		# 0 latency
	vmovups $inout2,-0x40($out)
	vmovdqa $Z1,$inout2		# 0 latency
	vmovups $inout3,-0x30($out)
	vmovdqa $Z2,$inout3		# 0 latency
	vmovups $inout4,-0x20($out)
	vmovdqa $Z3,$inout4		# 0 latency
	vmovups $inout5,-0x10($out)
	vmovdqa $Hkey,$inout5		# 0 latency
	vmovdqu 0x20+8(%rsp),$Z3	# I[5]
	jmp .Loop6x

.L6x_done:
	vpxor 16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor $Z0,$Xi,$Xi		# modulo-scheduled

	ret
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
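
# A note on the scratch frame used by _aesni_ctr32_ghash_6x (a reading
# of the loads and stores above; offsets are into the caller's
# 128-byte-aligned stack frame):
#
#	16+8(%rsp)		pending $Z3 term, folded into $Xi one
#				iteration late (modulo-scheduling)
#	0x20+8(%rsp)		I[5], the byte-swapped 6th input block
#	0x30+8..0x70+8(%rsp)	I[4]..I[0], the remaining byte-swapped
#				blocks, stored with MOVBE and consumed
#				by GHASH on the following iteration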
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
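#
# A hypothetical C-level caller mirroring the prototype above (names
# are illustrative; the real callers live in OpenSSL's GCM glue and
# finish the tail with the generic code paths):
#
#	size_t done = aesni_gcm_decrypt(in, out, len, key, iv, &gcm->Xi);
#	/* done <= len and is a multiple of 96 (0x60); the caller
#	   processes the remaining len - done bytes by other means */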
$code.=<<___;
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,\@function,6
.align 32
aesni_gcm_decrypt:
	xor $ret,$ret
	cmp \$0x60,$len			# minimal accepted length
	jb .Lgcm_dec_abort

	lea (%rsp),%rax			# save stack pointer
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,-0xd8(%rax)
	movaps %xmm7,-0xc8(%rax)
	movaps %xmm8,-0xb8(%rax)
	movaps %xmm9,-0xa8(%rax)
	movaps %xmm10,-0x98(%rax)
	movaps %xmm11,-0x88(%rax)
	movaps %xmm12,-0x78(%rax)
	movaps %xmm13,-0x68(%rax)
	movaps %xmm14,-0x58(%rax)
	movaps %xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu ($ivp),$T1		# input counter value
	add \$-128,%rsp
	mov 12($ivp),$counter
	lea .Lbswap_mask(%rip),$const
	lea -0x80($key),$in0		# borrow $in0
	mov \$0xf80,$end0		# borrow $end0
	vmovdqu ($Xip),$Xi		# load Xi
	and \$-128,%rsp			# ensure stack alignment
	vmovdqu ($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea 0x80($key),$key		# size optimization
	lea 0x20+0x20($Xip),$Xip	# size optimization
	mov 0xf0-0x80($key),$rounds
	vpshufb $Ii,$Xi,$Xi

	and $end0,$in0
	and %rsp,$end0
	sub $in0,$end0
	jc .Ldec_no_key_aliasing
	cmp \$768,$end0
	jnc .Ldec_no_key_aliasing
	sub $end0,%rsp			# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu 0x50($inp),$Z3		# I[5]
	lea ($inp),$in0
	vmovdqu 0x40($inp),$Z0
	lea -0xc0($inp,$len),$end0
	vmovdqu 0x30($inp),$Z1
	shr \$4,$len
	xor $ret,$ret
	vmovdqu 0x20($inp),$Z2
	vpshufb $Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu 0x10($inp),$T2
	vpshufb $Ii,$Z0,$Z0
	vmovdqu ($inp),$Hkey
	vpshufb $Ii,$Z1,$Z1
	vmovdqu $Z0,0x30(%rsp)
	vpshufb $Ii,$Z2,$Z2
	vmovdqu $Z1,0x40(%rsp)
	vpshufb $Ii,$T2,$T2
	vmovdqu $Z2,0x50(%rsp)
	vpshufb $Ii,$Hkey,$Hkey
	vmovdqu $T2,0x60(%rsp)
	vmovdqu $Hkey,0x70(%rsp)

	call _aesni_ctr32_ghash_6x

	vmovups $inout0,-0x60($out)	# save output
	vmovups $inout1,-0x50($out)
	vmovups $inout2,-0x40($out)
	vmovups $inout3,-0x30($out)
	vmovups $inout4,-0x20($out)
	vmovups $inout5,-0x10($out)

	vpshufb ($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu $Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps -0xd8(%rax),%xmm6
	movaps -0xc8(%rax),%xmm7
	movaps -0xb8(%rax),%xmm8
	movaps -0xa8(%rax),%xmm9
	movaps -0x98(%rax),%xmm10
	movaps -0x88(%rax),%xmm11
	movaps -0x78(%rax),%xmm12
	movaps -0x68(%rax),%xmm13
	movaps -0x58(%rax),%xmm14
	movaps -0x48(%rax),%xmm15
___
$code.=<<___;
	mov -48(%rax),%r15
	mov -40(%rax),%r14
	mov -32(%rax),%r13
	mov -24(%rax),%r12
	mov -16(%rax),%rbp
	mov -8(%rax),%rbx
	lea (%rax),%rsp			# restore %rsp
.Lgcm_dec_abort:
	mov $ret,%rax			# return value

	ret
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
.type _aesni_ctr32_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_6x:
	vmovdqu 0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu 0x20($const),$T2	# borrow $T2, .Lone_msb
	lea -1($rounds),%r13
	vmovups 0x10-0x80($key),$rndkey
	lea 0x20-0x80($key),%r12
	vpxor $Z0,$T1,$inout0
	add \$`6<<24`,$counter
	jc .Lhandle_ctr32_2
	vpaddb $T2,$T1,$inout1
	vpaddb $T2,$inout1,$inout2
	vpxor $Z0,$inout1,$inout1
	vpaddb $T2,$inout2,$inout3
	vpxor $Z0,$inout2,$inout2
	vpaddb $T2,$inout3,$inout4
	vpxor $Z0,$inout3,$inout3
	vpaddb $T2,$inout4,$inout5
	vpxor $Z0,$inout4,$inout4
	vpaddb $T2,$inout5,$T1
	vpxor $Z0,$inout5,$inout5
	jmp .Loop_ctr32

.align 16
.Loop_ctr32:
	vaesenc $rndkey,$inout0,$inout0
	vaesenc $rndkey,$inout1,$inout1
	vaesenc $rndkey,$inout2,$inout2
	vaesenc $rndkey,$inout3,$inout3
	vaesenc $rndkey,$inout4,$inout4
	vaesenc $rndkey,$inout5,$inout5
	vmovups (%r12),$rndkey
	lea 0x10(%r12),%r12
	dec %r13d
	jnz .Loop_ctr32

	vmovdqu (%r12),$Hkey		# last round key
	vaesenc $rndkey,$inout0,$inout0
	vpxor 0x00($inp),$Hkey,$Z0
	vaesenc $rndkey,$inout1,$inout1
	vpxor 0x10($inp),$Hkey,$Z1
	vaesenc $rndkey,$inout2,$inout2
	vpxor 0x20($inp),$Hkey,$Z2
	vaesenc $rndkey,$inout3,$inout3
	vpxor 0x30($inp),$Hkey,$Xi
	vaesenc $rndkey,$inout4,$inout4
	vpxor 0x40($inp),$Hkey,$T2
	vaesenc $rndkey,$inout5,$inout5
	vpxor 0x50($inp),$Hkey,$Hkey
	lea 0x60($inp),$inp

	vaesenclast $Z0,$inout0,$inout0
	vaesenclast $Z1,$inout1,$inout1
	vaesenclast $Z2,$inout2,$inout2
	vaesenclast $Xi,$inout3,$inout3
	vaesenclast $T2,$inout4,$inout4
	vaesenclast $Hkey,$inout5,$inout5
	vmovups $inout0,0x00($out)
	vmovups $inout1,0x10($out)
	vmovups $inout2,0x20($out)
	vmovups $inout3,0x30($out)
	vmovups $inout4,0x40($out)
	vmovups $inout5,0x50($out)
	lea 0x60($out),$out

	ret
.align 32
.Lhandle_ctr32_2:
	vpshufb $Ii,$T1,$Z2		# byte-swap counter
	vmovdqu 0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd 0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd $Z1,$Z2,$inout2
	vpaddd $Z1,$inout1,$inout3
	vpshufb $Ii,$inout1,$inout1
	vpaddd $Z1,$inout2,$inout4
	vpshufb $Ii,$inout2,$inout2
	vpxor $Z0,$inout1,$inout1
	vpaddd $Z1,$inout3,$inout5
	vpshufb $Ii,$inout3,$inout3
	vpxor $Z0,$inout2,$inout2
	vpaddd $Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb $Ii,$inout4,$inout4
	vpxor $Z0,$inout3,$inout3
	vpshufb $Ii,$inout5,$inout5
	vpxor $Z0,$inout4,$inout4
	vpshufb $Ii,$T1,$T1		# next counter value
	vpxor $Z0,$inout5,$inout5
	jmp .Loop_ctr32
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
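
# aesni_gcm_encrypt requires at least 3x6 blocks (0x60*3 bytes). A
# reading of the control flow below: two plain 6-block CTR chunks are
# encrypted first via _aesni_ctr32_6x to give GHASH ciphertext to
# work on, then the stitched _aesni_ctr32_ghash_6x loop runs with
# GHASH one chunk behind encryption, and the remaining chunks are
# hashed by the straight-line GHASH code after the loop.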
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,\@function,6
.align 32
aesni_gcm_encrypt:
	xor $ret,$ret
	cmp \$0x60*3,$len		# minimal accepted length
	jb .Lgcm_enc_abort

	lea (%rsp),%rax			# save stack pointer
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
___
$code.=<<___ if ($win64);
	lea -0xa8(%rsp),%rsp
	movaps %xmm6,-0xd8(%rax)
	movaps %xmm7,-0xc8(%rax)
	movaps %xmm8,-0xb8(%rax)
	movaps %xmm9,-0xa8(%rax)
	movaps %xmm10,-0x98(%rax)
	movaps %xmm11,-0x88(%rax)
	movaps %xmm12,-0x78(%rax)
	movaps %xmm13,-0x68(%rax)
	movaps %xmm14,-0x58(%rax)
	movaps %xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu ($ivp),$T1		# input counter value
	add \$-128,%rsp
	mov 12($ivp),$counter
	lea .Lbswap_mask(%rip),$const
	lea -0x80($key),$in0		# borrow $in0
	mov \$0xf80,$end0		# borrow $end0
	lea 0x80($key),$key		# size optimization
	vmovdqu ($const),$Ii		# borrow $Ii for .Lbswap_mask
	and \$-128,%rsp			# ensure stack alignment
	mov 0xf0-0x80($key),$rounds

	and $end0,$in0
	and %rsp,$end0
	sub $in0,$end0
	jc .Lenc_no_key_aliasing
	cmp \$768,$end0
	jnc .Lenc_no_key_aliasing
	sub $end0,%rsp			# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea ($out),$in0
	lea -0xc0($out,$len),$end0
	shr \$4,$len

	call _aesni_ctr32_6x
	vpshufb $Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb $Ii,$inout1,$T2
	vmovdqu $Xi,0x70(%rsp)
	vpshufb $Ii,$inout2,$Z0
	vmovdqu $T2,0x60(%rsp)
	vpshufb $Ii,$inout3,$Z1
	vmovdqu $Z0,0x50(%rsp)
	vpshufb $Ii,$inout4,$Z2
	vmovdqu $Z1,0x40(%rsp)
	vpshufb $Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu $Z2,0x30(%rsp)
	call _aesni_ctr32_6x

	vmovdqu ($Xip),$Xi		# load Xi
	lea 0x20+0x20($Xip),$Xip	# size optimization
	sub \$12,$len
	mov \$0x60*2,$ret
	vpshufb $Ii,$Xi,$Xi

	call _aesni_ctr32_ghash_6x
	vmovdqu 0x20(%rsp),$Z3		# I[5]
	vmovdqu ($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu 0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq $Z3,$Z3,$T1
	vmovdqu 0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	vmovups $inout0,-0x60($out)	# save output
	vpshufb $Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor $Z3,$T1,$T1
	vmovups $inout1,-0x50($out)
	vpshufb $Ii,$inout1,$inout1
	vmovups $inout2,-0x40($out)
	vpshufb $Ii,$inout2,$inout2
	vmovups $inout3,-0x30($out)
	vpshufb $Ii,$inout3,$inout3
	vmovups $inout4,-0x20($out)
	vpshufb $Ii,$inout4,$inout4
	vmovups $inout5,-0x10($out)
	vpshufb $Ii,$inout5,$inout5
	vmovdqu $inout0,0x10(%rsp)	# free $inout0
___
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	vmovdqu 0x30(%rsp),$Z2		# I[4]
	vmovdqu 0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	vpunpckhqdq $Z2,$Z2,$T2
	vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
	vpxor $Z2,$T2,$T2
	vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq \$0x00,$HK,$T1,$T1

	vmovdqu 0x40(%rsp),$T3		# I[3]
	vpclmulqdq \$0x00,$Ii,$Z2,$Z0
	vmovdqu 0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor $Z1,$Z0,$Z0
	vpunpckhqdq $T3,$T3,$Z1
	vpclmulqdq \$0x11,$Ii,$Z2,$Z2
	vpxor $T3,$Z1,$Z1
	vpxor $Z3,$Z2,$Z2
	vpclmulqdq \$0x10,$HK,$T2,$T2
	vmovdqu 0x50-0x20($Xip),$HK
	vpxor $T1,$T2,$T2

	vmovdqu 0x50(%rsp),$T1		# I[2]
	vpclmulqdq \$0x00,$Hkey,$T3,$Z3
	vmovdqu 0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor $Z0,$Z3,$Z3
	vpunpckhqdq $T1,$T1,$Z0
	vpclmulqdq \$0x11,$Hkey,$T3,$T3
	vpxor $T1,$Z0,$Z0
	vpxor $Z2,$T3,$T3
	vpclmulqdq \$0x00,$HK,$Z1,$Z1
	vpxor $T2,$Z1,$Z1

	vmovdqu 0x60(%rsp),$T2		# I[1]
	vpclmulqdq \$0x00,$Ii,$T1,$Z2
	vmovdqu 0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor $Z3,$Z2,$Z2
	vpunpckhqdq $T2,$T2,$Z3
	vpclmulqdq \$0x11,$Ii,$T1,$T1
	vpxor $T2,$Z3,$Z3
	vpxor $T3,$T1,$T1
	vpclmulqdq \$0x10,$HK,$Z0,$Z0
	vmovdqu 0x80-0x20($Xip),$HK
	vpxor $Z1,$Z0,$Z0

	vpxor 0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq \$0x00,$Hkey,$T2,$Z1
	vmovdqu 0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpunpckhqdq $Xi,$Xi,$T3
	vpxor $Z2,$Z1,$Z1
	vpclmulqdq \$0x11,$Hkey,$T2,$T2
	vpxor $Xi,$T3,$T3
	vpxor $T1,$T2,$T2
	vpclmulqdq \$0x00,$HK,$Z3,$Z3
	vpxor $Z0,$Z3,$Z0

	vpclmulqdq \$0x00,$Ii,$Xi,$Z2
	vmovdqu 0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq $inout5,$inout5,$T1
	vpclmulqdq \$0x11,$Ii,$Xi,$Xi
	vpxor $inout5,$T1,$T1
	vpxor $Z1,$Z2,$Z1
	vpclmulqdq \$0x10,$HK,$T3,$T3
	vmovdqu 0x20-0x20($Xip),$HK
	vpxor $T2,$Xi,$Z3
	vpxor $Z0,$T3,$Z2

	vmovdqu 0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	vpxor $Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
	vpxor $T3,$Z2,$Z2
	vpunpckhqdq $inout4,$inout4,$T2
	vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
	vpxor $inout4,$T2,$T2
	vpslldq \$8,$Z2,$T3
	vpclmulqdq \$0x00,$HK,$T1,$T1
	vpxor $T3,$Z1,$Xi
	vpsrldq \$8,$Z2,$Z2
	vpxor $Z2,$Z3,$Z3

	vpclmulqdq \$0x00,$Ii,$inout4,$Z1
	vmovdqu 0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor $Z0,$Z1,$Z1
	vpunpckhqdq $inout3,$inout3,$T3
	vpclmulqdq \$0x11,$Ii,$inout4,$inout4
	vpxor $inout3,$T3,$T3
	vpxor $inout5,$inout4,$inout4
	vpalignr \$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq \$0x10,$HK,$T2,$T2
	vmovdqu 0x50-0x20($Xip),$HK
	vpxor $T1,$T2,$T2

	vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
	vmovdqu 0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor $Z1,$Z0,$Z0
	vpunpckhqdq $inout2,$inout2,$T1
	vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
	vpxor $inout2,$T1,$T1
	vpxor $inout4,$inout3,$inout3
	vxorps 0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq \$0x00,$HK,$T3,$T3
	vpxor $T2,$T3,$T3

	vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
	vxorps $inout5,$Xi,$Xi

	vpclmulqdq \$0x00,$Ii,$inout2,$Z1
	vmovdqu 0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor $Z0,$Z1,$Z1
	vpunpckhqdq $inout1,$inout1,$T2
	vpclmulqdq \$0x11,$Ii,$inout2,$inout2
	vpxor $inout1,$T2,$T2
	vpalignr \$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor $inout3,$inout2,$inout2
	vpclmulqdq \$0x10,$HK,$T1,$T1
	vmovdqu 0x80-0x20($Xip),$HK
	vpxor $T3,$T1,$T1

	vxorps $Z3,$inout5,$inout5
	vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
	vxorps $inout5,$Xi,$Xi

	vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
	vmovdqu 0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor $Z1,$Z0,$Z0
	vpunpckhqdq $Xi,$Xi,$T3
	vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
	vpxor $Xi,$T3,$T3
	vpxor $inout2,$inout1,$inout1
	vpclmulqdq \$0x00,$HK,$T2,$T2
	vpxor $T1,$T2,$T2

	vpclmulqdq \$0x00,$Ii,$Xi,$Z1
	vpclmulqdq \$0x11,$Ii,$Xi,$Z3
	vpxor $Z0,$Z1,$Z1
	vpclmulqdq \$0x10,$HK,$T3,$Z2
	vpxor $inout1,$Z3,$Z3
	vpxor $T2,$Z2,$Z2

	vpxor $Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor $Z0,$Z2,$Z2
	vpslldq \$8,$Z2,$T1
	vmovdqu 0x10($const),$Hkey	# .Lpoly
	vpsrldq \$8,$Z2,$Z2
	vpxor $T1,$Z1,$Xi
	vpxor $Z2,$Z3,$Z3

	vpalignr \$8,$Xi,$Xi,$T2	# 1st phase
	vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
	vpxor $T2,$Xi,$Xi

	vpalignr \$8,$Xi,$Xi,$T2	# 2nd phase
	vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
	vpxor $Z3,$T2,$T2
	vpxor $T2,$Xi,$Xi
___
}
$code.=<<___;
	vpshufb ($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu $Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps -0xd8(%rax),%xmm6
	movaps -0xc8(%rax),%xmm7
	movaps -0xb8(%rax),%xmm8
	movaps -0xa8(%rax),%xmm9
	movaps -0x98(%rax),%xmm10
	movaps -0x88(%rax),%xmm11
	movaps -0x78(%rax),%xmm12
	movaps -0x68(%rax),%xmm13
	movaps -0x58(%rax),%xmm14
	movaps -0x48(%rax),%xmm15
___
$code.=<<___;
	mov -48(%rax),%r15
	mov -40(%rax),%r14
	mov -32(%rax),%r13
	mov -24(%rax),%r12
	mov -16(%rax),%rbp
	mov -8(%rax),%rbx
	lea (%rax),%rsp			# restore %rsp
.Lgcm_enc_abort:
	mov $ret,%rax			# return value

	ret
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

$code.=<<___;
.align 64
.Lbswap_mask:
	.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type gcm_se_handler,\@abi-omnipotent
.align 16
gcm_se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp

	mov 120($context),%rax		# pull context->Rax
	mov 248($context),%rbx		# pull context->Rip

	mov 8($disp),%rsi		# disp->ImageBase
	mov 56($disp),%r11		# disp->HandlerData

	mov 0(%r11),%r10d		# HandlerData[0]
	lea (%rsi,%r10),%r10		# prologue label
	cmp %r10,%rbx			# context->Rip<prologue label
	jb .Lcommon_seh_tail

	mov 152($context),%rax		# pull context->Rsp

	mov 4(%r11),%r10d		# HandlerData[1]
	lea (%rsi,%r10),%r10		# epilogue label
	cmp %r10,%rbx			# context->Rip>=epilogue label
	jae .Lcommon_seh_tail

	mov 120($context),%rax		# pull context->Rax

	mov -48(%rax),%r15
	mov -40(%rax),%r14
	mov -32(%rax),%r13
	mov -24(%rax),%r12
	mov -16(%rax),%rbp
	mov -8(%rax),%rbx
	mov %r15,240($context)
	mov %r14,232($context)
	mov %r13,224($context)
	mov %r12,216($context)
	mov %rbp,160($context)
	mov %rbx,144($context)

	lea -0xd8(%rax),%rsi		# %xmm save area
	lea 512($context),%rdi		# & context.Xmm6
	mov \$20,%ecx			# 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context)		# restore context->Rsp
	mov %rsi,168($context)		# restore context->Rsi
	mov %rdi,176($context)		# restore context->Rdi

	mov 40($disp),%rdi		# disp->ContextRecord
	mov $context,%rsi		# context
	mov \$154,%ecx			# sizeof(CONTEXT)
	.long 0xa548f3fc		# cld; rep movsq

	mov $disp,%rsi
	xor %rcx,%rcx			# arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx		# arg2, disp->ImageBase
	mov 0(%rsi),%r8			# arg3, disp->ControlPc
	mov 16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov 40(%rsi),%r10		# disp->ContextRecord
	lea 56(%rsi),%r11		# &disp->HandlerData
	lea 24(%rsi),%r12		# &disp->EstablisherFrame
	mov %r10,32(%rsp)		# arg5
	mov %r11,40(%rsp)		# arg6
	mov %r12,48(%rsp)		# arg7
	mov %rcx,56(%rsp)		# arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)

	mov \$1,%eax			# ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size gcm_se_handler,.-gcm_se_handler

.section .pdata
.align 4
	.rva .LSEH_begin_aesni_gcm_decrypt
	.rva .LSEH_end_aesni_gcm_decrypt
	.rva .LSEH_gcm_dec_info

	.rva .LSEH_begin_aesni_gcm_encrypt
	.rva .LSEH_end_aesni_gcm_encrypt
	.rva .LSEH_gcm_enc_info
.section .xdata
.align 8
.LSEH_gcm_dec_info:
	.byte 9,0,0,0
	.rva gcm_se_handler
	.rva .Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte 9,0,0,0
	.rva gcm_se_handler
	.rva .Lgcm_enc_body,.Lgcm_enc_abort
___
}

}}} else {{{
$code=<<___;	# assembler is too old
.text

.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor %eax,%eax
	ret
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor %eax,%eax
	ret
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;