  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # March, June 2010
  11. #
  12. # The module implements the "4-bit" GCM GHASH function and the underlying
  13. # single multiplication operation in GF(2^128). "4-bit" means that
  14. # it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
  15. # function features a so-called "528B" variant utilizing an additional
  16. # 256+16 bytes of per-key storage [+512 bytes shared table].
  17. # Performance results are for this streamed GHASH subroutine and are
  18. # expressed in cycles per processed byte; lower is better:
  19. #
  20. # gcc 3.4.x(*) assembler
  21. #
  22. # P4 28.6 14.0 +100%
  23. # Opteron 19.3 7.7 +150%
  24. # Core2 17.8 8.1(**) +120%
  25. # Atom 31.6 16.8 +88%
  26. # VIA Nano 21.8 10.1 +115%
  27. #
  28. # (*) comparison is not completely fair, because C results are
  29. # for vanilla "256B" implementation, while assembler results
  30. # are for "528B";-)
  31. # (**) it's a mystery [to me] why the Core2 result is not the same as for
  32. # Opteron;
  33. # May 2010
  34. #
  35. # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
  36. # See ghash-x86.pl for background information and details about coding
  37. # techniques.
  38. #
  39. # Special thanks to David Woodhouse <dwmw2@infradead.org> for
  40. # providing access to a Westmere-based system on behalf of Intel
  41. # Open Source Technology Centre.
  42. # December 2012
  43. #
  44. # Overhaul: aggregate Karatsuba post-processing, improve ILP in
  45. # reduction_alg9, increase reduction aggregate factor to 4x. As for
  46. # the latter: ghash-x86.pl argues that it makes less sense to
  47. # increase the aggregate factor. Then why increase it here? Critical path
  48. # consists of 3 independent pclmulqdq instructions, Karatsuba post-
  49. # processing and reduction. "On top" of this we lay down aggregated
  50. # multiplication operations, triplets of independent pclmulqdq's. As
  51. # issue rate for pclmulqdq is limited, it makes less sense to
  52. # aggregate more multiplications than it takes to perform remaining
  53. # non-multiplication operations. 2x is near-optimal coefficient for
  54. # contemporary Intel CPUs (therefore modest improvement coefficient),
  55. # but not for Bulldozer. The latter is because logical SIMD operations
  56. # are twice as slow as on Intel, so the critical path is
  57. # longer. A CPU with higher pclmulqdq issue rate would also benefit
  58. # from higher aggregate factor...
  59. #
  60. # Westmere 1.78(+13%)
  61. # Sandy Bridge 1.80(+8%)
  62. # Ivy Bridge 1.80(+7%)
  63. # Haswell 0.55(+93%) (if system doesn't support AVX)
  64. # Broadwell 0.45(+110%)(if system doesn't support AVX)
  65. # Bulldozer 1.49(+27%)
  66. # Silvermont 2.88(+13%)
  67. # March 2013
  68. #
  69. # ... the 8x aggregate factor AVX code path uses the reduction algorithm
  70. # suggested by Shay Gueron [1]. Even though contemporary AVX-capable
  71. # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
  72. # sub-optimally in comparison to the above-mentioned version. But thanks
  73. # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
  74. # it performs at 0.41 cycles per byte on a Haswell processor, and at
  75. # 0.29 on Broadwell.
  76. #
  77. # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
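#
# The sketch below is a minimal bit-by-bit reference model of the GF(2^128)
# multiplication that the table-driven and PCLMULQDQ paths below implement
# (byte-swapping of the input aside), following the GCM specification's bit
# ordering. It is a documentation-only illustration: gf128_mul_ref() and the
# Math::BigInt usage are editorial choices, the sub is never called, and the
# generated assembly does not depend on it.
sub gf128_mul_ref {				# Z = X·H in GF(2^128)
	my ($X,$H) = @_;			# 128-bit Math::BigInt values, MSB-first
	require Math::BigInt;			# loaded lazily; sub is never invoked
	my $R = Math::BigInt->new("0xe1")->blsft(120);	# reduction constant 0xE1||0^120
	my $Z = Math::BigInt->bzero();
	my $V = $H->copy();
	for my $i (0..127) {
		# if bit i of X (counting from the most significant bit) is set, Z ^= V
		$Z->bxor($V) if $X->copy()->brsft(127-$i)->is_odd();
		my $carry = $V->is_odd();	# bit about to be shifted out
		$V->brsft(1);			# "right shift" in GCM bit order
		$V->bxor($R) if ($carry);	# fold the carry back with the polynomial
	}
	return $Z;
}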
  78. $flavour = shift;
  79. $output = shift;
  80. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  81. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  82. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  83. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  84. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  85. die "can't locate x86_64-xlate.pl";
  86. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  87. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  88. $avx = ($1>=2.19) + ($1>=2.22);
  89. }
  90. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  91. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  92. $avx = ($1>=2.09) + ($1>=2.10);
  93. }
  94. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  95. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  96. $avx = ($1>=10) + ($1>=11);
  97. }
  98. if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
  99. $avx = ($2>=3.0) + ($2>3.0);
  100. }
  101. open OUT,"| \"$^X\" $xlate $flavour $output";
  102. *STDOUT=*OUT;
  103. $do4xaggr=1;
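# $do4xaggr gates the 4x aggregated PCLMULQDQ path: when set, gcm_init_clmul
# additionally stores H^3 and H^4 (plus their Karatsuba "salt") in $Htbl, and
# gcm_ghash_clmul processes the input in 64-byte chunks where possible.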
  104. # common register layout
  105. $nlo="%rax";
  106. $nhi="%rbx";
  107. $Zlo="%r8";
  108. $Zhi="%r9";
  109. $tmp="%r10";
  110. $rem_4bit = "%r11";
  111. $Xi="%rdi";
  112. $Htbl="%rsi";
  113. # per-function register layout
  114. $cnt="%rcx";
  115. $rem="%rdx";
  116. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  117. $r =~ s/%[er]([sd]i)/%\1l/ or
  118. $r =~ s/%[er](bp)/%\1l/ or
  119. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
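# LB() rewrites a register name to its low-byte alias, e.g. %eax/%rax -> %al,
# %rdi -> %dil, %rbp -> %bpl, %r10/%r10d -> %r10b.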
  120. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  121. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  122. my $arg = pop;
  123. $arg = "\$$arg" if ($arg*1 eq $arg);
  124. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  125. }
  126. { my $N;
  127. sub loop() {
  128. my $inp = shift;
  129. $N++;
  130. $code.=<<___;
  131. xor $nlo,$nlo
  132. xor $nhi,$nhi
  133. mov `&LB("$Zlo")`,`&LB("$nlo")`
  134. mov `&LB("$Zlo")`,`&LB("$nhi")`
  135. shl \$4,`&LB("$nlo")`
  136. mov \$14,$cnt
  137. mov 8($Htbl,$nlo),$Zlo
  138. mov ($Htbl,$nlo),$Zhi
  139. and \$0xf0,`&LB("$nhi")`
  140. mov $Zlo,$rem
  141. jmp .Loop$N
  142. .align 16
  143. .Loop$N:
  144. shr \$4,$Zlo
  145. and \$0xf,$rem
  146. mov $Zhi,$tmp
  147. mov ($inp,$cnt),`&LB("$nlo")`
  148. shr \$4,$Zhi
  149. xor 8($Htbl,$nhi),$Zlo
  150. shl \$60,$tmp
  151. xor ($Htbl,$nhi),$Zhi
  152. mov `&LB("$nlo")`,`&LB("$nhi")`
  153. xor ($rem_4bit,$rem,8),$Zhi
  154. mov $Zlo,$rem
  155. shl \$4,`&LB("$nlo")`
  156. xor $tmp,$Zlo
  157. dec $cnt
  158. js .Lbreak$N
  159. shr \$4,$Zlo
  160. and \$0xf,$rem
  161. mov $Zhi,$tmp
  162. shr \$4,$Zhi
  163. xor 8($Htbl,$nlo),$Zlo
  164. shl \$60,$tmp
  165. xor ($Htbl,$nlo),$Zhi
  166. and \$0xf0,`&LB("$nhi")`
  167. xor ($rem_4bit,$rem,8),$Zhi
  168. mov $Zlo,$rem
  169. xor $tmp,$Zlo
  170. jmp .Loop$N
  171. .align 16
  172. .Lbreak$N:
  173. shr \$4,$Zlo
  174. and \$0xf,$rem
  175. mov $Zhi,$tmp
  176. shr \$4,$Zhi
  177. xor 8($Htbl,$nlo),$Zlo
  178. shl \$60,$tmp
  179. xor ($Htbl,$nlo),$Zhi
  180. and \$0xf0,`&LB("$nhi")`
  181. xor ($rem_4bit,$rem,8),$Zhi
  182. mov $Zlo,$rem
  183. xor $tmp,$Zlo
  184. shr \$4,$Zlo
  185. and \$0xf,$rem
  186. mov $Zhi,$tmp
  187. shr \$4,$Zhi
  188. xor 8($Htbl,$nhi),$Zlo
  189. shl \$60,$tmp
  190. xor ($Htbl,$nhi),$Zhi
  191. xor $tmp,$Zlo
  192. xor ($rem_4bit,$rem,8),$Zhi
  193. bswap $Zlo
  194. bswap $Zhi
  195. ___
  196. }}
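# loop() emits the body of one 4-bit (Shoup-style) multiplication: Xi is
# consumed a nibble at a time from byte 15 down to byte 0, each nibble
# selecting a 16-byte entry of the 256-byte per-key table at $Htbl, with
# .Lrem_4bit supplying the correction for the 4 bits shifted out on the right.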
  197. $code=<<___;
  198. .text
  199. .extern OPENSSL_ia32cap_P
  200. .globl gcm_gmult_4bit
  201. .type gcm_gmult_4bit,\@function,2
  202. .align 16
  203. gcm_gmult_4bit:
  204. push %rbx
  205. push %rbp # %rbp and %r12 are pushed exclusively in
  206. push %r12 # order to reuse Win64 exception handler...
  207. .Lgmult_prologue:
  208. movzb 15($Xi),$Zlo
  209. lea .Lrem_4bit(%rip),$rem_4bit
  210. ___
  211. &loop ($Xi);
  212. $code.=<<___;
  213. mov $Zlo,8($Xi)
  214. mov $Zhi,($Xi)
  215. mov 16(%rsp),%rbx
  216. lea 24(%rsp),%rsp
  217. .Lgmult_epilogue:
  218. ret
  219. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  220. ___
  221. # per-function register layout
  222. $inp="%rdx";
  223. $len="%rcx";
  224. $rem_8bit=$rem_4bit;
  225. $code.=<<___;
  226. .globl gcm_ghash_4bit
  227. .type gcm_ghash_4bit,\@function,4
  228. .align 16
  229. gcm_ghash_4bit:
  230. push %rbx
  231. push %rbp
  232. push %r12
  233. push %r13
  234. push %r14
  235. push %r15
  236. sub \$280,%rsp
  237. .Lghash_prologue:
  238. mov $inp,%r14 # reassign couple of args
  239. mov $len,%r15
  240. ___
  241. { my $inp="%r14";
  242. my $dat="%edx";
  243. my $len="%r15";
  244. my @nhi=("%ebx","%ecx");
  245. my @rem=("%r12","%r13");
  246. my $Hshr4="%rbp";
  247. &sub ($Htbl,-128); # size optimization
  248. &lea ($Hshr4,"16+128(%rsp)");
  249. { my @lo =($nlo,$nhi);
  250. my @hi =($Zlo,$Zhi);
  251. &xor ($dat,$dat);
  252. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  253. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  254. &or ($lo[0],$tmp) if ($i>1);
  255. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  256. &shr ($lo[1],4) if ($i>0 && $i<17);
  257. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  258. &shr ($hi[1],4) if ($i>0 && $i<17);
  259. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  260. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  261. &shl (&LB($dat),4) if ($i>0 && $i<17);
  262. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  263. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  264. &shl ($tmp,60) if ($i>0 && $i<17);
  265. push (@lo,shift(@lo));
  266. push (@hi,shift(@hi));
  267. }
  268. }
  269. &add ($Htbl,-128);
  270. &mov ($Zlo,"8($Xi)");
  271. &mov ($Zhi,"0($Xi)");
  272. &add ($len,$inp); # pointer to the end of data
  273. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  274. &jmp (".Louter_loop");
  275. $code.=".align 16\n.Louter_loop:\n";
  276. &xor ($Zhi,"($inp)");
  277. &mov ("%rdx","8($inp)");
  278. &lea ($inp,"16($inp)");
  279. &xor ("%rdx",$Zlo);
  280. &mov ("($Xi)",$Zhi);
  281. &mov ("8($Xi)","%rdx");
  282. &shr ("%rdx",32);
  283. &xor ($nlo,$nlo);
  284. &rol ($dat,8);
  285. &mov (&LB($nlo),&LB($dat));
  286. &movz ($nhi[0],&LB($dat));
  287. &shl (&LB($nlo),4);
  288. &shr ($nhi[0],4);
  289. for ($j=11,$i=0;$i<15;$i++) {
  290. &rol ($dat,8);
  291. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  292. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  293. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  294. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  295. &mov (&LB($nlo),&LB($dat));
  296. &xor ($Zlo,$tmp) if ($i>0);
  297. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  298. &movz ($nhi[1],&LB($dat));
  299. &shl (&LB($nlo),4);
  300. &movzb ($rem[0],"(%rsp,$nhi[0])");
  301. &shr ($nhi[1],4) if ($i<14);
  302. &and ($nhi[1],0xf0) if ($i==14);
  303. &shl ($rem[1],48) if ($i>0);
  304. &xor ($rem[0],$Zlo);
  305. &mov ($tmp,$Zhi);
  306. &xor ($Zhi,$rem[1]) if ($i>0);
  307. &shr ($Zlo,8);
  308. &movz ($rem[0],&LB($rem[0]));
  309. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  310. &shr ($Zhi,8);
  311. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  312. &shl ($tmp,56);
  313. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  314. unshift (@nhi,pop(@nhi)); # "rotate" registers
  315. unshift (@rem,pop(@rem));
  316. }
  317. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  318. &xor ($Zlo,"8($Htbl,$nlo)");
  319. &xor ($Zhi,"($Htbl,$nlo)");
  320. &shl ($rem[1],48);
  321. &xor ($Zlo,$tmp);
  322. &xor ($Zhi,$rem[1]);
  323. &movz ($rem[0],&LB($Zlo));
  324. &shr ($Zlo,4);
  325. &mov ($tmp,$Zhi);
  326. &shl (&LB($rem[0]),4);
  327. &shr ($Zhi,4);
  328. &xor ($Zlo,"8($Htbl,$nhi[0])");
  329. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  330. &shl ($tmp,60);
  331. &xor ($Zhi,"($Htbl,$nhi[0])");
  332. &xor ($Zlo,$tmp);
  333. &shl ($rem[0],48);
  334. &bswap ($Zlo);
  335. &xor ($Zhi,$rem[0]);
  336. &bswap ($Zhi);
  337. &cmp ($inp,$len);
  338. &jb (".Louter_loop");
  339. }
  340. $code.=<<___;
  341. mov $Zlo,8($Xi)
  342. mov $Zhi,($Xi)
  343. lea 280(%rsp),%rsi
  344. mov 0(%rsi),%r15
  345. mov 8(%rsi),%r14
  346. mov 16(%rsi),%r13
  347. mov 24(%rsi),%r12
  348. mov 32(%rsi),%rbp
  349. mov 40(%rsi),%rbx
  350. lea 48(%rsi),%rsp
  351. .Lghash_epilogue:
  352. ret
  353. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  354. ___
  355. ######################################################################
  356. # PCLMULQDQ version.
  357. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  358. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  359. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  360. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  361. sub clmul64x64_T2 { # minimal register pressure
  362. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  363. if (!defined($HK)) { $HK = $T2;
  364. $code.=<<___;
  365. movdqa $Xi,$Xhi #
  366. pshufd \$0b01001110,$Xi,$T1
  367. pshufd \$0b01001110,$Hkey,$T2
  368. pxor $Xi,$T1 #
  369. pxor $Hkey,$T2
  370. ___
  371. } else {
  372. $code.=<<___;
  373. movdqa $Xi,$Xhi #
  374. pshufd \$0b01001110,$Xi,$T1
  375. pxor $Xi,$T1 #
  376. ___
  377. }
  378. $code.=<<___;
  379. pclmulqdq \$0x00,$Hkey,$Xi #######
  380. pclmulqdq \$0x11,$Hkey,$Xhi #######
  381. pclmulqdq \$0x00,$HK,$T1 #######
  382. pxor $Xi,$T1 #
  383. pxor $Xhi,$T1 #
  384. movdqa $T1,$T2 #
  385. psrldq \$8,$T1
  386. pslldq \$8,$T2 #
  387. pxor $T1,$Xhi
  388. pxor $T2,$Xi #
  389. ___
  390. }
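# clmul64x64_T2() is one Karatsuba step over GF(2)[x]: writing the inputs as
# A = A1·x^64 + A0 and B = B1·x^64 + B0,
#
#	A·B = A1·B1·x^128 + ((A1^A0)·(B1^B0) ^ A1·B1 ^ A0·B0)·x^64 + A0·B0,
#
# so three pclmulqdq's (low·low, high·high, and the product of the xor-ed
# halves) suffice; the pxor/psrldq/pslldq tail folds the middle term into
# the 256-bit result held in $Xhi:$Xi.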
  391. sub reduction_alg9 { # 17/11 times faster than Intel version
  392. my ($Xhi,$Xi) = @_;
  393. $code.=<<___;
  394. # 1st phase
  395. movdqa $Xi,$T2 #
  396. movdqa $Xi,$T1
  397. psllq \$5,$Xi
  398. pxor $Xi,$T1 #
  399. psllq \$1,$Xi
  400. pxor $T1,$Xi #
  401. psllq \$57,$Xi #
  402. movdqa $Xi,$T1 #
  403. pslldq \$8,$Xi
  404. psrldq \$8,$T1 #
  405. pxor $T2,$Xi
  406. pxor $T1,$Xhi #
  407. # 2nd phase
  408. movdqa $Xi,$T2
  409. psrlq \$1,$Xi
  410. pxor $T2,$Xhi #
  411. pxor $Xi,$T2
  412. psrlq \$5,$Xi
  413. pxor $T2,$Xi #
  414. psrlq \$1,$Xi #
  415. pxor $Xhi,$Xi #
  416. ___
  417. }
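# reduction_alg9() reduces the 256-bit value in $Xhi:$Xi modulo the GHASH
# polynomial in its bit-reflected representation. In effect, the 1st phase
# folds the low half upward by multiplying with x^63+x^62+x^57 (the combined
# psllq 5/1/57 sequence), and the 2nd phase xors in the value shifted right
# by 1, 2 and 7 bit positions, which together amount to reduction modulo
# g(x) = x^128+x^7+x^2+x+1 without needing another carry-less multiply.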
  418. { my ($Htbl,$Xip)=@_4args;
  419. my $HK="%xmm6";
  420. $code.=<<___;
  421. .globl gcm_init_clmul
  422. .type gcm_init_clmul,\@abi-omnipotent
  423. .align 16
  424. gcm_init_clmul:
  425. .L_init_clmul:
  426. ___
  427. $code.=<<___ if ($win64);
  428. .LSEH_begin_gcm_init_clmul:
  429. # I can't trust assembler to use specific encoding:-(
  430. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  431. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  432. ___
  433. $code.=<<___;
  434. movdqu ($Xip),$Hkey
  435. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  436. # <<1 twist
  437. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  438. movdqa $Hkey,$T1
  439. psllq \$1,$Hkey
  440. pxor $T3,$T3 #
  441. psrlq \$63,$T1
  442. pcmpgtd $T2,$T3 # broadcast carry bit
  443. pslldq \$8,$T1
  444. por $T1,$Hkey # H<<=1
  445. # magic reduction
  446. pand .L0x1c2_polynomial(%rip),$T3
  447. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  448. # calculate H^2
  449. pshufd \$0b01001110,$Hkey,$HK
  450. movdqa $Hkey,$Xi
  451. pxor $Hkey,$HK
  452. ___
  453. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  454. &reduction_alg9 ($Xhi,$Xi);
  455. $code.=<<___;
  456. pshufd \$0b01001110,$Hkey,$T1
  457. pshufd \$0b01001110,$Xi,$T2
  458. pxor $Hkey,$T1 # Karatsuba pre-processing
  459. movdqu $Hkey,0x00($Htbl) # save H
  460. pxor $Xi,$T2 # Karatsuba pre-processing
  461. movdqu $Xi,0x10($Htbl) # save H^2
  462. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  463. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  464. ___
  465. if ($do4xaggr) {
  466. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  467. &reduction_alg9 ($Xhi,$Xi);
  468. $code.=<<___;
  469. movdqa $Xi,$T3
  470. ___
  471. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  472. &reduction_alg9 ($Xhi,$Xi);
  473. $code.=<<___;
  474. pshufd \$0b01001110,$T3,$T1
  475. pshufd \$0b01001110,$Xi,$T2
  476. pxor $T3,$T1 # Karatsuba pre-processing
  477. movdqu $T3,0x30($Htbl) # save H^3
  478. pxor $Xi,$T2 # Karatsuba pre-processing
  479. movdqu $Xi,0x40($Htbl) # save H^4
  480. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  481. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  482. ___
  483. }
  484. $code.=<<___ if ($win64);
  485. movaps (%rsp),%xmm6
  486. lea 0x18(%rsp),%rsp
  487. .LSEH_end_gcm_init_clmul:
  488. ___
  489. $code.=<<___;
  490. ret
  491. .size gcm_init_clmul,.-gcm_init_clmul
  492. ___
  493. }
  494. { my ($Xip,$Htbl)=@_4args;
  495. $code.=<<___;
  496. .globl gcm_gmult_clmul
  497. .type gcm_gmult_clmul,\@abi-omnipotent
  498. .align 16
  499. gcm_gmult_clmul:
  500. .L_gmult_clmul:
  501. movdqu ($Xip),$Xi
  502. movdqa .Lbswap_mask(%rip),$T3
  503. movdqu ($Htbl),$Hkey
  504. movdqu 0x20($Htbl),$T2
  505. pshufb $T3,$Xi
  506. ___
  507. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  508. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  509. # experimental alternative. the special thing about it is that there is
  510. # no dependency between the two multiplications...
  511. mov \$`0xE1<<1`,%eax
  512. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  513. mov \$0x07,%r11d
  514. movq %rax,$T1
  515. movq %r10,$T2
  516. movq %r11,$T3 # borrow $T3
  517. pand $Xi,$T3
  518. pshufb $T3,$T2 # ($Xi&7)·0xE0
  519. movq %rax,$T3
  520. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  521. pxor $Xi,$T2
  522. pslldq \$15,$T2
  523. paddd $T2,$T2 # <<(64+56+1)
  524. pxor $T2,$Xi
  525. pclmulqdq \$0x01,$T3,$Xi
  526. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  527. psrldq \$1,$T1
  528. pxor $T1,$Xhi
  529. pslldq \$7,$Xi
  530. pxor $Xhi,$Xi
  531. ___
  532. $code.=<<___;
  533. pshufb $T3,$Xi
  534. movdqu $Xi,($Xip)
  535. ret
  536. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  537. ___
  538. }
  539. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  540. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  541. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  542. $code.=<<___;
  543. .globl gcm_ghash_clmul
  544. .type gcm_ghash_clmul,\@abi-omnipotent
  545. .align 32
  546. gcm_ghash_clmul:
  547. .L_ghash_clmul:
  548. ___
  549. $code.=<<___ if ($win64);
  550. lea -0x88(%rsp),%rax
  551. .LSEH_begin_gcm_ghash_clmul:
  552. # I can't trust assembler to use specific encoding:-(
  553. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  554. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  555. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  556. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  557. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  558. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  559. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  560. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  561. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  562. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  563. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  564. ___
  565. $code.=<<___;
  566. movdqa .Lbswap_mask(%rip),$T3
  567. movdqu ($Xip),$Xi
  568. movdqu ($Htbl),$Hkey
  569. movdqu 0x20($Htbl),$HK
  570. pshufb $T3,$Xi
  571. sub \$0x10,$len
  572. jz .Lodd_tail
  573. movdqu 0x10($Htbl),$Hkey2
  574. ___
  575. if ($do4xaggr) {
  576. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  577. $code.=<<___;
  578. mov OPENSSL_ia32cap_P+4(%rip),%eax
  579. cmp \$0x30,$len
  580. jb .Lskip4x
  581. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  582. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  583. je .Lskip4x
  584. sub \$0x30,$len
  585. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  586. movdqu 0x30($Htbl),$Hkey3
  587. movdqu 0x40($Htbl),$Hkey4
  588. #######
  589. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  590. #
  591. movdqu 0x30($inp),$Xln
  592. movdqu 0x20($inp),$Xl
  593. pshufb $T3,$Xln
  594. pshufb $T3,$Xl
  595. movdqa $Xln,$Xhn
  596. pshufd \$0b01001110,$Xln,$Xmn
  597. pxor $Xln,$Xmn
  598. pclmulqdq \$0x00,$Hkey,$Xln
  599. pclmulqdq \$0x11,$Hkey,$Xhn
  600. pclmulqdq \$0x00,$HK,$Xmn
  601. movdqa $Xl,$Xh
  602. pshufd \$0b01001110,$Xl,$Xm
  603. pxor $Xl,$Xm
  604. pclmulqdq \$0x00,$Hkey2,$Xl
  605. pclmulqdq \$0x11,$Hkey2,$Xh
  606. pclmulqdq \$0x10,$HK,$Xm
  607. xorps $Xl,$Xln
  608. xorps $Xh,$Xhn
  609. movups 0x50($Htbl),$HK
  610. xorps $Xm,$Xmn
  611. movdqu 0x10($inp),$Xl
  612. movdqu 0($inp),$T1
  613. pshufb $T3,$Xl
  614. pshufb $T3,$T1
  615. movdqa $Xl,$Xh
  616. pshufd \$0b01001110,$Xl,$Xm
  617. pxor $T1,$Xi
  618. pxor $Xl,$Xm
  619. pclmulqdq \$0x00,$Hkey3,$Xl
  620. movdqa $Xi,$Xhi
  621. pshufd \$0b01001110,$Xi,$T1
  622. pxor $Xi,$T1
  623. pclmulqdq \$0x11,$Hkey3,$Xh
  624. pclmulqdq \$0x00,$HK,$Xm
  625. xorps $Xl,$Xln
  626. xorps $Xh,$Xhn
  627. lea 0x40($inp),$inp
  628. sub \$0x40,$len
  629. jc .Ltail4x
  630. jmp .Lmod4_loop
  631. .align 32
  632. .Lmod4_loop:
  633. pclmulqdq \$0x00,$Hkey4,$Xi
  634. xorps $Xm,$Xmn
  635. movdqu 0x30($inp),$Xl
  636. pshufb $T3,$Xl
  637. pclmulqdq \$0x11,$Hkey4,$Xhi
  638. xorps $Xln,$Xi
  639. movdqu 0x20($inp),$Xln
  640. movdqa $Xl,$Xh
  641. pclmulqdq \$0x10,$HK,$T1
  642. pshufd \$0b01001110,$Xl,$Xm
  643. xorps $Xhn,$Xhi
  644. pxor $Xl,$Xm
  645. pshufb $T3,$Xln
  646. movups 0x20($Htbl),$HK
  647. xorps $Xmn,$T1
  648. pclmulqdq \$0x00,$Hkey,$Xl
  649. pshufd \$0b01001110,$Xln,$Xmn
  650. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  651. movdqa $Xln,$Xhn
  652. pxor $Xhi,$T1 #
  653. pxor $Xln,$Xmn
  654. movdqa $T1,$T2 #
  655. pclmulqdq \$0x11,$Hkey,$Xh
  656. pslldq \$8,$T1
  657. psrldq \$8,$T2 #
  658. pxor $T1,$Xi
  659. movdqa .L7_mask(%rip),$T1
  660. pxor $T2,$Xhi #
  661. movq %rax,$T2
  662. pand $Xi,$T1 # 1st phase
  663. pshufb $T1,$T2 #
  664. pxor $Xi,$T2 #
  665. pclmulqdq \$0x00,$HK,$Xm
  666. psllq \$57,$T2 #
  667. movdqa $T2,$T1 #
  668. pslldq \$8,$T2
  669. pclmulqdq \$0x00,$Hkey2,$Xln
  670. psrldq \$8,$T1 #
  671. pxor $T2,$Xi
  672. pxor $T1,$Xhi #
  673. movdqu 0($inp),$T1
  674. movdqa $Xi,$T2 # 2nd phase
  675. psrlq \$1,$Xi
  676. pclmulqdq \$0x11,$Hkey2,$Xhn
  677. xorps $Xl,$Xln
  678. movdqu 0x10($inp),$Xl
  679. pshufb $T3,$Xl
  680. pclmulqdq \$0x10,$HK,$Xmn
  681. xorps $Xh,$Xhn
  682. movups 0x50($Htbl),$HK
  683. pshufb $T3,$T1
  684. pxor $T2,$Xhi #
  685. pxor $Xi,$T2
  686. psrlq \$5,$Xi
  687. movdqa $Xl,$Xh
  688. pxor $Xm,$Xmn
  689. pshufd \$0b01001110,$Xl,$Xm
  690. pxor $T2,$Xi #
  691. pxor $T1,$Xhi
  692. pxor $Xl,$Xm
  693. pclmulqdq \$0x00,$Hkey3,$Xl
  694. psrlq \$1,$Xi #
  695. pxor $Xhi,$Xi #
  696. movdqa $Xi,$Xhi
  697. pclmulqdq \$0x11,$Hkey3,$Xh
  698. xorps $Xl,$Xln
  699. pshufd \$0b01001110,$Xi,$T1
  700. pxor $Xi,$T1
  701. pclmulqdq \$0x00,$HK,$Xm
  702. xorps $Xh,$Xhn
  703. lea 0x40($inp),$inp
  704. sub \$0x40,$len
  705. jnc .Lmod4_loop
  706. .Ltail4x:
  707. pclmulqdq \$0x00,$Hkey4,$Xi
  708. pclmulqdq \$0x11,$Hkey4,$Xhi
  709. pclmulqdq \$0x10,$HK,$T1
  710. xorps $Xm,$Xmn
  711. xorps $Xln,$Xi
  712. xorps $Xhn,$Xhi
  713. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  714. pxor $Xmn,$T1
  715. pxor $Xhi,$T1 #
  716. pxor $Xi,$Xhi
  717. movdqa $T1,$T2 #
  718. psrldq \$8,$T1
  719. pslldq \$8,$T2 #
  720. pxor $T1,$Xhi
  721. pxor $T2,$Xi #
  722. ___
  723. &reduction_alg9($Xhi,$Xi);
  724. $code.=<<___;
  725. add \$0x40,$len
  726. jz .Ldone
  727. movdqu 0x20($Htbl),$HK
  728. sub \$0x10,$len
  729. jz .Lodd_tail
  730. .Lskip4x:
  731. ___
  732. }
  733. $code.=<<___;
  734. #######
  735. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  736. # [(H*Ii+1) + (H*Xi+1)] mod P =
  737. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  738. #
  739. movdqu ($inp),$T1 # Ii
  740. movdqu 16($inp),$Xln # Ii+1
  741. pshufb $T3,$T1
  742. pshufb $T3,$Xln
  743. pxor $T1,$Xi # Ii+Xi
  744. movdqa $Xln,$Xhn
  745. pshufd \$0b01001110,$Xln,$Xmn
  746. pxor $Xln,$Xmn
  747. pclmulqdq \$0x00,$Hkey,$Xln
  748. pclmulqdq \$0x11,$Hkey,$Xhn
  749. pclmulqdq \$0x00,$HK,$Xmn
  750. lea 32($inp),$inp # i+=2
  751. nop
  752. sub \$0x20,$len
  753. jbe .Leven_tail
  754. nop
  755. jmp .Lmod_loop
  756. .align 32
  757. .Lmod_loop:
  758. movdqa $Xi,$Xhi
  759. movdqa $Xmn,$T1
  760. pshufd \$0b01001110,$Xi,$Xmn #
  761. pxor $Xi,$Xmn #
  762. pclmulqdq \$0x00,$Hkey2,$Xi
  763. pclmulqdq \$0x11,$Hkey2,$Xhi
  764. pclmulqdq \$0x10,$HK,$Xmn
  765. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  766. pxor $Xhn,$Xhi
  767. movdqu ($inp),$T2 # Ii
  768. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  769. pshufb $T3,$T2
  770. movdqu 16($inp),$Xln # Ii+1
  771. pxor $Xhi,$T1
  772. pxor $T2,$Xhi # "Ii+Xi", consume early
  773. pxor $T1,$Xmn
  774. pshufb $T3,$Xln
  775. movdqa $Xmn,$T1 #
  776. psrldq \$8,$T1
  777. pslldq \$8,$Xmn #
  778. pxor $T1,$Xhi
  779. pxor $Xmn,$Xi #
  780. movdqa $Xln,$Xhn #
  781. movdqa $Xi,$T2 # 1st phase
  782. movdqa $Xi,$T1
  783. psllq \$5,$Xi
  784. pxor $Xi,$T1 #
  785. pclmulqdq \$0x00,$Hkey,$Xln #######
  786. psllq \$1,$Xi
  787. pxor $T1,$Xi #
  788. psllq \$57,$Xi #
  789. movdqa $Xi,$T1 #
  790. pslldq \$8,$Xi
  791. psrldq \$8,$T1 #
  792. pxor $T2,$Xi
  793. pshufd \$0b01001110,$Xhn,$Xmn
  794. pxor $T1,$Xhi #
  795. pxor $Xhn,$Xmn #
  796. movdqa $Xi,$T2 # 2nd phase
  797. psrlq \$1,$Xi
  798. pclmulqdq \$0x11,$Hkey,$Xhn #######
  799. pxor $T2,$Xhi #
  800. pxor $Xi,$T2
  801. psrlq \$5,$Xi
  802. pxor $T2,$Xi #
  803. lea 32($inp),$inp
  804. psrlq \$1,$Xi #
  805. pclmulqdq \$0x00,$HK,$Xmn #######
  806. pxor $Xhi,$Xi #
  807. sub \$0x20,$len
  808. ja .Lmod_loop
  809. .Leven_tail:
  810. movdqa $Xi,$Xhi
  811. movdqa $Xmn,$T1
  812. pshufd \$0b01001110,$Xi,$Xmn #
  813. pxor $Xi,$Xmn #
  814. pclmulqdq \$0x00,$Hkey2,$Xi
  815. pclmulqdq \$0x11,$Hkey2,$Xhi
  816. pclmulqdq \$0x10,$HK,$Xmn
  817. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  818. pxor $Xhn,$Xhi
  819. pxor $Xi,$T1
  820. pxor $Xhi,$T1
  821. pxor $T1,$Xmn
  822. movdqa $Xmn,$T1 #
  823. psrldq \$8,$T1
  824. pslldq \$8,$Xmn #
  825. pxor $T1,$Xhi
  826. pxor $Xmn,$Xi #
  827. ___
  828. &reduction_alg9 ($Xhi,$Xi);
  829. $code.=<<___;
  830. test $len,$len
  831. jnz .Ldone
  832. .Lodd_tail:
  833. movdqu ($inp),$T1 # Ii
  834. pshufb $T3,$T1
  835. pxor $T1,$Xi # Ii+Xi
  836. ___
  837. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  838. &reduction_alg9 ($Xhi,$Xi);
  839. $code.=<<___;
  840. .Ldone:
  841. pshufb $T3,$Xi
  842. movdqu $Xi,($Xip)
  843. ___
  844. $code.=<<___ if ($win64);
  845. movaps (%rsp),%xmm6
  846. movaps 0x10(%rsp),%xmm7
  847. movaps 0x20(%rsp),%xmm8
  848. movaps 0x30(%rsp),%xmm9
  849. movaps 0x40(%rsp),%xmm10
  850. movaps 0x50(%rsp),%xmm11
  851. movaps 0x60(%rsp),%xmm12
  852. movaps 0x70(%rsp),%xmm13
  853. movaps 0x80(%rsp),%xmm14
  854. movaps 0x90(%rsp),%xmm15
  855. lea 0xa8(%rsp),%rsp
  856. .LSEH_end_gcm_ghash_clmul:
  857. ___
  858. $code.=<<___;
  859. ret
  860. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  861. ___
  862. }
  863. $code.=<<___;
  864. .globl gcm_init_avx
  865. .type gcm_init_avx,\@abi-omnipotent
  866. .align 32
  867. gcm_init_avx:
  868. ___
  869. if ($avx) {
  870. my ($Htbl,$Xip)=@_4args;
  871. my $HK="%xmm6";
  872. $code.=<<___ if ($win64);
  873. .LSEH_begin_gcm_init_avx:
  874. # I can't trust assembler to use specific encoding:-(
  875. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  876. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  877. ___
  878. $code.=<<___;
  879. vzeroupper
  880. vmovdqu ($Xip),$Hkey
  881. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  882. # <<1 twist
  883. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  884. vpsrlq \$63,$Hkey,$T1
  885. vpsllq \$1,$Hkey,$Hkey
  886. vpxor $T3,$T3,$T3 #
  887. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  888. vpslldq \$8,$T1,$T1
  889. vpor $T1,$Hkey,$Hkey # H<<=1
  890. # magic reduction
  891. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  892. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  893. vpunpckhqdq $Hkey,$Hkey,$HK
  894. vmovdqa $Hkey,$Xi
  895. vpxor $Hkey,$HK,$HK
  896. mov \$4,%r10 # up to H^8
  897. jmp .Linit_start_avx
  898. ___
  899. sub clmul64x64_avx {
  900. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  901. if (!defined($HK)) { $HK = $T2;
  902. $code.=<<___;
  903. vpunpckhqdq $Xi,$Xi,$T1
  904. vpunpckhqdq $Hkey,$Hkey,$T2
  905. vpxor $Xi,$T1,$T1 #
  906. vpxor $Hkey,$T2,$T2
  907. ___
  908. } else {
  909. $code.=<<___;
  910. vpunpckhqdq $Xi,$Xi,$T1
  911. vpxor $Xi,$T1,$T1 #
  912. ___
  913. }
  914. $code.=<<___;
  915. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  916. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  917. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  918. vpxor $Xi,$Xhi,$T2 #
  919. vpxor $T2,$T1,$T1 #
  920. vpslldq \$8,$T1,$T2 #
  921. vpsrldq \$8,$T1,$T1
  922. vpxor $T2,$Xi,$Xi #
  923. vpxor $T1,$Xhi,$Xhi
  924. ___
  925. }
  926. sub reduction_avx {
  927. my ($Xhi,$Xi) = @_;
  928. $code.=<<___;
  929. vpsllq \$57,$Xi,$T1 # 1st phase
  930. vpsllq \$62,$Xi,$T2
  931. vpxor $T1,$T2,$T2 #
  932. vpsllq \$63,$Xi,$T1
  933. vpxor $T1,$T2,$T2 #
  934. vpslldq \$8,$T2,$T1 #
  935. vpsrldq \$8,$T2,$T2
  936. vpxor $T1,$Xi,$Xi #
  937. vpxor $T2,$Xhi,$Xhi
  938. vpsrlq \$1,$Xi,$T2 # 2nd phase
  939. vpxor $Xi,$Xhi,$Xhi
  940. vpxor $T2,$Xi,$Xi #
  941. vpsrlq \$5,$T2,$T2
  942. vpxor $T2,$Xi,$Xi #
  943. vpsrlq \$1,$Xi,$Xi #
  944. vpxor $Xhi,$Xi,$Xi #
  945. ___
  946. }
  947. $code.=<<___;
  948. .align 32
  949. .Linit_loop_avx:
  950. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  951. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  952. ___
  953. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  954. &reduction_avx ($Xhi,$Xi);
  955. $code.=<<___;
  956. .Linit_start_avx:
  957. vmovdqa $Xi,$T3
  958. ___
  959. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  960. &reduction_avx ($Xhi,$Xi);
  961. $code.=<<___;
  962. vpshufd \$0b01001110,$T3,$T1
  963. vpshufd \$0b01001110,$Xi,$T2
  964. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  965. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  966. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  967. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  968. lea 0x30($Htbl),$Htbl
  969. sub \$1,%r10
  970. jnz .Linit_loop_avx
  971. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  972. vmovdqu $T3,-0x10($Htbl)
  973. vzeroupper
  974. ___
  975. $code.=<<___ if ($win64);
  976. movaps (%rsp),%xmm6
  977. lea 0x18(%rsp),%rsp
  978. .LSEH_end_gcm_init_avx:
  979. ___
  980. $code.=<<___;
  981. ret
  982. .size gcm_init_avx,.-gcm_init_avx
  983. ___
  984. } else {
  985. $code.=<<___;
  986. jmp .L_init_clmul
  987. .size gcm_init_avx,.-gcm_init_avx
  988. ___
  989. }
  990. $code.=<<___;
  991. .globl gcm_gmult_avx
  992. .type gcm_gmult_avx,\@abi-omnipotent
  993. .align 32
  994. gcm_gmult_avx:
  995. jmp .L_gmult_clmul
  996. .size gcm_gmult_avx,.-gcm_gmult_avx
  997. ___
  998. $code.=<<___;
  999. .globl gcm_ghash_avx
  1000. .type gcm_ghash_avx,\@abi-omnipotent
  1001. .align 32
  1002. gcm_ghash_avx:
  1003. ___
  1004. if ($avx) {
  1005. my ($Xip,$Htbl,$inp,$len)=@_4args;
  1006. my ($Xlo,$Xhi,$Xmi,
  1007. $Zlo,$Zhi,$Zmi,
  1008. $Hkey,$HK,$T1,$T2,
  1009. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  1010. $code.=<<___ if ($win64);
  1011. lea -0x88(%rsp),%rax
  1012. .LSEH_begin_gcm_ghash_avx:
  1013. # I can't trust assembler to use specific encoding:-(
  1014. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1015. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1016. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1017. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1018. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1019. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1020. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1021. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1022. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1023. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1024. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1025. ___
  1026. $code.=<<___;
  1027. vzeroupper
  1028. vmovdqu ($Xip),$Xi # load $Xi
  1029. lea .L0x1c2_polynomial(%rip),%r10
  1030. lea 0x40($Htbl),$Htbl # size optimization
  1031. vmovdqu .Lbswap_mask(%rip),$bswap
  1032. vpshufb $bswap,$Xi,$Xi
  1033. cmp \$0x80,$len
  1034. jb .Lshort_avx
  1035. sub \$0x80,$len
  1036. vmovdqu 0x70($inp),$Ii # I[7]
  1037. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1038. vpshufb $bswap,$Ii,$Ii
  1039. vmovdqu 0x20-0x40($Htbl),$HK
  1040. vpunpckhqdq $Ii,$Ii,$T2
  1041. vmovdqu 0x60($inp),$Ij # I[6]
  1042. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1043. vpxor $Ii,$T2,$T2
  1044. vpshufb $bswap,$Ij,$Ij
  1045. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1046. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1047. vpunpckhqdq $Ij,$Ij,$T1
  1048. vmovdqu 0x50($inp),$Ii # I[5]
  1049. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1050. vpxor $Ij,$T1,$T1
  1051. vpshufb $bswap,$Ii,$Ii
  1052. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1053. vpunpckhqdq $Ii,$Ii,$T2
  1054. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1055. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1056. vpxor $Ii,$T2,$T2
  1057. vmovdqu 0x40($inp),$Ij # I[4]
  1058. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1059. vmovdqu 0x50-0x40($Htbl),$HK
  1060. vpshufb $bswap,$Ij,$Ij
  1061. vpxor $Xlo,$Zlo,$Zlo
  1062. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1063. vpxor $Xhi,$Zhi,$Zhi
  1064. vpunpckhqdq $Ij,$Ij,$T1
  1065. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1066. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1067. vpxor $Xmi,$Zmi,$Zmi
  1068. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1069. vpxor $Ij,$T1,$T1
  1070. vmovdqu 0x30($inp),$Ii # I[3]
  1071. vpxor $Zlo,$Xlo,$Xlo
  1072. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1073. vpxor $Zhi,$Xhi,$Xhi
  1074. vpshufb $bswap,$Ii,$Ii
  1075. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1076. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1077. vpxor $Zmi,$Xmi,$Xmi
  1078. vpunpckhqdq $Ii,$Ii,$T2
  1079. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1080. vmovdqu 0x80-0x40($Htbl),$HK
  1081. vpxor $Ii,$T2,$T2
  1082. vmovdqu 0x20($inp),$Ij # I[2]
  1083. vpxor $Xlo,$Zlo,$Zlo
  1084. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1085. vpxor $Xhi,$Zhi,$Zhi
  1086. vpshufb $bswap,$Ij,$Ij
  1087. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1088. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1089. vpxor $Xmi,$Zmi,$Zmi
  1090. vpunpckhqdq $Ij,$Ij,$T1
  1091. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1092. vpxor $Ij,$T1,$T1
  1093. vmovdqu 0x10($inp),$Ii # I[1]
  1094. vpxor $Zlo,$Xlo,$Xlo
  1095. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1096. vpxor $Zhi,$Xhi,$Xhi
  1097. vpshufb $bswap,$Ii,$Ii
  1098. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1099. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1100. vpxor $Zmi,$Xmi,$Xmi
  1101. vpunpckhqdq $Ii,$Ii,$T2
  1102. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1103. vmovdqu 0xb0-0x40($Htbl),$HK
  1104. vpxor $Ii,$T2,$T2
  1105. vmovdqu ($inp),$Ij # I[0]
  1106. vpxor $Xlo,$Zlo,$Zlo
  1107. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1108. vpxor $Xhi,$Zhi,$Zhi
  1109. vpshufb $bswap,$Ij,$Ij
  1110. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1111. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1112. vpxor $Xmi,$Zmi,$Zmi
  1113. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1114. lea 0x80($inp),$inp
  1115. cmp \$0x80,$len
  1116. jb .Ltail_avx
  1117. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1118. sub \$0x80,$len
  1119. jmp .Loop8x_avx
  1120. .align 32
  1121. .Loop8x_avx:
  1122. vpunpckhqdq $Ij,$Ij,$T1
  1123. vmovdqu 0x70($inp),$Ii # I[7]
  1124. vpxor $Xlo,$Zlo,$Zlo
  1125. vpxor $Ij,$T1,$T1
  1126. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1127. vpshufb $bswap,$Ii,$Ii
  1128. vpxor $Xhi,$Zhi,$Zhi
  1129. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1130. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1131. vpunpckhqdq $Ii,$Ii,$T2
  1132. vpxor $Xmi,$Zmi,$Zmi
  1133. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1134. vmovdqu 0x20-0x40($Htbl),$HK
  1135. vpxor $Ii,$T2,$T2
  1136. vmovdqu 0x60($inp),$Ij # I[6]
  1137. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1138. vpxor $Zlo,$Xi,$Xi # collect result
  1139. vpshufb $bswap,$Ij,$Ij
  1140. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1141. vxorps $Zhi,$Xo,$Xo
  1142. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1143. vpunpckhqdq $Ij,$Ij,$T1
  1144. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1145. vpxor $Zmi,$Tred,$Tred
  1146. vxorps $Ij,$T1,$T1
  1147. vmovdqu 0x50($inp),$Ii # I[5]
  1148. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1149. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1150. vpxor $Xo,$Tred,$Tred
  1151. vpslldq \$8,$Tred,$T2
  1152. vpxor $Xlo,$Zlo,$Zlo
  1153. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1154. vpsrldq \$8,$Tred,$Tred
  1155. vpxor $T2, $Xi, $Xi
  1156. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1157. vpshufb $bswap,$Ii,$Ii
  1158. vxorps $Tred,$Xo, $Xo
  1159. vpxor $Xhi,$Zhi,$Zhi
  1160. vpunpckhqdq $Ii,$Ii,$T2
  1161. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1162. vmovdqu 0x50-0x40($Htbl),$HK
  1163. vpxor $Ii,$T2,$T2
  1164. vpxor $Xmi,$Zmi,$Zmi
  1165. vmovdqu 0x40($inp),$Ij # I[4]
  1166. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1167. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1168. vpshufb $bswap,$Ij,$Ij
  1169. vpxor $Zlo,$Xlo,$Xlo
  1170. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1171. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1172. vpunpckhqdq $Ij,$Ij,$T1
  1173. vpxor $Zhi,$Xhi,$Xhi
  1174. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1175. vxorps $Ij,$T1,$T1
  1176. vpxor $Zmi,$Xmi,$Xmi
  1177. vmovdqu 0x30($inp),$Ii # I[3]
  1178. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1179. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1180. vpshufb $bswap,$Ii,$Ii
  1181. vpxor $Xlo,$Zlo,$Zlo
  1182. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1183. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1184. vpunpckhqdq $Ii,$Ii,$T2
  1185. vpxor $Xhi,$Zhi,$Zhi
  1186. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1187. vmovdqu 0x80-0x40($Htbl),$HK
  1188. vpxor $Ii,$T2,$T2
  1189. vpxor $Xmi,$Zmi,$Zmi
  1190. vmovdqu 0x20($inp),$Ij # I[2]
  1191. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1192. vpshufb $bswap,$Ij,$Ij
  1193. vpxor $Zlo,$Xlo,$Xlo
  1194. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1195. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1196. vpunpckhqdq $Ij,$Ij,$T1
  1197. vpxor $Zhi,$Xhi,$Xhi
  1198. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1199. vpxor $Ij,$T1,$T1
  1200. vpxor $Zmi,$Xmi,$Xmi
  1201. vxorps $Tred,$Xi,$Xi
  1202. vmovdqu 0x10($inp),$Ii # I[1]
  1203. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1204. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1205. vpshufb $bswap,$Ii,$Ii
  1206. vpxor $Xlo,$Zlo,$Zlo
  1207. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1208. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1209. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1210. vxorps $Xo,$Tred,$Tred
  1211. vpunpckhqdq $Ii,$Ii,$T2
  1212. vpxor $Xhi,$Zhi,$Zhi
  1213. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1214. vmovdqu 0xb0-0x40($Htbl),$HK
  1215. vpxor $Ii,$T2,$T2
  1216. vpxor $Xmi,$Zmi,$Zmi
  1217. vmovdqu ($inp),$Ij # I[0]
  1218. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1219. vpshufb $bswap,$Ij,$Ij
  1220. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1221. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1222. vpxor $Tred,$Ij,$Ij
  1223. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1224. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1225. lea 0x80($inp),$inp
  1226. sub \$0x80,$len
  1227. jnc .Loop8x_avx
  1228. add \$0x80,$len
  1229. jmp .Ltail_no_xor_avx
  1230. .align 32
  1231. .Lshort_avx:
  1232. vmovdqu -0x10($inp,$len),$Ii # very last word
  1233. lea ($inp,$len),$inp
  1234. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1235. vmovdqu 0x20-0x40($Htbl),$HK
  1236. vpshufb $bswap,$Ii,$Ij
  1237. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1238. vmovdqa $Xhi,$Zhi # $Zhi and
  1239. vmovdqa $Xmi,$Zmi # $Zmi
  1240. sub \$0x10,$len
  1241. jz .Ltail_avx
  1242. vpunpckhqdq $Ij,$Ij,$T1
  1243. vpxor $Xlo,$Zlo,$Zlo
  1244. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1245. vpxor $Ij,$T1,$T1
  1246. vmovdqu -0x20($inp),$Ii
  1247. vpxor $Xhi,$Zhi,$Zhi
  1248. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1249. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1250. vpshufb $bswap,$Ii,$Ij
  1251. vpxor $Xmi,$Zmi,$Zmi
  1252. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1253. vpsrldq \$8,$HK,$HK
  1254. sub \$0x10,$len
  1255. jz .Ltail_avx
  1256. vpunpckhqdq $Ij,$Ij,$T1
  1257. vpxor $Xlo,$Zlo,$Zlo
  1258. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1259. vpxor $Ij,$T1,$T1
  1260. vmovdqu -0x30($inp),$Ii
  1261. vpxor $Xhi,$Zhi,$Zhi
  1262. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1263. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1264. vpshufb $bswap,$Ii,$Ij
  1265. vpxor $Xmi,$Zmi,$Zmi
  1266. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1267. vmovdqu 0x50-0x40($Htbl),$HK
  1268. sub \$0x10,$len
  1269. jz .Ltail_avx
  1270. vpunpckhqdq $Ij,$Ij,$T1
  1271. vpxor $Xlo,$Zlo,$Zlo
  1272. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1273. vpxor $Ij,$T1,$T1
  1274. vmovdqu -0x40($inp),$Ii
  1275. vpxor $Xhi,$Zhi,$Zhi
  1276. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1277. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1278. vpshufb $bswap,$Ii,$Ij
  1279. vpxor $Xmi,$Zmi,$Zmi
  1280. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1281. vpsrldq \$8,$HK,$HK
  1282. sub \$0x10,$len
  1283. jz .Ltail_avx
  1284. vpunpckhqdq $Ij,$Ij,$T1
  1285. vpxor $Xlo,$Zlo,$Zlo
  1286. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1287. vpxor $Ij,$T1,$T1
  1288. vmovdqu -0x50($inp),$Ii
  1289. vpxor $Xhi,$Zhi,$Zhi
  1290. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1291. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1292. vpshufb $bswap,$Ii,$Ij
  1293. vpxor $Xmi,$Zmi,$Zmi
  1294. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1295. vmovdqu 0x80-0x40($Htbl),$HK
  1296. sub \$0x10,$len
  1297. jz .Ltail_avx
  1298. vpunpckhqdq $Ij,$Ij,$T1
  1299. vpxor $Xlo,$Zlo,$Zlo
  1300. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1301. vpxor $Ij,$T1,$T1
  1302. vmovdqu -0x60($inp),$Ii
  1303. vpxor $Xhi,$Zhi,$Zhi
  1304. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1305. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1306. vpshufb $bswap,$Ii,$Ij
  1307. vpxor $Xmi,$Zmi,$Zmi
  1308. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1309. vpsrldq \$8,$HK,$HK
  1310. sub \$0x10,$len
  1311. jz .Ltail_avx
  1312. vpunpckhqdq $Ij,$Ij,$T1
  1313. vpxor $Xlo,$Zlo,$Zlo
  1314. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1315. vpxor $Ij,$T1,$T1
  1316. vmovdqu -0x70($inp),$Ii
  1317. vpxor $Xhi,$Zhi,$Zhi
  1318. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1319. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1320. vpshufb $bswap,$Ii,$Ij
  1321. vpxor $Xmi,$Zmi,$Zmi
  1322. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1323. vmovq 0xb8-0x40($Htbl),$HK
  1324. sub \$0x10,$len
  1325. jmp .Ltail_avx
  1326. .align 32
  1327. .Ltail_avx:
  1328. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1329. .Ltail_no_xor_avx:
  1330. vpunpckhqdq $Ij,$Ij,$T1
  1331. vpxor $Xlo,$Zlo,$Zlo
  1332. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1333. vpxor $Ij,$T1,$T1
  1334. vpxor $Xhi,$Zhi,$Zhi
  1335. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1336. vpxor $Xmi,$Zmi,$Zmi
  1337. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1338. vmovdqu (%r10),$Tred
  1339. vpxor $Xlo,$Zlo,$Xi
  1340. vpxor $Xhi,$Zhi,$Xo
  1341. vpxor $Xmi,$Zmi,$Zmi
  1342. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1343. vpxor $Xo, $Zmi,$Zmi
  1344. vpslldq \$8, $Zmi,$T2
  1345. vpsrldq \$8, $Zmi,$Zmi
  1346. vpxor $T2, $Xi, $Xi
  1347. vpxor $Zmi,$Xo, $Xo
  1348. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1349. vpalignr \$8,$Xi,$Xi,$Xi
  1350. vpxor $T2,$Xi,$Xi
  1351. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1352. vpalignr \$8,$Xi,$Xi,$Xi
  1353. vpxor $Xo,$Xi,$Xi
  1354. vpxor $T2,$Xi,$Xi
  1355. cmp \$0,$len
  1356. jne .Lshort_avx
  1357. vpshufb $bswap,$Xi,$Xi
  1358. vmovdqu $Xi,($Xip)
  1359. vzeroupper
  1360. ___
  1361. $code.=<<___ if ($win64);
  1362. movaps (%rsp),%xmm6
  1363. movaps 0x10(%rsp),%xmm7
  1364. movaps 0x20(%rsp),%xmm8
  1365. movaps 0x30(%rsp),%xmm9
  1366. movaps 0x40(%rsp),%xmm10
  1367. movaps 0x50(%rsp),%xmm11
  1368. movaps 0x60(%rsp),%xmm12
  1369. movaps 0x70(%rsp),%xmm13
  1370. movaps 0x80(%rsp),%xmm14
  1371. movaps 0x90(%rsp),%xmm15
  1372. lea 0xa8(%rsp),%rsp
  1373. .LSEH_end_gcm_ghash_avx:
  1374. ___
  1375. $code.=<<___;
  1376. ret
  1377. .size gcm_ghash_avx,.-gcm_ghash_avx
  1378. ___
  1379. } else {
  1380. $code.=<<___;
  1381. jmp .L_ghash_clmul
  1382. .size gcm_ghash_avx,.-gcm_ghash_avx
  1383. ___
  1384. }
  1385. $code.=<<___;
  1386. .align 64
  1387. .Lbswap_mask:
  1388. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1389. .L0x1c2_polynomial:
  1390. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1391. .L7_mask:
  1392. .long 7,0,7,0
  1393. .L7_mask_poly:
  1394. .long 7,0,`0xE1<<1`,0
  1395. .align 64
  1396. .type .Lrem_4bit,\@object
  1397. .Lrem_4bit:
  1398. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1399. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1400. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1401. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1402. .type .Lrem_8bit,\@object
  1403. .Lrem_8bit:
  1404. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1405. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1406. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1407. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1408. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1409. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1410. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1411. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1412. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1413. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1414. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1415. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1416. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1417. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1418. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1419. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1420. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1421. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1422. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1423. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1424. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1425. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1426. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1427. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1428. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1429. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1430. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1431. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1432. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1433. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1434. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1435. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1436. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1437. .align 64
  1438. ___
  1439. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1440. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1441. if ($win64) {
  1442. $rec="%rcx";
  1443. $frame="%rdx";
  1444. $context="%r8";
  1445. $disp="%r9";
  1446. $code.=<<___;
  1447. .extern __imp_RtlVirtualUnwind
  1448. .type se_handler,\@abi-omnipotent
  1449. .align 16
  1450. se_handler:
  1451. push %rsi
  1452. push %rdi
  1453. push %rbx
  1454. push %rbp
  1455. push %r12
  1456. push %r13
  1457. push %r14
  1458. push %r15
  1459. pushfq
  1460. sub \$64,%rsp
  1461. mov 120($context),%rax # pull context->Rax
  1462. mov 248($context),%rbx # pull context->Rip
  1463. mov 8($disp),%rsi # disp->ImageBase
  1464. mov 56($disp),%r11 # disp->HandlerData
  1465. mov 0(%r11),%r10d # HandlerData[0]
  1466. lea (%rsi,%r10),%r10 # prologue label
  1467. cmp %r10,%rbx # context->Rip<prologue label
  1468. jb .Lin_prologue
  1469. mov 152($context),%rax # pull context->Rsp
  1470. mov 4(%r11),%r10d # HandlerData[1]
  1471. lea (%rsi,%r10),%r10 # epilogue label
  1472. cmp %r10,%rbx # context->Rip>=epilogue label
  1473. jae .Lin_prologue
  1474. lea 24(%rax),%rax # adjust "rsp"
  1475. mov -8(%rax),%rbx
  1476. mov -16(%rax),%rbp
  1477. mov -24(%rax),%r12
  1478. mov %rbx,144($context) # restore context->Rbx
  1479. mov %rbp,160($context) # restore context->Rbp
  1480. mov %r12,216($context) # restore context->R12
  1481. .Lin_prologue:
  1482. mov 8(%rax),%rdi
  1483. mov 16(%rax),%rsi
  1484. mov %rax,152($context) # restore context->Rsp
  1485. mov %rsi,168($context) # restore context->Rsi
  1486. mov %rdi,176($context) # restore context->Rdi
  1487. mov 40($disp),%rdi # disp->ContextRecord
  1488. mov $context,%rsi # context
  1489. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1490. .long 0xa548f3fc # cld; rep movsq
  1491. mov $disp,%rsi
  1492. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1493. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1494. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1495. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1496. mov 40(%rsi),%r10 # disp->ContextRecord
  1497. lea 56(%rsi),%r11 # &disp->HandlerData
  1498. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1499. mov %r10,32(%rsp) # arg5
  1500. mov %r11,40(%rsp) # arg6
  1501. mov %r12,48(%rsp) # arg7
  1502. mov %rcx,56(%rsp) # arg8, (NULL)
  1503. call *__imp_RtlVirtualUnwind(%rip)
  1504. mov \$1,%eax # ExceptionContinueSearch
  1505. add \$64,%rsp
  1506. popfq
  1507. pop %r15
  1508. pop %r14
  1509. pop %r13
  1510. pop %r12
  1511. pop %rbp
  1512. pop %rbx
  1513. pop %rdi
  1514. pop %rsi
  1515. ret
  1516. .size se_handler,.-se_handler
  1517. .section .pdata
  1518. .align 4
  1519. .rva .LSEH_begin_gcm_gmult_4bit
  1520. .rva .LSEH_end_gcm_gmult_4bit
  1521. .rva .LSEH_info_gcm_gmult_4bit
  1522. .rva .LSEH_begin_gcm_ghash_4bit
  1523. .rva .LSEH_end_gcm_ghash_4bit
  1524. .rva .LSEH_info_gcm_ghash_4bit
  1525. .rva .LSEH_begin_gcm_init_clmul
  1526. .rva .LSEH_end_gcm_init_clmul
  1527. .rva .LSEH_info_gcm_init_clmul
  1528. .rva .LSEH_begin_gcm_ghash_clmul
  1529. .rva .LSEH_end_gcm_ghash_clmul
  1530. .rva .LSEH_info_gcm_ghash_clmul
  1531. ___
  1532. $code.=<<___ if ($avx);
  1533. .rva .LSEH_begin_gcm_init_avx
  1534. .rva .LSEH_end_gcm_init_avx
  1535. .rva .LSEH_info_gcm_init_clmul
  1536. .rva .LSEH_begin_gcm_ghash_avx
  1537. .rva .LSEH_end_gcm_ghash_avx
  1538. .rva .LSEH_info_gcm_ghash_clmul
  1539. ___
  1540. $code.=<<___;
  1541. .section .xdata
  1542. .align 8
  1543. .LSEH_info_gcm_gmult_4bit:
  1544. .byte 9,0,0,0
  1545. .rva se_handler
  1546. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1547. .LSEH_info_gcm_ghash_4bit:
  1548. .byte 9,0,0,0
  1549. .rva se_handler
  1550. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1551. .LSEH_info_gcm_init_clmul:
  1552. .byte 0x01,0x08,0x03,0x00
  1553. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1554. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1555. .LSEH_info_gcm_ghash_clmul:
  1556. .byte 0x01,0x33,0x16,0x00
  1557. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1558. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1559. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1560. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1561. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1562. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1563. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1564. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1565. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1566. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1567. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1568. ___
  1569. }
  1570. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1571. print $code;
  1572. close STDOUT;