#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
# function features so called "528B" variant utilizing additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#              gcc 3.4.x(*)    assembler
#
# P4              28.6            14.0        +100%
# Opteron         19.3             7.7        +150%
# Core2           17.8             8.1(**)    +120%
# Atom            31.6            16.8        +88%
# VIA Nano        21.8            10.1        +115%
#
# (*)  comparison is not completely fair, because C results are
#      for vanilla "256B" implementation, while assembler results
#      are for "528B";-)
# (**) it's a mystery [to me] why the Core2 result is not the same
#      as for Opteron;
#
# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
#
# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses that it makes lesser sense to
# increase aggregate factor. Then why increase here? Critical path
# consists of 3 independent pclmulqdq instructions, Karatsuba post-
# processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# issue rate for pclmulqdq is limited, it makes lesser sense to
# aggregate more multiplications than it takes to perform remaining
# non-multiplication operations. 2x is near-optimal coefficient for
# contemporary Intel CPUs (therefore modest improvement coefficient),
# but not for Bulldozer. The latter is because logical SIMD operations
# are twice as slow in comparison to Intel, so that critical path is
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from a higher aggregate factor...
#
# Westmere      1.78(+13%)
# Sandy Bridge  1.80(+8%)
# Ivy Bridge    1.80(+7%)
# Haswell       0.55(+93%) (if system doesn't support AVX)
# Broadwell     0.45(+110%)(if system doesn't support AVX)
# Bulldozer     1.49(+27%)
# Silvermont    2.88(+13%)
#
# March 2013
#
# ... 8x aggregate factor AVX code path is using reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
# it performs in 0.41 cycles per byte on Haswell processor, and in
# 0.29 on Broadwell.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
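#
# For reference, every code path below computes the same thing: Xi is
# replaced by (Xi^Ii)*H in GF(2^128) with GCM's bit order, reduced modulo
# x^128+x^7+x^2+x+1. The bit-at-a-time sketch below is purely illustrative;
# gf128_mul_ref is a hypothetical helper that assumes a 64-bit perl and
# halves taken as the big-endian view of each 16-byte block, is never
# called from this script, and has no effect on the generated assembly.

sub gf128_mul_ref {
    my ($xh,$xl,$yh,$yl) = @_;          # two field elements as 64-bit halves
    my ($zh,$zl) = (0,0);
    my ($vh,$vl) = ($yh,$yl);
    for my $i (0..127) {
        # Z ^= V whenever the i-th bit of X (leftmost bit first) is set
        my $bit = $i<64 ? ($xh>>(63-$i))&1 : ($xl>>(127-$i))&1;
        if ($bit) { $zh ^= $vh; $zl ^= $vl; }
        # V *= x, i.e. shift right in this bit order, reduce by 0xE1||0^120
        my $carry = $vl & 1;
        $vl = ($vl>>1) | (($vh&1)<<63);
        $vh >>= 1;
        $vh ^= 0xe100000000000000 if ($carry);
    }
    return ($zh,$zl);
}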
  78. $flavour = shift;
  79. $output = shift;
  80. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  81. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  82. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  83. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  84. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  85. die "can't locate x86_64-xlate.pl";
  86. # In upstream, this is controlled by shelling out to the compiler to check
  87. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  88. # output, so this isn't useful anyway.
  89. #
  90. # TODO(davidben): Enable this after testing. $avx goes up to 2.
  91. $avx = 0;
  92. open OUT,"| \"$^X\" $xlate $flavour $output";
  93. *STDOUT=*OUT;
  94. $do4xaggr=1;
  95. # common register layout
  96. $nlo="%rax";
  97. $nhi="%rbx";
  98. $Zlo="%r8";
  99. $Zhi="%r9";
  100. $tmp="%r10";
  101. $rem_4bit = "%r11";
  102. $Xi="%rdi";
  103. $Htbl="%rsi";
  104. # per-function register layout
  105. $cnt="%rcx";
  106. $rem="%rdx";
  107. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  108. $r =~ s/%[er]([sd]i)/%\1l/ or
  109. $r =~ s/%[er](bp)/%\1l/ or
  110. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
  111. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  112. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  113. my $arg = pop;
  114. $arg = "\$$arg" if ($arg*1 eq $arg);
  115. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  116. }
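# For example, with the layout above &mov($Zlo,"8($Xi)") falls through to
# AUTOLOAD and appends "\tmov\t8(%rdi),%r8\n" to $code: the perlasm call
# names the destination first and AUTOLOAD emits the operands in AT&T
# (source,destination) order, while a purely numeric argument, as in
# &sub($Htbl,-128) further down, is given an immediate "$" prefix.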
  117. { my $N;
  118. sub loop() {
  119. my $inp = shift;
  120. $N++;
  121. $code.=<<___;
  122. xor $nlo,$nlo
  123. xor $nhi,$nhi
  124. mov `&LB("$Zlo")`,`&LB("$nlo")`
  125. mov `&LB("$Zlo")`,`&LB("$nhi")`
  126. shl \$4,`&LB("$nlo")`
  127. mov \$14,$cnt
  128. mov 8($Htbl,$nlo),$Zlo
  129. mov ($Htbl,$nlo),$Zhi
  130. and \$0xf0,`&LB("$nhi")`
  131. mov $Zlo,$rem
  132. jmp .Loop$N
  133. .align 16
  134. .Loop$N:
  135. shr \$4,$Zlo
  136. and \$0xf,$rem
  137. mov $Zhi,$tmp
  138. mov ($inp,$cnt),`&LB("$nlo")`
  139. shr \$4,$Zhi
  140. xor 8($Htbl,$nhi),$Zlo
  141. shl \$60,$tmp
  142. xor ($Htbl,$nhi),$Zhi
  143. mov `&LB("$nlo")`,`&LB("$nhi")`
  144. xor ($rem_4bit,$rem,8),$Zhi
  145. mov $Zlo,$rem
  146. shl \$4,`&LB("$nlo")`
  147. xor $tmp,$Zlo
  148. dec $cnt
  149. js .Lbreak$N
  150. shr \$4,$Zlo
  151. and \$0xf,$rem
  152. mov $Zhi,$tmp
  153. shr \$4,$Zhi
  154. xor 8($Htbl,$nlo),$Zlo
  155. shl \$60,$tmp
  156. xor ($Htbl,$nlo),$Zhi
  157. and \$0xf0,`&LB("$nhi")`
  158. xor ($rem_4bit,$rem,8),$Zhi
  159. mov $Zlo,$rem
  160. xor $tmp,$Zlo
  161. jmp .Loop$N
  162. .align 16
  163. .Lbreak$N:
  164. shr \$4,$Zlo
  165. and \$0xf,$rem
  166. mov $Zhi,$tmp
  167. shr \$4,$Zhi
  168. xor 8($Htbl,$nlo),$Zlo
  169. shl \$60,$tmp
  170. xor ($Htbl,$nlo),$Zhi
  171. and \$0xf0,`&LB("$nhi")`
  172. xor ($rem_4bit,$rem,8),$Zhi
  173. mov $Zlo,$rem
  174. xor $tmp,$Zlo
  175. shr \$4,$Zlo
  176. and \$0xf,$rem
  177. mov $Zhi,$tmp
  178. shr \$4,$Zhi
  179. xor 8($Htbl,$nhi),$Zlo
  180. shl \$60,$tmp
  181. xor ($Htbl,$nhi),$Zhi
  182. xor $tmp,$Zlo
  183. xor ($rem_4bit,$rem,8),$Zhi
  184. bswap $Zlo
  185. bswap $Zhi
  186. ___
  187. }}
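# In scalar terms the code emitted by &loop() walks Xi one 4-bit nibble at a
# time; roughly (illustration only, using the 256-byte per-key Htbl described
# in the header and the shared .Lrem_4bit table defined near the end of this
# file):
#
#	$rem = $Z & 0xf;			# bits about to fall off the low end
#	$Z   = ($Z >> 4) ^ $Htbl[$nibble];	# 16-byte entries, hence the "shl \$4"
#	$Zhi ^= $rem_4bit[$rem];		# fold the shifted-out bits back in
#
# so one pass over the 16 bytes of Xi (two nibbles each, counted down by $cnt)
# yields the reduced product, and the closing bswap pair puts the register
# pair back into GHASH byte order before it is stored.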
  188. $code=<<___;
  189. .text
  190. .extern OPENSSL_ia32cap_P
  191. .globl gcm_gmult_4bit
  192. .type gcm_gmult_4bit,\@function,2
  193. .align 16
  194. gcm_gmult_4bit:
  195. push %rbx
  196. push %rbp # %rbp and %r12 are pushed exclusively in
  197. push %r12 # order to reuse Win64 exception handler...
  198. .Lgmult_prologue:
  199. movzb 15($Xi),$Zlo
  200. lea .Lrem_4bit(%rip),$rem_4bit
  201. ___
  202. &loop ($Xi);
  203. $code.=<<___;
  204. mov $Zlo,8($Xi)
  205. mov $Zhi,($Xi)
  206. mov 16(%rsp),%rbx
  207. lea 24(%rsp),%rsp
  208. .Lgmult_epilogue:
  209. ret
  210. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  211. ___
  212. # per-function register layout
  213. $inp="%rdx";
  214. $len="%rcx";
  215. $rem_8bit=$rem_4bit;
  216. $code.=<<___;
  217. .globl gcm_ghash_4bit
  218. .type gcm_ghash_4bit,\@function,4
  219. .align 16
  220. gcm_ghash_4bit:
  221. push %rbx
  222. push %rbp
  223. push %r12
  224. push %r13
  225. push %r14
  226. push %r15
  227. sub \$280,%rsp
  228. .Lghash_prologue:
  229. mov $inp,%r14 # reassign couple of args
  230. mov $len,%r15
  231. ___
  232. { my $inp="%r14";
  233. my $dat="%edx";
  234. my $len="%r15";
  235. my @nhi=("%ebx","%ecx");
  236. my @rem=("%r12","%r13");
  237. my $Hshr4="%rbp";
  238. &sub ($Htbl,-128); # size optimization
  239. &lea ($Hshr4,"16+128(%rsp)");
  240. { my @lo =($nlo,$nhi);
  241. my @hi =($Zlo,$Zhi);
  242. &xor ($dat,$dat);
  243. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  244. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  245. &or ($lo[0],$tmp) if ($i>1);
  246. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  247. &shr ($lo[1],4) if ($i>0 && $i<17);
  248. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  249. &shr ($hi[1],4) if ($i>0 && $i<17);
  250. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  251. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  252. &shl (&LB($dat),4) if ($i>0 && $i<17);
  253. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  254. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  255. &shl ($tmp,60) if ($i>0 && $i<17);
  256. push (@lo,shift(@lo));
  257. push (@hi,shift(@hi));
  258. }
  259. }
  260. &add ($Htbl,-128);
  261. &mov ($Zlo,"8($Xi)");
  262. &mov ($Zhi,"0($Xi)");
  263. &add ($len,$inp); # pointer to the end of data
  264. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  265. &jmp (".Louter_loop");
  266. $code.=".align 16\n.Louter_loop:\n";
  267. &xor ($Zhi,"($inp)");
  268. &mov ("%rdx","8($inp)");
  269. &lea ($inp,"16($inp)");
  270. &xor ("%rdx",$Zlo);
  271. &mov ("($Xi)",$Zhi);
  272. &mov ("8($Xi)","%rdx");
  273. &shr ("%rdx",32);
  274. &xor ($nlo,$nlo);
  275. &rol ($dat,8);
  276. &mov (&LB($nlo),&LB($dat));
  277. &movz ($nhi[0],&LB($dat));
  278. &shl (&LB($nlo),4);
  279. &shr ($nhi[0],4);
  280. for ($j=11,$i=0;$i<15;$i++) {
  281. &rol ($dat,8);
  282. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  283. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  284. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  285. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  286. &mov (&LB($nlo),&LB($dat));
  287. &xor ($Zlo,$tmp) if ($i>0);
  288. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  289. &movz ($nhi[1],&LB($dat));
  290. &shl (&LB($nlo),4);
  291. &movzb ($rem[0],"(%rsp,$nhi[0])");
  292. &shr ($nhi[1],4) if ($i<14);
  293. &and ($nhi[1],0xf0) if ($i==14);
  294. &shl ($rem[1],48) if ($i>0);
  295. &xor ($rem[0],$Zlo);
  296. &mov ($tmp,$Zhi);
  297. &xor ($Zhi,$rem[1]) if ($i>0);
  298. &shr ($Zlo,8);
  299. &movz ($rem[0],&LB($rem[0]));
  300. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  301. &shr ($Zhi,8);
  302. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  303. &shl ($tmp,56);
  304. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  305. unshift (@nhi,pop(@nhi)); # "rotate" registers
  306. unshift (@rem,pop(@rem));
  307. }
  308. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  309. &xor ($Zlo,"8($Htbl,$nlo)");
  310. &xor ($Zhi,"($Htbl,$nlo)");
  311. &shl ($rem[1],48);
  312. &xor ($Zlo,$tmp);
  313. &xor ($Zhi,$rem[1]);
  314. &movz ($rem[0],&LB($Zlo));
  315. &shr ($Zlo,4);
  316. &mov ($tmp,$Zhi);
  317. &shl (&LB($rem[0]),4);
  318. &shr ($Zhi,4);
  319. &xor ($Zlo,"8($Htbl,$nhi[0])");
  320. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  321. &shl ($tmp,60);
  322. &xor ($Zhi,"($Htbl,$nhi[0])");
  323. &xor ($Zlo,$tmp);
  324. &shl ($rem[0],48);
  325. &bswap ($Zlo);
  326. &xor ($Zhi,$rem[0]);
  327. &bswap ($Zhi);
  328. &cmp ($inp,$len);
  329. &jb (".Louter_loop");
  330. }
  331. $code.=<<___;
  332. mov $Zlo,8($Xi)
  333. mov $Zhi,($Xi)
  334. lea 280(%rsp),%rsi
  335. mov 0(%rsi),%r15
  336. mov 8(%rsi),%r14
  337. mov 16(%rsi),%r13
  338. mov 24(%rsi),%r12
  339. mov 32(%rsi),%rbp
  340. mov 40(%rsi),%rbx
  341. lea 48(%rsi),%rsp
  342. .Lghash_epilogue:
  343. ret
  344. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  345. ___
  346. ######################################################################
  347. # PCLMULQDQ version.
  348. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  349. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  350. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  351. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  352. sub clmul64x64_T2 { # minimal register pressure
  353. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  354. if (!defined($HK)) { $HK = $T2;
  355. $code.=<<___;
  356. movdqa $Xi,$Xhi #
  357. pshufd \$0b01001110,$Xi,$T1
  358. pshufd \$0b01001110,$Hkey,$T2
  359. pxor $Xi,$T1 #
  360. pxor $Hkey,$T2
  361. ___
  362. } else {
  363. $code.=<<___;
  364. movdqa $Xi,$Xhi #
  365. pshufd \$0b01001110,$Xi,$T1
  366. pxor $Xi,$T1 #
  367. ___
  368. }
  369. $code.=<<___;
  370. pclmulqdq \$0x00,$Hkey,$Xi #######
  371. pclmulqdq \$0x11,$Hkey,$Xhi #######
  372. pclmulqdq \$0x00,$HK,$T1 #######
  373. pxor $Xi,$T1 #
  374. pxor $Xhi,$T1 #
  375. movdqa $T1,$T2 #
  376. psrldq \$8,$T1
  377. pslldq \$8,$T2 #
  378. pxor $T1,$Xhi
  379. pxor $T2,$Xi #
  380. ___
  381. }
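# The three pclmulqdq's in clmul64x64_T2 are the textbook Karatsuba split of a
# 128x128-bit carry-less multiplication into 64x64-bit halves: with X = Xh:Xl
# and H = Hh:Hl,
#
#	X*H = (Xh*Hh)<<128 ^ ((Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl)<<64 ^ (Xl*Hl)
#
# $Xi and $Xhi hold the low and high products, $T1 the middle term, and the
# closing psrldq/pslldq pair splices that middle term across the 128-bit
# boundary before reduction_alg9 below folds the 256-bit result back down.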
  382. sub reduction_alg9 { # 17/11 times faster than Intel version
  383. my ($Xhi,$Xi) = @_;
  384. $code.=<<___;
  385. # 1st phase
  386. movdqa $Xi,$T2 #
  387. movdqa $Xi,$T1
  388. psllq \$5,$Xi
  389. pxor $Xi,$T1 #
  390. psllq \$1,$Xi
  391. pxor $T1,$Xi #
  392. psllq \$57,$Xi #
  393. movdqa $Xi,$T1 #
  394. pslldq \$8,$Xi
  395. psrldq \$8,$T1 #
  396. pxor $T2,$Xi
  397. pxor $T1,$Xhi #
  398. # 2nd phase
  399. movdqa $Xi,$T2
  400. psrlq \$1,$Xi
  401. pxor $T2,$Xhi #
  402. pxor $Xi,$T2
  403. psrlq \$5,$Xi
  404. pxor $T2,$Xi #
  405. psrlq \$1,$Xi #
  406. pxor $Xhi,$Xi #
  407. ___
  408. }
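# reduction_alg9 folds that 256-bit value $Xhi:$Xi back into 128 bits modulo
# GCM's polynomial x^128+x^7+x^2+x+1 (in the bit-reflected convention this
# module works in) using only shifts and xors, so no pclmulqdq is spent on
# reduction; the two "phases" above are the usual division-free substitute
# for multiplying by the polynomial's low terms x^7+x^2+x+1.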
  409. { my ($Htbl,$Xip)=@_4args;
  410. my $HK="%xmm6";
  411. $code.=<<___;
  412. .globl gcm_init_clmul
  413. .type gcm_init_clmul,\@abi-omnipotent
  414. .align 16
  415. gcm_init_clmul:
  416. .L_init_clmul:
  417. ___
  418. $code.=<<___ if ($win64);
  419. .LSEH_begin_gcm_init_clmul:
  420. # I can't trust assembler to use specific encoding:-(
  421. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  422. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  423. ___
  424. $code.=<<___;
  425. movdqu ($Xip),$Hkey
  426. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  427. # <<1 twist
  428. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  429. movdqa $Hkey,$T1
  430. psllq \$1,$Hkey
  431. pxor $T3,$T3 #
  432. psrlq \$63,$T1
  433. pcmpgtd $T2,$T3 # broadcast carry bit
  434. pslldq \$8,$T1
  435. por $T1,$Hkey # H<<=1
  436. # magic reduction
  437. pand .L0x1c2_polynomial(%rip),$T3
  438. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  439. # calculate H^2
  440. pshufd \$0b01001110,$Hkey,$HK
  441. movdqa $Hkey,$Xi
  442. pxor $Hkey,$HK
  443. ___
  444. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  445. &reduction_alg9 ($Xhi,$Xi);
  446. $code.=<<___;
  447. pshufd \$0b01001110,$Hkey,$T1
  448. pshufd \$0b01001110,$Xi,$T2
  449. pxor $Hkey,$T1 # Karatsuba pre-processing
  450. movdqu $Hkey,0x00($Htbl) # save H
  451. pxor $Xi,$T2 # Karatsuba pre-processing
  452. movdqu $Xi,0x10($Htbl) # save H^2
  453. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  454. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  455. ___
  456. if ($do4xaggr) {
  457. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  458. &reduction_alg9 ($Xhi,$Xi);
  459. $code.=<<___;
  460. movdqa $Xi,$T3
  461. ___
  462. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  463. &reduction_alg9 ($Xhi,$Xi);
  464. $code.=<<___;
  465. pshufd \$0b01001110,$T3,$T1
  466. pshufd \$0b01001110,$Xi,$T2
  467. pxor $T3,$T1 # Karatsuba pre-processing
  468. movdqu $T3,0x30($Htbl) # save H^3
  469. pxor $Xi,$T2 # Karatsuba pre-processing
  470. movdqu $Xi,0x40($Htbl) # save H^4
  471. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  472. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  473. ___
  474. }
  475. $code.=<<___ if ($win64);
  476. movaps (%rsp),%xmm6
  477. lea 0x18(%rsp),%rsp
  478. .LSEH_end_gcm_init_clmul:
  479. ___
  480. $code.=<<___;
  481. ret
  482. .size gcm_init_clmul,.-gcm_init_clmul
  483. ___
  484. }
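# A note on gcm_init_clmul above: the raw hash key has its quadwords swapped
# and is then shifted left by one bit (the "<<1 twist"); if a bit falls out
# of the top, the key is reduced via .L0x1c2_polynomial, which is simply
# GCM's reduction constant 0xE1||0^120 shifted left by one with the
# carried-out bit wrapped into the low byte. H, H^2 (and with $do4xaggr also
# H^3 and H^4) are stored already twisted, together with their Karatsuba
# "salt" (H.lo^H.hi), so the per-block loops below can consume them directly.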
  485. { my ($Xip,$Htbl)=@_4args;
  486. $code.=<<___;
  487. .globl gcm_gmult_clmul
  488. .type gcm_gmult_clmul,\@abi-omnipotent
  489. .align 16
  490. gcm_gmult_clmul:
  491. .L_gmult_clmul:
  492. movdqu ($Xip),$Xi
  493. movdqa .Lbswap_mask(%rip),$T3
  494. movdqu ($Htbl),$Hkey
  495. movdqu 0x20($Htbl),$T2
  496. pshufb $T3,$Xi
  497. ___
  498. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  499. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  500. # experimental alternative. the special thing about it is that there is
  501. # no dependency between the two multiplications...
  502. mov \$`0xE1<<1`,%eax
  503. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  504. mov \$0x07,%r11d
  505. movq %rax,$T1
  506. movq %r10,$T2
  507. movq %r11,$T3 # borrow $T3
  508. pand $Xi,$T3
  509. pshufb $T3,$T2 # ($Xi&7)·0xE0
  510. movq %rax,$T3
  511. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  512. pxor $Xi,$T2
  513. pslldq \$15,$T2
  514. paddd $T2,$T2 # <<(64+56+1)
  515. pxor $T2,$Xi
  516. pclmulqdq \$0x01,$T3,$Xi
  517. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  518. psrldq \$1,$T1
  519. pxor $T1,$Xhi
  520. pslldq \$7,$Xi
  521. pxor $Xhi,$Xi
  522. ___
  523. $code.=<<___;
  524. pshufb $T3,$Xi
  525. movdqu $Xi,($Xip)
  526. ret
  527. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  528. ___
  529. }
  530. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  531. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  532. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  533. $code.=<<___;
  534. .globl gcm_ghash_clmul
  535. .type gcm_ghash_clmul,\@abi-omnipotent
  536. .align 32
  537. gcm_ghash_clmul:
  538. .L_ghash_clmul:
  539. ___
  540. $code.=<<___ if ($win64);
  541. lea -0x88(%rsp),%rax
  542. .LSEH_begin_gcm_ghash_clmul:
  543. # I can't trust assembler to use specific encoding:-(
  544. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  545. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  546. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  547. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  548. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  549. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  550. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  551. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  552. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  553. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  554. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  555. ___
  556. $code.=<<___;
  557. movdqa .Lbswap_mask(%rip),$T3
  558. movdqu ($Xip),$Xi
  559. movdqu ($Htbl),$Hkey
  560. movdqu 0x20($Htbl),$HK
  561. pshufb $T3,$Xi
  562. sub \$0x10,$len
  563. jz .Lodd_tail
  564. movdqu 0x10($Htbl),$Hkey2
  565. ___
  566. if ($do4xaggr) {
  567. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  568. $code.=<<___;
  569. mov OPENSSL_ia32cap_P+4(%rip),%eax
  570. cmp \$0x30,$len
  571. jb .Lskip4x
  572. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  573. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  574. je .Lskip4x
  575. sub \$0x30,$len
  576. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  577. movdqu 0x30($Htbl),$Hkey3
  578. movdqu 0x40($Htbl),$Hkey4
  579. #######
  580. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  581. #
  582. movdqu 0x30($inp),$Xln
  583. movdqu 0x20($inp),$Xl
  584. pshufb $T3,$Xln
  585. pshufb $T3,$Xl
  586. movdqa $Xln,$Xhn
  587. pshufd \$0b01001110,$Xln,$Xmn
  588. pxor $Xln,$Xmn
  589. pclmulqdq \$0x00,$Hkey,$Xln
  590. pclmulqdq \$0x11,$Hkey,$Xhn
  591. pclmulqdq \$0x00,$HK,$Xmn
  592. movdqa $Xl,$Xh
  593. pshufd \$0b01001110,$Xl,$Xm
  594. pxor $Xl,$Xm
  595. pclmulqdq \$0x00,$Hkey2,$Xl
  596. pclmulqdq \$0x11,$Hkey2,$Xh
  597. pclmulqdq \$0x10,$HK,$Xm
  598. xorps $Xl,$Xln
  599. xorps $Xh,$Xhn
  600. movups 0x50($Htbl),$HK
  601. xorps $Xm,$Xmn
  602. movdqu 0x10($inp),$Xl
  603. movdqu 0($inp),$T1
  604. pshufb $T3,$Xl
  605. pshufb $T3,$T1
  606. movdqa $Xl,$Xh
  607. pshufd \$0b01001110,$Xl,$Xm
  608. pxor $T1,$Xi
  609. pxor $Xl,$Xm
  610. pclmulqdq \$0x00,$Hkey3,$Xl
  611. movdqa $Xi,$Xhi
  612. pshufd \$0b01001110,$Xi,$T1
  613. pxor $Xi,$T1
  614. pclmulqdq \$0x11,$Hkey3,$Xh
  615. pclmulqdq \$0x00,$HK,$Xm
  616. xorps $Xl,$Xln
  617. xorps $Xh,$Xhn
  618. lea 0x40($inp),$inp
  619. sub \$0x40,$len
  620. jc .Ltail4x
  621. jmp .Lmod4_loop
  622. .align 32
  623. .Lmod4_loop:
  624. pclmulqdq \$0x00,$Hkey4,$Xi
  625. xorps $Xm,$Xmn
  626. movdqu 0x30($inp),$Xl
  627. pshufb $T3,$Xl
  628. pclmulqdq \$0x11,$Hkey4,$Xhi
  629. xorps $Xln,$Xi
  630. movdqu 0x20($inp),$Xln
  631. movdqa $Xl,$Xh
  632. pclmulqdq \$0x10,$HK,$T1
  633. pshufd \$0b01001110,$Xl,$Xm
  634. xorps $Xhn,$Xhi
  635. pxor $Xl,$Xm
  636. pshufb $T3,$Xln
  637. movups 0x20($Htbl),$HK
  638. xorps $Xmn,$T1
  639. pclmulqdq \$0x00,$Hkey,$Xl
  640. pshufd \$0b01001110,$Xln,$Xmn
  641. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  642. movdqa $Xln,$Xhn
  643. pxor $Xhi,$T1 #
  644. pxor $Xln,$Xmn
  645. movdqa $T1,$T2 #
  646. pclmulqdq \$0x11,$Hkey,$Xh
  647. pslldq \$8,$T1
  648. psrldq \$8,$T2 #
  649. pxor $T1,$Xi
  650. movdqa .L7_mask(%rip),$T1
  651. pxor $T2,$Xhi #
  652. movq %rax,$T2
  653. pand $Xi,$T1 # 1st phase
  654. pshufb $T1,$T2 #
  655. pxor $Xi,$T2 #
  656. pclmulqdq \$0x00,$HK,$Xm
  657. psllq \$57,$T2 #
  658. movdqa $T2,$T1 #
  659. pslldq \$8,$T2
  660. pclmulqdq \$0x00,$Hkey2,$Xln
  661. psrldq \$8,$T1 #
  662. pxor $T2,$Xi
  663. pxor $T1,$Xhi #
  664. movdqu 0($inp),$T1
  665. movdqa $Xi,$T2 # 2nd phase
  666. psrlq \$1,$Xi
  667. pclmulqdq \$0x11,$Hkey2,$Xhn
  668. xorps $Xl,$Xln
  669. movdqu 0x10($inp),$Xl
  670. pshufb $T3,$Xl
  671. pclmulqdq \$0x10,$HK,$Xmn
  672. xorps $Xh,$Xhn
  673. movups 0x50($Htbl),$HK
  674. pshufb $T3,$T1
  675. pxor $T2,$Xhi #
  676. pxor $Xi,$T2
  677. psrlq \$5,$Xi
  678. movdqa $Xl,$Xh
  679. pxor $Xm,$Xmn
  680. pshufd \$0b01001110,$Xl,$Xm
  681. pxor $T2,$Xi #
  682. pxor $T1,$Xhi
  683. pxor $Xl,$Xm
  684. pclmulqdq \$0x00,$Hkey3,$Xl
  685. psrlq \$1,$Xi #
  686. pxor $Xhi,$Xi #
  687. movdqa $Xi,$Xhi
  688. pclmulqdq \$0x11,$Hkey3,$Xh
  689. xorps $Xl,$Xln
  690. pshufd \$0b01001110,$Xi,$T1
  691. pxor $Xi,$T1
  692. pclmulqdq \$0x00,$HK,$Xm
  693. xorps $Xh,$Xhn
  694. lea 0x40($inp),$inp
  695. sub \$0x40,$len
  696. jnc .Lmod4_loop
  697. .Ltail4x:
  698. pclmulqdq \$0x00,$Hkey4,$Xi
  699. pclmulqdq \$0x11,$Hkey4,$Xhi
  700. pclmulqdq \$0x10,$HK,$T1
  701. xorps $Xm,$Xmn
  702. xorps $Xln,$Xi
  703. xorps $Xhn,$Xhi
  704. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  705. pxor $Xmn,$T1
  706. pxor $Xhi,$T1 #
  707. pxor $Xi,$Xhi
  708. movdqa $T1,$T2 #
  709. psrldq \$8,$T1
  710. pslldq \$8,$T2 #
  711. pxor $T1,$Xhi
  712. pxor $T2,$Xi #
  713. ___
  714. &reduction_alg9($Xhi,$Xi);
  715. $code.=<<___;
  716. add \$0x40,$len
  717. jz .Ldone
  718. movdqu 0x20($Htbl),$HK
  719. sub \$0x10,$len
  720. jz .Lodd_tail
  721. .Lskip4x:
  722. ___
  723. }
  724. $code.=<<___;
  725. #######
  726. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  727. # [(H*Ii+1) + (H*Xi+1)] mod P =
  728. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  729. #
  730. movdqu ($inp),$T1 # Ii
  731. movdqu 16($inp),$Xln # Ii+1
  732. pshufb $T3,$T1
  733. pshufb $T3,$Xln
  734. pxor $T1,$Xi # Ii+Xi
  735. movdqa $Xln,$Xhn
  736. pshufd \$0b01001110,$Xln,$Xmn
  737. pxor $Xln,$Xmn
  738. pclmulqdq \$0x00,$Hkey,$Xln
  739. pclmulqdq \$0x11,$Hkey,$Xhn
  740. pclmulqdq \$0x00,$HK,$Xmn
  741. lea 32($inp),$inp # i+=2
  742. nop
  743. sub \$0x20,$len
  744. jbe .Leven_tail
  745. nop
  746. jmp .Lmod_loop
  747. .align 32
  748. .Lmod_loop:
  749. movdqa $Xi,$Xhi
  750. movdqa $Xmn,$T1
  751. pshufd \$0b01001110,$Xi,$Xmn #
  752. pxor $Xi,$Xmn #
  753. pclmulqdq \$0x00,$Hkey2,$Xi
  754. pclmulqdq \$0x11,$Hkey2,$Xhi
  755. pclmulqdq \$0x10,$HK,$Xmn
  756. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  757. pxor $Xhn,$Xhi
  758. movdqu ($inp),$T2 # Ii
  759. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  760. pshufb $T3,$T2
  761. movdqu 16($inp),$Xln # Ii+1
  762. pxor $Xhi,$T1
  763. pxor $T2,$Xhi # "Ii+Xi", consume early
  764. pxor $T1,$Xmn
  765. pshufb $T3,$Xln
  766. movdqa $Xmn,$T1 #
  767. psrldq \$8,$T1
  768. pslldq \$8,$Xmn #
  769. pxor $T1,$Xhi
  770. pxor $Xmn,$Xi #
  771. movdqa $Xln,$Xhn #
  772. movdqa $Xi,$T2 # 1st phase
  773. movdqa $Xi,$T1
  774. psllq \$5,$Xi
  775. pxor $Xi,$T1 #
  776. pclmulqdq \$0x00,$Hkey,$Xln #######
  777. psllq \$1,$Xi
  778. pxor $T1,$Xi #
  779. psllq \$57,$Xi #
  780. movdqa $Xi,$T1 #
  781. pslldq \$8,$Xi
  782. psrldq \$8,$T1 #
  783. pxor $T2,$Xi
  784. pshufd \$0b01001110,$Xhn,$Xmn
  785. pxor $T1,$Xhi #
  786. pxor $Xhn,$Xmn #
  787. movdqa $Xi,$T2 # 2nd phase
  788. psrlq \$1,$Xi
  789. pclmulqdq \$0x11,$Hkey,$Xhn #######
  790. pxor $T2,$Xhi #
  791. pxor $Xi,$T2
  792. psrlq \$5,$Xi
  793. pxor $T2,$Xi #
  794. lea 32($inp),$inp
  795. psrlq \$1,$Xi #
  796. pclmulqdq \$0x00,$HK,$Xmn #######
  797. pxor $Xhi,$Xi #
  798. sub \$0x20,$len
  799. ja .Lmod_loop
  800. .Leven_tail:
  801. movdqa $Xi,$Xhi
  802. movdqa $Xmn,$T1
  803. pshufd \$0b01001110,$Xi,$Xmn #
  804. pxor $Xi,$Xmn #
  805. pclmulqdq \$0x00,$Hkey2,$Xi
  806. pclmulqdq \$0x11,$Hkey2,$Xhi
  807. pclmulqdq \$0x10,$HK,$Xmn
  808. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  809. pxor $Xhn,$Xhi
  810. pxor $Xi,$T1
  811. pxor $Xhi,$T1
  812. pxor $T1,$Xmn
  813. movdqa $Xmn,$T1 #
  814. psrldq \$8,$T1
  815. pslldq \$8,$Xmn #
  816. pxor $T1,$Xhi
  817. pxor $Xmn,$Xi #
  818. ___
  819. &reduction_alg9 ($Xhi,$Xi);
  820. $code.=<<___;
  821. test $len,$len
  822. jnz .Ldone
  823. .Lodd_tail:
  824. movdqu ($inp),$T1 # Ii
  825. pshufb $T3,$T1
  826. pxor $T1,$Xi # Ii+Xi
  827. ___
  828. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  829. &reduction_alg9 ($Xhi,$Xi);
  830. $code.=<<___;
  831. .Ldone:
  832. pshufb $T3,$Xi
  833. movdqu $Xi,($Xip)
  834. ___
  835. $code.=<<___ if ($win64);
  836. movaps (%rsp),%xmm6
  837. movaps 0x10(%rsp),%xmm7
  838. movaps 0x20(%rsp),%xmm8
  839. movaps 0x30(%rsp),%xmm9
  840. movaps 0x40(%rsp),%xmm10
  841. movaps 0x50(%rsp),%xmm11
  842. movaps 0x60(%rsp),%xmm12
  843. movaps 0x70(%rsp),%xmm13
  844. movaps 0x80(%rsp),%xmm14
  845. movaps 0x90(%rsp),%xmm15
  846. lea 0xa8(%rsp),%rsp
  847. .LSEH_end_gcm_ghash_clmul:
  848. ___
  849. $code.=<<___;
  850. ret
  851. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  852. ___
  853. }
  854. $code.=<<___;
  855. .globl gcm_init_avx
  856. .type gcm_init_avx,\@abi-omnipotent
  857. .align 32
  858. gcm_init_avx:
  859. ___
  860. if ($avx) {
  861. my ($Htbl,$Xip)=@_4args;
  862. my $HK="%xmm6";
  863. $code.=<<___ if ($win64);
  864. .LSEH_begin_gcm_init_avx:
  865. # I can't trust assembler to use specific encoding:-(
  866. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  867. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  868. ___
  869. $code.=<<___;
  870. vzeroupper
  871. vmovdqu ($Xip),$Hkey
  872. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  873. # <<1 twist
  874. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  875. vpsrlq \$63,$Hkey,$T1
  876. vpsllq \$1,$Hkey,$Hkey
  877. vpxor $T3,$T3,$T3 #
  878. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  879. vpslldq \$8,$T1,$T1
  880. vpor $T1,$Hkey,$Hkey # H<<=1
  881. # magic reduction
  882. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  883. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  884. vpunpckhqdq $Hkey,$Hkey,$HK
  885. vmovdqa $Hkey,$Xi
  886. vpxor $Hkey,$HK,$HK
  887. mov \$4,%r10 # up to H^8
  888. jmp .Linit_start_avx
  889. ___
  890. sub clmul64x64_avx {
  891. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  892. if (!defined($HK)) { $HK = $T2;
  893. $code.=<<___;
  894. vpunpckhqdq $Xi,$Xi,$T1
  895. vpunpckhqdq $Hkey,$Hkey,$T2
  896. vpxor $Xi,$T1,$T1 #
  897. vpxor $Hkey,$T2,$T2
  898. ___
  899. } else {
  900. $code.=<<___;
  901. vpunpckhqdq $Xi,$Xi,$T1
  902. vpxor $Xi,$T1,$T1 #
  903. ___
  904. }
  905. $code.=<<___;
  906. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  907. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  908. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  909. vpxor $Xi,$Xhi,$T2 #
  910. vpxor $T2,$T1,$T1 #
  911. vpslldq \$8,$T1,$T2 #
  912. vpsrldq \$8,$T1,$T1
  913. vpxor $T2,$Xi,$Xi #
  914. vpxor $T1,$Xhi,$Xhi
  915. ___
  916. }
  917. sub reduction_avx {
  918. my ($Xhi,$Xi) = @_;
  919. $code.=<<___;
  920. vpsllq \$57,$Xi,$T1 # 1st phase
  921. vpsllq \$62,$Xi,$T2
  922. vpxor $T1,$T2,$T2 #
  923. vpsllq \$63,$Xi,$T1
  924. vpxor $T1,$T2,$T2 #
  925. vpslldq \$8,$T2,$T1 #
  926. vpsrldq \$8,$T2,$T2
  927. vpxor $T1,$Xi,$Xi #
  928. vpxor $T2,$Xhi,$Xhi
  929. vpsrlq \$1,$Xi,$T2 # 2nd phase
  930. vpxor $Xi,$Xhi,$Xhi
  931. vpxor $T2,$Xi,$Xi #
  932. vpsrlq \$5,$T2,$T2
  933. vpxor $T2,$Xi,$Xi #
  934. vpsrlq \$1,$Xi,$Xi #
  935. vpxor $Xhi,$Xi,$Xi #
  936. ___
  937. }
  938. $code.=<<___;
  939. .align 32
  940. .Linit_loop_avx:
  941. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  942. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  943. ___
  944. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  945. &reduction_avx ($Xhi,$Xi);
  946. $code.=<<___;
  947. .Linit_start_avx:
  948. vmovdqa $Xi,$T3
  949. ___
  950. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  951. &reduction_avx ($Xhi,$Xi);
  952. $code.=<<___;
  953. vpshufd \$0b01001110,$T3,$T1
  954. vpshufd \$0b01001110,$Xi,$T2
  955. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  956. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  957. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  958. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  959. lea 0x30($Htbl),$Htbl
  960. sub \$1,%r10
  961. jnz .Linit_loop_avx
  962. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  963. vmovdqu $T3,-0x10($Htbl)
  964. vzeroupper
  965. ___
  966. $code.=<<___ if ($win64);
  967. movaps (%rsp),%xmm6
  968. lea 0x18(%rsp),%rsp
  969. .LSEH_end_gcm_init_avx:
  970. ___
  971. $code.=<<___;
  972. ret
  973. .size gcm_init_avx,.-gcm_init_avx
  974. ___
  975. } else {
  976. $code.=<<___;
  977. jmp .L_init_clmul
  978. .size gcm_init_avx,.-gcm_init_avx
  979. ___
  980. }
  981. $code.=<<___;
  982. .globl gcm_gmult_avx
  983. .type gcm_gmult_avx,\@abi-omnipotent
  984. .align 32
  985. gcm_gmult_avx:
  986. jmp .L_gmult_clmul
  987. .size gcm_gmult_avx,.-gcm_gmult_avx
  988. ___
  989. $code.=<<___;
  990. .globl gcm_ghash_avx
  991. .type gcm_ghash_avx,\@abi-omnipotent
  992. .align 32
  993. gcm_ghash_avx:
  994. ___
  995. if ($avx) {
  996. my ($Xip,$Htbl,$inp,$len)=@_4args;
  997. my ($Xlo,$Xhi,$Xmi,
  998. $Zlo,$Zhi,$Zmi,
  999. $Hkey,$HK,$T1,$T2,
  1000. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  1001. $code.=<<___ if ($win64);
  1002. lea -0x88(%rsp),%rax
  1003. .LSEH_begin_gcm_ghash_avx:
  1004. # I can't trust assembler to use specific encoding:-(
  1005. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1006. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1007. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1008. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1009. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1010. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1011. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1012. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1013. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1014. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1015. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1016. ___
  1017. $code.=<<___;
  1018. vzeroupper
  1019. vmovdqu ($Xip),$Xi # load $Xi
  1020. lea .L0x1c2_polynomial(%rip),%r10
  1021. lea 0x40($Htbl),$Htbl # size optimization
  1022. vmovdqu .Lbswap_mask(%rip),$bswap
  1023. vpshufb $bswap,$Xi,$Xi
  1024. cmp \$0x80,$len
  1025. jb .Lshort_avx
  1026. sub \$0x80,$len
  1027. vmovdqu 0x70($inp),$Ii # I[7]
  1028. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1029. vpshufb $bswap,$Ii,$Ii
  1030. vmovdqu 0x20-0x40($Htbl),$HK
  1031. vpunpckhqdq $Ii,$Ii,$T2
  1032. vmovdqu 0x60($inp),$Ij # I[6]
  1033. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1034. vpxor $Ii,$T2,$T2
  1035. vpshufb $bswap,$Ij,$Ij
  1036. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1037. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1038. vpunpckhqdq $Ij,$Ij,$T1
  1039. vmovdqu 0x50($inp),$Ii # I[5]
  1040. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1041. vpxor $Ij,$T1,$T1
  1042. vpshufb $bswap,$Ii,$Ii
  1043. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1044. vpunpckhqdq $Ii,$Ii,$T2
  1045. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1046. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1047. vpxor $Ii,$T2,$T2
  1048. vmovdqu 0x40($inp),$Ij # I[4]
  1049. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1050. vmovdqu 0x50-0x40($Htbl),$HK
  1051. vpshufb $bswap,$Ij,$Ij
  1052. vpxor $Xlo,$Zlo,$Zlo
  1053. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1054. vpxor $Xhi,$Zhi,$Zhi
  1055. vpunpckhqdq $Ij,$Ij,$T1
  1056. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1057. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1058. vpxor $Xmi,$Zmi,$Zmi
  1059. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1060. vpxor $Ij,$T1,$T1
  1061. vmovdqu 0x30($inp),$Ii # I[3]
  1062. vpxor $Zlo,$Xlo,$Xlo
  1063. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1064. vpxor $Zhi,$Xhi,$Xhi
  1065. vpshufb $bswap,$Ii,$Ii
  1066. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1067. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1068. vpxor $Zmi,$Xmi,$Xmi
  1069. vpunpckhqdq $Ii,$Ii,$T2
  1070. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1071. vmovdqu 0x80-0x40($Htbl),$HK
  1072. vpxor $Ii,$T2,$T2
  1073. vmovdqu 0x20($inp),$Ij # I[2]
  1074. vpxor $Xlo,$Zlo,$Zlo
  1075. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1076. vpxor $Xhi,$Zhi,$Zhi
  1077. vpshufb $bswap,$Ij,$Ij
  1078. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1079. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1080. vpxor $Xmi,$Zmi,$Zmi
  1081. vpunpckhqdq $Ij,$Ij,$T1
  1082. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1083. vpxor $Ij,$T1,$T1
  1084. vmovdqu 0x10($inp),$Ii # I[1]
  1085. vpxor $Zlo,$Xlo,$Xlo
  1086. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1087. vpxor $Zhi,$Xhi,$Xhi
  1088. vpshufb $bswap,$Ii,$Ii
  1089. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1090. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1091. vpxor $Zmi,$Xmi,$Xmi
  1092. vpunpckhqdq $Ii,$Ii,$T2
  1093. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1094. vmovdqu 0xb0-0x40($Htbl),$HK
  1095. vpxor $Ii,$T2,$T2
  1096. vmovdqu ($inp),$Ij # I[0]
  1097. vpxor $Xlo,$Zlo,$Zlo
  1098. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1099. vpxor $Xhi,$Zhi,$Zhi
  1100. vpshufb $bswap,$Ij,$Ij
  1101. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1102. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1103. vpxor $Xmi,$Zmi,$Zmi
  1104. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1105. lea 0x80($inp),$inp
  1106. cmp \$0x80,$len
  1107. jb .Ltail_avx
  1108. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1109. sub \$0x80,$len
  1110. jmp .Loop8x_avx
  1111. .align 32
  1112. .Loop8x_avx:
  1113. vpunpckhqdq $Ij,$Ij,$T1
  1114. vmovdqu 0x70($inp),$Ii # I[7]
  1115. vpxor $Xlo,$Zlo,$Zlo
  1116. vpxor $Ij,$T1,$T1
  1117. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1118. vpshufb $bswap,$Ii,$Ii
  1119. vpxor $Xhi,$Zhi,$Zhi
  1120. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1121. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1122. vpunpckhqdq $Ii,$Ii,$T2
  1123. vpxor $Xmi,$Zmi,$Zmi
  1124. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1125. vmovdqu 0x20-0x40($Htbl),$HK
  1126. vpxor $Ii,$T2,$T2
  1127. vmovdqu 0x60($inp),$Ij # I[6]
  1128. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1129. vpxor $Zlo,$Xi,$Xi # collect result
  1130. vpshufb $bswap,$Ij,$Ij
  1131. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1132. vxorps $Zhi,$Xo,$Xo
  1133. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1134. vpunpckhqdq $Ij,$Ij,$T1
  1135. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1136. vpxor $Zmi,$Tred,$Tred
  1137. vxorps $Ij,$T1,$T1
  1138. vmovdqu 0x50($inp),$Ii # I[5]
  1139. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1140. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1141. vpxor $Xo,$Tred,$Tred
  1142. vpslldq \$8,$Tred,$T2
  1143. vpxor $Xlo,$Zlo,$Zlo
  1144. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1145. vpsrldq \$8,$Tred,$Tred
  1146. vpxor $T2, $Xi, $Xi
  1147. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1148. vpshufb $bswap,$Ii,$Ii
  1149. vxorps $Tred,$Xo, $Xo
  1150. vpxor $Xhi,$Zhi,$Zhi
  1151. vpunpckhqdq $Ii,$Ii,$T2
  1152. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1153. vmovdqu 0x50-0x40($Htbl),$HK
  1154. vpxor $Ii,$T2,$T2
  1155. vpxor $Xmi,$Zmi,$Zmi
  1156. vmovdqu 0x40($inp),$Ij # I[4]
  1157. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1158. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1159. vpshufb $bswap,$Ij,$Ij
  1160. vpxor $Zlo,$Xlo,$Xlo
  1161. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1162. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1163. vpunpckhqdq $Ij,$Ij,$T1
  1164. vpxor $Zhi,$Xhi,$Xhi
  1165. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1166. vxorps $Ij,$T1,$T1
  1167. vpxor $Zmi,$Xmi,$Xmi
  1168. vmovdqu 0x30($inp),$Ii # I[3]
  1169. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1170. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1171. vpshufb $bswap,$Ii,$Ii
  1172. vpxor $Xlo,$Zlo,$Zlo
  1173. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1174. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1175. vpunpckhqdq $Ii,$Ii,$T2
  1176. vpxor $Xhi,$Zhi,$Zhi
  1177. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1178. vmovdqu 0x80-0x40($Htbl),$HK
  1179. vpxor $Ii,$T2,$T2
  1180. vpxor $Xmi,$Zmi,$Zmi
  1181. vmovdqu 0x20($inp),$Ij # I[2]
  1182. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1183. vpshufb $bswap,$Ij,$Ij
  1184. vpxor $Zlo,$Xlo,$Xlo
  1185. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1186. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1187. vpunpckhqdq $Ij,$Ij,$T1
  1188. vpxor $Zhi,$Xhi,$Xhi
  1189. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1190. vpxor $Ij,$T1,$T1
  1191. vpxor $Zmi,$Xmi,$Xmi
  1192. vxorps $Tred,$Xi,$Xi
  1193. vmovdqu 0x10($inp),$Ii # I[1]
  1194. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1195. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1196. vpshufb $bswap,$Ii,$Ii
  1197. vpxor $Xlo,$Zlo,$Zlo
  1198. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1199. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1200. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1201. vxorps $Xo,$Tred,$Tred
  1202. vpunpckhqdq $Ii,$Ii,$T2
  1203. vpxor $Xhi,$Zhi,$Zhi
  1204. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1205. vmovdqu 0xb0-0x40($Htbl),$HK
  1206. vpxor $Ii,$T2,$T2
  1207. vpxor $Xmi,$Zmi,$Zmi
  1208. vmovdqu ($inp),$Ij # I[0]
  1209. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1210. vpshufb $bswap,$Ij,$Ij
  1211. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1212. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1213. vpxor $Tred,$Ij,$Ij
  1214. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1215. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1216. lea 0x80($inp),$inp
  1217. sub \$0x80,$len
  1218. jnc .Loop8x_avx
  1219. add \$0x80,$len
  1220. jmp .Ltail_no_xor_avx
  1221. .align 32
  1222. .Lshort_avx:
  1223. vmovdqu -0x10($inp,$len),$Ii # very last word
  1224. lea ($inp,$len),$inp
  1225. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1226. vmovdqu 0x20-0x40($Htbl),$HK
  1227. vpshufb $bswap,$Ii,$Ij
  1228. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1229. vmovdqa $Xhi,$Zhi # $Zhi and
  1230. vmovdqa $Xmi,$Zmi # $Zmi
  1231. sub \$0x10,$len
  1232. jz .Ltail_avx
  1233. vpunpckhqdq $Ij,$Ij,$T1
  1234. vpxor $Xlo,$Zlo,$Zlo
  1235. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1236. vpxor $Ij,$T1,$T1
  1237. vmovdqu -0x20($inp),$Ii
  1238. vpxor $Xhi,$Zhi,$Zhi
  1239. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1240. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1241. vpshufb $bswap,$Ii,$Ij
  1242. vpxor $Xmi,$Zmi,$Zmi
  1243. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1244. vpsrldq \$8,$HK,$HK
  1245. sub \$0x10,$len
  1246. jz .Ltail_avx
  1247. vpunpckhqdq $Ij,$Ij,$T1
  1248. vpxor $Xlo,$Zlo,$Zlo
  1249. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1250. vpxor $Ij,$T1,$T1
  1251. vmovdqu -0x30($inp),$Ii
  1252. vpxor $Xhi,$Zhi,$Zhi
  1253. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1254. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1255. vpshufb $bswap,$Ii,$Ij
  1256. vpxor $Xmi,$Zmi,$Zmi
  1257. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1258. vmovdqu 0x50-0x40($Htbl),$HK
  1259. sub \$0x10,$len
  1260. jz .Ltail_avx
  1261. vpunpckhqdq $Ij,$Ij,$T1
  1262. vpxor $Xlo,$Zlo,$Zlo
  1263. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1264. vpxor $Ij,$T1,$T1
  1265. vmovdqu -0x40($inp),$Ii
  1266. vpxor $Xhi,$Zhi,$Zhi
  1267. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1268. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1269. vpshufb $bswap,$Ii,$Ij
  1270. vpxor $Xmi,$Zmi,$Zmi
  1271. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1272. vpsrldq \$8,$HK,$HK
  1273. sub \$0x10,$len
  1274. jz .Ltail_avx
  1275. vpunpckhqdq $Ij,$Ij,$T1
  1276. vpxor $Xlo,$Zlo,$Zlo
  1277. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1278. vpxor $Ij,$T1,$T1
  1279. vmovdqu -0x50($inp),$Ii
  1280. vpxor $Xhi,$Zhi,$Zhi
  1281. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1282. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1283. vpshufb $bswap,$Ii,$Ij
  1284. vpxor $Xmi,$Zmi,$Zmi
  1285. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1286. vmovdqu 0x80-0x40($Htbl),$HK
  1287. sub \$0x10,$len
  1288. jz .Ltail_avx
  1289. vpunpckhqdq $Ij,$Ij,$T1
  1290. vpxor $Xlo,$Zlo,$Zlo
  1291. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1292. vpxor $Ij,$T1,$T1
  1293. vmovdqu -0x60($inp),$Ii
  1294. vpxor $Xhi,$Zhi,$Zhi
  1295. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1296. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1297. vpshufb $bswap,$Ii,$Ij
  1298. vpxor $Xmi,$Zmi,$Zmi
  1299. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1300. vpsrldq \$8,$HK,$HK
  1301. sub \$0x10,$len
  1302. jz .Ltail_avx
  1303. vpunpckhqdq $Ij,$Ij,$T1
  1304. vpxor $Xlo,$Zlo,$Zlo
  1305. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1306. vpxor $Ij,$T1,$T1
  1307. vmovdqu -0x70($inp),$Ii
  1308. vpxor $Xhi,$Zhi,$Zhi
  1309. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1310. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1311. vpshufb $bswap,$Ii,$Ij
  1312. vpxor $Xmi,$Zmi,$Zmi
  1313. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1314. vmovq 0xb8-0x40($Htbl),$HK
  1315. sub \$0x10,$len
  1316. jmp .Ltail_avx
  1317. .align 32
  1318. .Ltail_avx:
  1319. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1320. .Ltail_no_xor_avx:
  1321. vpunpckhqdq $Ij,$Ij,$T1
  1322. vpxor $Xlo,$Zlo,$Zlo
  1323. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1324. vpxor $Ij,$T1,$T1
  1325. vpxor $Xhi,$Zhi,$Zhi
  1326. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1327. vpxor $Xmi,$Zmi,$Zmi
  1328. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1329. vmovdqu (%r10),$Tred
  1330. vpxor $Xlo,$Zlo,$Xi
  1331. vpxor $Xhi,$Zhi,$Xo
  1332. vpxor $Xmi,$Zmi,$Zmi
  1333. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1334. vpxor $Xo, $Zmi,$Zmi
  1335. vpslldq \$8, $Zmi,$T2
  1336. vpsrldq \$8, $Zmi,$Zmi
  1337. vpxor $T2, $Xi, $Xi
  1338. vpxor $Zmi,$Xo, $Xo
  1339. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1340. vpalignr \$8,$Xi,$Xi,$Xi
  1341. vpxor $T2,$Xi,$Xi
  1342. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1343. vpalignr \$8,$Xi,$Xi,$Xi
  1344. vpxor $Xo,$Xi,$Xi
  1345. vpxor $T2,$Xi,$Xi
  1346. cmp \$0,$len
  1347. jne .Lshort_avx
  1348. vpshufb $bswap,$Xi,$Xi
  1349. vmovdqu $Xi,($Xip)
  1350. vzeroupper
  1351. ___
  1352. $code.=<<___ if ($win64);
  1353. movaps (%rsp),%xmm6
  1354. movaps 0x10(%rsp),%xmm7
  1355. movaps 0x20(%rsp),%xmm8
  1356. movaps 0x30(%rsp),%xmm9
  1357. movaps 0x40(%rsp),%xmm10
  1358. movaps 0x50(%rsp),%xmm11
  1359. movaps 0x60(%rsp),%xmm12
  1360. movaps 0x70(%rsp),%xmm13
  1361. movaps 0x80(%rsp),%xmm14
  1362. movaps 0x90(%rsp),%xmm15
  1363. lea 0xa8(%rsp),%rsp
  1364. .LSEH_end_gcm_ghash_avx:
  1365. ___
  1366. $code.=<<___;
  1367. ret
  1368. .size gcm_ghash_avx,.-gcm_ghash_avx
  1369. ___
  1370. } else {
  1371. $code.=<<___;
  1372. jmp .L_ghash_clmul
  1373. .size gcm_ghash_avx,.-gcm_ghash_avx
  1374. ___
  1375. }
  1376. $code.=<<___;
  1377. .align 64
  1378. .Lbswap_mask:
  1379. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1380. .L0x1c2_polynomial:
  1381. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1382. .L7_mask:
  1383. .long 7,0,7,0
  1384. .L7_mask_poly:
  1385. .long 7,0,`0xE1<<1`,0
  1386. .align 64
  1387. .type .Lrem_4bit,\@object
  1388. .Lrem_4bit:
  1389. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1390. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1391. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1392. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1393. .type .Lrem_8bit,\@object
  1394. .Lrem_8bit:
  1395. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1396. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1397. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1398. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1399. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1400. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1401. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1402. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1403. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1404. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1405. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1406. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1407. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1408. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1409. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1410. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1411. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1412. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1413. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1414. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1415. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1416. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1417. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1418. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1419. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1420. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1421. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1422. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1423. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1424. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1425. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1426. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1427. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1428. .align 64
  1429. ___
  1430. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1431. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1432. if ($win64) {
  1433. $rec="%rcx";
  1434. $frame="%rdx";
  1435. $context="%r8";
  1436. $disp="%r9";
  1437. $code.=<<___;
  1438. .extern __imp_RtlVirtualUnwind
  1439. .type se_handler,\@abi-omnipotent
  1440. .align 16
  1441. se_handler:
  1442. push %rsi
  1443. push %rdi
  1444. push %rbx
  1445. push %rbp
  1446. push %r12
  1447. push %r13
  1448. push %r14
  1449. push %r15
  1450. pushfq
  1451. sub \$64,%rsp
  1452. mov 120($context),%rax # pull context->Rax
  1453. mov 248($context),%rbx # pull context->Rip
  1454. mov 8($disp),%rsi # disp->ImageBase
  1455. mov 56($disp),%r11 # disp->HandlerData
  1456. mov 0(%r11),%r10d # HandlerData[0]
  1457. lea (%rsi,%r10),%r10 # prologue label
  1458. cmp %r10,%rbx # context->Rip<prologue label
  1459. jb .Lin_prologue
  1460. mov 152($context),%rax # pull context->Rsp
  1461. mov 4(%r11),%r10d # HandlerData[1]
  1462. lea (%rsi,%r10),%r10 # epilogue label
  1463. cmp %r10,%rbx # context->Rip>=epilogue label
  1464. jae .Lin_prologue
  1465. lea 24(%rax),%rax # adjust "rsp"
  1466. mov -8(%rax),%rbx
  1467. mov -16(%rax),%rbp
  1468. mov -24(%rax),%r12
  1469. mov %rbx,144($context) # restore context->Rbx
  1470. mov %rbp,160($context) # restore context->Rbp
  1471. mov %r12,216($context) # restore context->R12
  1472. .Lin_prologue:
  1473. mov 8(%rax),%rdi
  1474. mov 16(%rax),%rsi
  1475. mov %rax,152($context) # restore context->Rsp
  1476. mov %rsi,168($context) # restore context->Rsi
  1477. mov %rdi,176($context) # restore context->Rdi
  1478. mov 40($disp),%rdi # disp->ContextRecord
  1479. mov $context,%rsi # context
  1480. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1481. .long 0xa548f3fc # cld; rep movsq
  1482. mov $disp,%rsi
  1483. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1484. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1485. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1486. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1487. mov 40(%rsi),%r10 # disp->ContextRecord
  1488. lea 56(%rsi),%r11 # &disp->HandlerData
  1489. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1490. mov %r10,32(%rsp) # arg5
  1491. mov %r11,40(%rsp) # arg6
  1492. mov %r12,48(%rsp) # arg7
  1493. mov %rcx,56(%rsp) # arg8, (NULL)
  1494. call *__imp_RtlVirtualUnwind(%rip)
  1495. mov \$1,%eax # ExceptionContinueSearch
  1496. add \$64,%rsp
  1497. popfq
  1498. pop %r15
  1499. pop %r14
  1500. pop %r13
  1501. pop %r12
  1502. pop %rbp
  1503. pop %rbx
  1504. pop %rdi
  1505. pop %rsi
  1506. ret
  1507. .size se_handler,.-se_handler
  1508. .section .pdata
  1509. .align 4
  1510. .rva .LSEH_begin_gcm_gmult_4bit
  1511. .rva .LSEH_end_gcm_gmult_4bit
  1512. .rva .LSEH_info_gcm_gmult_4bit
  1513. .rva .LSEH_begin_gcm_ghash_4bit
  1514. .rva .LSEH_end_gcm_ghash_4bit
  1515. .rva .LSEH_info_gcm_ghash_4bit
  1516. .rva .LSEH_begin_gcm_init_clmul
  1517. .rva .LSEH_end_gcm_init_clmul
  1518. .rva .LSEH_info_gcm_init_clmul
  1519. .rva .LSEH_begin_gcm_ghash_clmul
  1520. .rva .LSEH_end_gcm_ghash_clmul
  1521. .rva .LSEH_info_gcm_ghash_clmul
  1522. ___
  1523. $code.=<<___ if ($avx);
  1524. .rva .LSEH_begin_gcm_init_avx
  1525. .rva .LSEH_end_gcm_init_avx
  1526. .rva .LSEH_info_gcm_init_clmul
  1527. .rva .LSEH_begin_gcm_ghash_avx
  1528. .rva .LSEH_end_gcm_ghash_avx
  1529. .rva .LSEH_info_gcm_ghash_clmul
  1530. ___
  1531. $code.=<<___;
  1532. .section .xdata
  1533. .align 8
  1534. .LSEH_info_gcm_gmult_4bit:
  1535. .byte 9,0,0,0
  1536. .rva se_handler
  1537. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1538. .LSEH_info_gcm_ghash_4bit:
  1539. .byte 9,0,0,0
  1540. .rva se_handler
  1541. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1542. .LSEH_info_gcm_init_clmul:
  1543. .byte 0x01,0x08,0x03,0x00
  1544. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1545. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1546. .LSEH_info_gcm_ghash_clmul:
  1547. .byte 0x01,0x33,0x16,0x00
  1548. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1549. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1550. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1551. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1552. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1553. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1554. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1555. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1556. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1557. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1558. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1559. ___
  1560. }
  1561. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1562. print $code;
  1563. close STDOUT;