  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # March, June 2010
  11. #
  12. # The module implements "4-bit" GCM GHASH function and underlying
  13. # single multiplication operation in GF(2^128). "4-bit" means that
  14. # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
  15. # function features so called "528B" variant utilizing additional
  16. # 256+16 bytes of per-key storage [+512 bytes shared table].
  17. # Performance results are for this streamed GHASH subroutine and are
  18. # expressed in cycles per processed byte, less is better:
  19. #
#              gcc 3.4.x(*)  assembler
#
# P4           28.6          14.0      +100%
# Opteron      19.3          7.7       +150%
# Core2        17.8          8.1(**)   +120%
# Atom         31.6          16.8      +88%
# VIA Nano     21.8          10.1      +115%
  27. #
  28. # (*) comparison is not completely fair, because C results are
  29. # for vanilla "256B" implementation, while assembler results
  30. # are for "528B";-)
  31. # (**) it's mystery [to me] why Core2 result is not same as for
  32. # Opteron;
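#
# A rough sketch of the "4-bit" method those table sizes refer to (my
# reading of the code below, not an authoritative specification): for
# each key a 16-entry table Htable[n] = n*H in GF(2^128) is
# precomputed, and Xi is then consumed four bits at a time, starting
# from its last byte:
#
#	Z = 0
#	for each nibble n of Xi, last to first:
#		Z = (Z >> 4) ^ rem_4bit[Z & 0xf] ^ Htable[n]
#
# where rem_4bit[] supplies, in the top bits of Z, the correction for
# the four bits shifted off the bottom. 16 entries of 16 bytes give
# the 256-byte per-key table; rem_4bit is the 128-byte shared table.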
#
# May 2010
  34. #
  35. # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
  36. # See ghash-x86.pl for background information and details about coding
  37. # techniques.
  38. #
  39. # Special thanks to David Woodhouse <dwmw2@infradead.org> for
  40. # providing access to a Westmere-based system on behalf of Intel
  41. # Open Source Technology Centre.
#
# December 2012
  43. #
  44. # Overhaul: aggregate Karatsuba post-processing, improve ILP in
  45. # reduction_alg9, increase reduction aggregate factor to 4x. As for
  46. # the latter. ghash-x86.pl discusses that it makes lesser sense to
  47. # increase aggregate factor. Then why increase here? Critical path
  48. # consists of 3 independent pclmulqdq instructions, Karatsuba post-
  49. # processing and reduction. "On top" of this we lay down aggregated
  50. # multiplication operations, triplets of independent pclmulqdq's. As
  51. # issue rate for pclmulqdq is limited, it makes lesser sense to
  52. # aggregate more multiplications than it takes to perform remaining
  53. # non-multiplication operations. 2x is near-optimal coefficient for
  54. # contemporary Intel CPUs (therefore modest improvement coefficient),
  55. # but not for Bulldozer. Latter is because logical SIMD operations
  56. # are twice as slow in comparison to Intel, so that critical path is
  57. # longer. A CPU with higher pclmulqdq issue rate would also benefit
  58. # from higher aggregate factor...
  59. #
  60. # Westmere 1.78(+13%)
  61. # Sandy Bridge 1.80(+8%)
  62. # Ivy Bridge 1.80(+7%)
  63. # Haswell 0.55(+93%) (if system doesn't support AVX)
  64. # Broadwell 0.45(+110%)(if system doesn't support AVX)
  65. # Bulldozer 1.49(+27%)
  66. # Silvermont 2.88(+13%)
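#
# For reference, the 4x aggregation leans on the identity spelled out
# in a comment further down (Ii being the i-th input block, Xi the
# running hash value):
#
#	Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
# so the four multiplications, each a triplet of pclmulqdq's, are
# independent of one another and only their sum goes through Karatsuba
# post-processing and a single reduction per four blocks.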
#
# March 2013
  68. #
  69. # ... 8x aggregate factor AVX code path is using reduction algorithm
  70. # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
  71. # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
  72. # sub-optimally in comparison to above mentioned version. But thanks
  73. # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
  74. # it performs in 0.41 cycles per byte on Haswell processor, and in
  75. # 0.29 on Broadwell.
  76. #
  77. # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  78. $flavour = shift;
  79. $output = shift;
  80. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  81. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  82. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  83. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  84. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  85. die "can't locate x86_64-xlate.pl";
  86. # This must be kept in sync with |$avx| in aesni-gcm-x86_64.pl; otherwise tags
  87. # will be computed incorrectly.
  88. #
  89. # In upstream, this is controlled by shelling out to the compiler to check
  90. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  91. # output, so this isn't useful anyway.
  92. $avx = 0;
  93. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  94. *STDOUT=*OUT;
  95. $do4xaggr=1;
  96. # common register layout
  97. $nlo="%rax";
  98. $nhi="%rbx";
  99. $Zlo="%r8";
  100. $Zhi="%r9";
  101. $tmp="%r10";
  102. $rem_4bit = "%r11";
  103. $Xi="%rdi";
  104. $Htbl="%rsi";
  105. # per-function register layout
  106. $cnt="%rcx";
  107. $rem="%rdx";
  108. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  109. $r =~ s/%[er]([sd]i)/%\1l/ or
  110. $r =~ s/%[er](bp)/%\1l/ or
  111. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
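# LB() rewrites a GPR name into its low-byte form, e.g. "%rax" -> "%al",
# "%rsi" -> "%sil", "%r10" -> "%r10b".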
  112. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  113. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  114. my $arg = pop;
  115. $arg = "\$$arg" if ($arg*1 eq $arg);
  116. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  117. }
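# Example of the thunk above (illustrative only, not emitted anywhere):
# with the register layout below, &mov($Zlo,"8($Xi)") appends
# "\tmov\t8(%rdi),%r8\n" to $code, i.e. the first Perl argument is the
# destination and the generated line is in AT&T (source,destination)
# order; a bare numeric argument gets an immediate "$" prefix.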
  118. { my $N;
  119. sub loop() {
  120. my $inp = shift;
  121. $N++;
  122. $code.=<<___;
  123. xor $nlo,$nlo
  124. xor $nhi,$nhi
  125. mov `&LB("$Zlo")`,`&LB("$nlo")`
  126. mov `&LB("$Zlo")`,`&LB("$nhi")`
  127. shl \$4,`&LB("$nlo")`
  128. mov \$14,$cnt
  129. mov 8($Htbl,$nlo),$Zlo
  130. mov ($Htbl,$nlo),$Zhi
  131. and \$0xf0,`&LB("$nhi")`
  132. mov $Zlo,$rem
  133. jmp .Loop$N
  134. .align 16
  135. .Loop$N:
  136. shr \$4,$Zlo
  137. and \$0xf,$rem
  138. mov $Zhi,$tmp
  139. mov ($inp,$cnt),`&LB("$nlo")`
  140. shr \$4,$Zhi
  141. xor 8($Htbl,$nhi),$Zlo
  142. shl \$60,$tmp
  143. xor ($Htbl,$nhi),$Zhi
  144. mov `&LB("$nlo")`,`&LB("$nhi")`
  145. xor ($rem_4bit,$rem,8),$Zhi
  146. mov $Zlo,$rem
  147. shl \$4,`&LB("$nlo")`
  148. xor $tmp,$Zlo
  149. dec $cnt
  150. js .Lbreak$N
  151. shr \$4,$Zlo
  152. and \$0xf,$rem
  153. mov $Zhi,$tmp
  154. shr \$4,$Zhi
  155. xor 8($Htbl,$nlo),$Zlo
  156. shl \$60,$tmp
  157. xor ($Htbl,$nlo),$Zhi
  158. and \$0xf0,`&LB("$nhi")`
  159. xor ($rem_4bit,$rem,8),$Zhi
  160. mov $Zlo,$rem
  161. xor $tmp,$Zlo
  162. jmp .Loop$N
  163. .align 16
  164. .Lbreak$N:
  165. shr \$4,$Zlo
  166. and \$0xf,$rem
  167. mov $Zhi,$tmp
  168. shr \$4,$Zhi
  169. xor 8($Htbl,$nlo),$Zlo
  170. shl \$60,$tmp
  171. xor ($Htbl,$nlo),$Zhi
  172. and \$0xf0,`&LB("$nhi")`
  173. xor ($rem_4bit,$rem,8),$Zhi
  174. mov $Zlo,$rem
  175. xor $tmp,$Zlo
  176. shr \$4,$Zlo
  177. and \$0xf,$rem
  178. mov $Zhi,$tmp
  179. shr \$4,$Zhi
  180. xor 8($Htbl,$nhi),$Zlo
  181. shl \$60,$tmp
  182. xor ($Htbl,$nhi),$Zhi
  183. xor $tmp,$Zlo
  184. xor ($rem_4bit,$rem,8),$Zhi
  185. bswap $Zlo
  186. bswap $Zhi
  187. ___
  188. }}
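# loop() above emits the nibble-serial multiply sketched in the header
# comment: $cnt walks the bytes of $inp (i.e. $Xi) from offset 14 down
# to 0 (byte 15 is loaded by the caller), one Htbl entry per nibble,
# with .Lrem_4bit folding the four bits shifted off the low end of
# $Zlo back into $Zhi; the final bswap pair restores GHASH byte order.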
  189. $code=<<___;
  190. .text
  191. .extern OPENSSL_ia32cap_P
  192. .globl gcm_gmult_4bit
  193. .type gcm_gmult_4bit,\@function,2
  194. .align 16
  195. gcm_gmult_4bit:
  196. push %rbx
  197. push %rbp # %rbp and %r12 are pushed exclusively in
  198. push %r12 # order to reuse Win64 exception handler...
  199. .Lgmult_prologue:
  200. movzb 15($Xi),$Zlo
  201. lea .Lrem_4bit(%rip),$rem_4bit
  202. ___
  203. &loop ($Xi);
  204. $code.=<<___;
  205. mov $Zlo,8($Xi)
  206. mov $Zhi,($Xi)
  207. mov 16(%rsp),%rbx
  208. lea 24(%rsp),%rsp
  209. .Lgmult_epilogue:
  210. ret
  211. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  212. ___
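# For orientation, the C-level prototypes these implement are, as far
# as I recall from gcm128.c (an assumption, not taken from this file):
#
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#	                    const u8 *inp, size_t len);
#
# which matches the @function,2 and @function,4 argument counts in the
# .type directives.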
  213. # per-function register layout
  214. $inp="%rdx";
  215. $len="%rcx";
  216. $rem_8bit=$rem_4bit;
  217. $code.=<<___;
  218. .globl gcm_ghash_4bit
  219. .type gcm_ghash_4bit,\@function,4
  220. .align 16
  221. gcm_ghash_4bit:
  222. push %rbx
  223. push %rbp
  224. push %r12
  225. push %r13
  226. push %r14
  227. push %r15
  228. sub \$280,%rsp
  229. .Lghash_prologue:
  230. mov $inp,%r14 # reassign couple of args
  231. mov $len,%r15
  232. ___
  233. { my $inp="%r14";
  234. my $dat="%edx";
  235. my $len="%r15";
  236. my @nhi=("%ebx","%ecx");
  237. my @rem=("%r12","%r13");
  238. my $Hshr4="%rbp";
  239. &sub ($Htbl,-128); # size optimization
  240. &lea ($Hshr4,"16+128(%rsp)");
  241. { my @lo =($nlo,$nhi);
  242. my @hi =($Zlo,$Zhi);
  243. &xor ($dat,$dat);
  244. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  245. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  246. &or ($lo[0],$tmp) if ($i>1);
  247. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  248. &shr ($lo[1],4) if ($i>0 && $i<17);
  249. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  250. &shr ($hi[1],4) if ($i>0 && $i<17);
  251. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  252. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  253. &shl (&LB($dat),4) if ($i>0 && $i<17);
  254. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  255. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  256. &shl ($tmp,60) if ($i>0 && $i<17);
  257. push (@lo,shift(@lo));
  258. push (@hi,shift(@hi));
  259. }
  260. }
  261. &add ($Htbl,-128);
  262. &mov ($Zlo,"8($Xi)");
  263. &mov ($Zhi,"0($Xi)");
  264. &add ($len,$inp); # pointer to the end of data
  265. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  266. &jmp (".Louter_loop");
  267. $code.=".align 16\n.Louter_loop:\n";
  268. &xor ($Zhi,"($inp)");
  269. &mov ("%rdx","8($inp)");
  270. &lea ($inp,"16($inp)");
  271. &xor ("%rdx",$Zlo);
  272. &mov ("($Xi)",$Zhi);
  273. &mov ("8($Xi)","%rdx");
  274. &shr ("%rdx",32);
  275. &xor ($nlo,$nlo);
  276. &rol ($dat,8);
  277. &mov (&LB($nlo),&LB($dat));
  278. &movz ($nhi[0],&LB($dat));
  279. &shl (&LB($nlo),4);
  280. &shr ($nhi[0],4);
  281. for ($j=11,$i=0;$i<15;$i++) {
  282. &rol ($dat,8);
  283. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  284. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  285. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  286. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  287. &mov (&LB($nlo),&LB($dat));
  288. &xor ($Zlo,$tmp) if ($i>0);
  289. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  290. &movz ($nhi[1],&LB($dat));
  291. &shl (&LB($nlo),4);
  292. &movzb ($rem[0],"(%rsp,$nhi[0])");
  293. &shr ($nhi[1],4) if ($i<14);
  294. &and ($nhi[1],0xf0) if ($i==14);
  295. &shl ($rem[1],48) if ($i>0);
  296. &xor ($rem[0],$Zlo);
  297. &mov ($tmp,$Zhi);
  298. &xor ($Zhi,$rem[1]) if ($i>0);
  299. &shr ($Zlo,8);
  300. &movz ($rem[0],&LB($rem[0]));
  301. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  302. &shr ($Zhi,8);
  303. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  304. &shl ($tmp,56);
  305. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  306. unshift (@nhi,pop(@nhi)); # "rotate" registers
  307. unshift (@rem,pop(@rem));
  308. }
  309. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  310. &xor ($Zlo,"8($Htbl,$nlo)");
  311. &xor ($Zhi,"($Htbl,$nlo)");
  312. &shl ($rem[1],48);
  313. &xor ($Zlo,$tmp);
  314. &xor ($Zhi,$rem[1]);
  315. &movz ($rem[0],&LB($Zlo));
  316. &shr ($Zlo,4);
  317. &mov ($tmp,$Zhi);
  318. &shl (&LB($rem[0]),4);
  319. &shr ($Zhi,4);
  320. &xor ($Zlo,"8($Htbl,$nhi[0])");
  321. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  322. &shl ($tmp,60);
  323. &xor ($Zhi,"($Htbl,$nhi[0])");
  324. &xor ($Zlo,$tmp);
  325. &shl ($rem[0],48);
  326. &bswap ($Zlo);
  327. &xor ($Zhi,$rem[0]);
  328. &bswap ($Zhi);
  329. &cmp ($inp,$len);
  330. &jb (".Louter_loop");
  331. }
  332. $code.=<<___;
  333. mov $Zlo,8($Xi)
  334. mov $Zhi,($Xi)
  335. lea 280(%rsp),%rsi
  336. mov 0(%rsi),%r15
  337. mov 8(%rsi),%r14
  338. mov 16(%rsi),%r13
  339. mov 24(%rsi),%r12
  340. mov 32(%rsi),%rbp
  341. mov 40(%rsi),%rbx
  342. lea 48(%rsi),%rsp
  343. .Lghash_epilogue:
  344. ret
  345. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  346. ___
  347. ######################################################################
  348. # PCLMULQDQ version.
  349. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  350. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  351. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  352. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  353. sub clmul64x64_T2 { # minimal register pressure
  354. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  355. if (!defined($HK)) { $HK = $T2;
  356. $code.=<<___;
  357. movdqa $Xi,$Xhi #
  358. pshufd \$0b01001110,$Xi,$T1
  359. pshufd \$0b01001110,$Hkey,$T2
  360. pxor $Xi,$T1 #
  361. pxor $Hkey,$T2
  362. ___
  363. } else {
  364. $code.=<<___;
  365. movdqa $Xi,$Xhi #
  366. pshufd \$0b01001110,$Xi,$T1
  367. pxor $Xi,$T1 #
  368. ___
  369. }
  370. $code.=<<___;
  371. pclmulqdq \$0x00,$Hkey,$Xi #######
  372. pclmulqdq \$0x11,$Hkey,$Xhi #######
  373. pclmulqdq \$0x00,$HK,$T1 #######
  374. pxor $Xi,$T1 #
  375. pxor $Xhi,$T1 #
  376. movdqa $T1,$T2 #
  377. psrldq \$8,$T1
  378. pslldq \$8,$T2 #
  379. pxor $T1,$Xhi
  380. pxor $T2,$Xi #
  381. ___
  382. }
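# clmul64x64_T2 above is a Karatsuba 128x128->256 carry-less multiply
# in three pclmulqdq's; schematically (my reading of the emitted code):
#
#	lo  = Xi.lo * Hkey.lo
#	hi  = Xi.hi * Hkey.hi
#	mid = (Xi.lo ^ Xi.hi) * (Hkey.lo ^ Hkey.hi)
#	mid ^= lo ^ hi
#	Xhi:Xi = hi:lo ^ (mid << 64)
#
# with $HK optionally carrying a precomputed Hkey.lo^Hkey.hi ("salt").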
  383. sub reduction_alg9 { # 17/11 times faster than Intel version
  384. my ($Xhi,$Xi) = @_;
  385. $code.=<<___;
  386. # 1st phase
  387. movdqa $Xi,$T2 #
  388. movdqa $Xi,$T1
  389. psllq \$5,$Xi
  390. pxor $Xi,$T1 #
  391. psllq \$1,$Xi
  392. pxor $T1,$Xi #
  393. psllq \$57,$Xi #
  394. movdqa $Xi,$T1 #
  395. pslldq \$8,$Xi
  396. psrldq \$8,$T1 #
  397. pxor $T2,$Xi
  398. pxor $T1,$Xhi #
  399. # 2nd phase
  400. movdqa $Xi,$T2
  401. psrlq \$1,$Xi
  402. pxor $T2,$Xhi #
  403. pxor $Xi,$T2
  404. psrlq \$5,$Xi
  405. pxor $T2,$Xi #
  406. psrlq \$1,$Xi #
  407. pxor $Xhi,$Xi #
  408. ___
  409. }
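# reduction_alg9 above folds the 256-bit product $Xhi:$Xi back to 128
# bits modulo the bit-reflected GHASH polynomial x^128+x^7+x^2+x+1. As
# I read it, phase 1's 5/1/57 shift sequence amounts to multiplying by
# x^57+x^62+x^63 and folding the low half upwards, and phase 2 xors
# the intermediate with itself shifted right by 1, 2 and 7 and finally
# with $Xhi, all without spending another pclmulqdq.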
  410. { my ($Htbl,$Xip)=@_4args;
  411. my $HK="%xmm6";
  412. $code.=<<___;
  413. .globl gcm_init_clmul
  414. .type gcm_init_clmul,\@abi-omnipotent
  415. .align 16
  416. gcm_init_clmul:
  417. .L_init_clmul:
  418. ___
  419. $code.=<<___ if ($win64);
  420. .LSEH_begin_gcm_init_clmul:
  421. # I can't trust assembler to use specific encoding:-(
  422. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  423. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  424. ___
  425. $code.=<<___;
  426. movdqu ($Xip),$Hkey
  427. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  428. # <<1 twist
  429. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  430. movdqa $Hkey,$T1
  431. psllq \$1,$Hkey
  432. pxor $T3,$T3 #
  433. psrlq \$63,$T1
  434. pcmpgtd $T2,$T3 # broadcast carry bit
  435. pslldq \$8,$T1
  436. por $T1,$Hkey # H<<=1
  437. # magic reduction
  438. pand .L0x1c2_polynomial(%rip),$T3
  439. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  440. # calculate H^2
  441. pshufd \$0b01001110,$Hkey,$HK
  442. movdqa $Hkey,$Xi
  443. pxor $Hkey,$HK
  444. ___
  445. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  446. &reduction_alg9 ($Xhi,$Xi);
  447. $code.=<<___;
  448. pshufd \$0b01001110,$Hkey,$T1
  449. pshufd \$0b01001110,$Xi,$T2
  450. pxor $Hkey,$T1 # Karatsuba pre-processing
  451. movdqu $Hkey,0x00($Htbl) # save H
  452. pxor $Xi,$T2 # Karatsuba pre-processing
  453. movdqu $Xi,0x10($Htbl) # save H^2
  454. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  455. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  456. ___
  457. if ($do4xaggr) {
  458. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  459. &reduction_alg9 ($Xhi,$Xi);
  460. $code.=<<___;
  461. movdqa $Xi,$T3
  462. ___
  463. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  464. &reduction_alg9 ($Xhi,$Xi);
  465. $code.=<<___;
  466. pshufd \$0b01001110,$T3,$T1
  467. pshufd \$0b01001110,$Xi,$T2
  468. pxor $T3,$T1 # Karatsuba pre-processing
  469. movdqu $T3,0x30($Htbl) # save H^3
  470. pxor $Xi,$T2 # Karatsuba pre-processing
  471. movdqu $Xi,0x40($Htbl) # save H^4
  472. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  473. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  474. ___
  475. }
  476. $code.=<<___ if ($win64);
  477. movaps (%rsp),%xmm6
  478. lea 0x18(%rsp),%rsp
  479. .LSEH_end_gcm_init_clmul:
  480. ___
  481. $code.=<<___;
  482. ret
  483. .size gcm_init_clmul,.-gcm_init_clmul
  484. ___
  485. }
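# Summary of the Htbl layout gcm_init_clmul leaves behind, as far as I
# can tell from the stores above (byte offsets into $Htbl):
#
#	0x00 H     0x10 H^2   0x20 Karatsuba "salt" for H/H^2
#	0x30 H^3   0x40 H^4   0x50 Karatsuba "salt" for H^3/H^4
#
# gcm_gmult_clmul and gcm_ghash_clmul below index it at these offsets.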
  486. { my ($Xip,$Htbl)=@_4args;
  487. $code.=<<___;
  488. .globl gcm_gmult_clmul
  489. .type gcm_gmult_clmul,\@abi-omnipotent
  490. .align 16
  491. gcm_gmult_clmul:
  492. .L_gmult_clmul:
  493. movdqu ($Xip),$Xi
  494. movdqa .Lbswap_mask(%rip),$T3
  495. movdqu ($Htbl),$Hkey
  496. movdqu 0x20($Htbl),$T2
  497. pshufb $T3,$Xi
  498. ___
  499. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  500. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  501. # experimental alternative. special thing about is that there
  502. # no dependency between the two multiplications...
  503. mov \$`0xE1<<1`,%eax
  504. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  505. mov \$0x07,%r11d
  506. movq %rax,$T1
  507. movq %r10,$T2
  508. movq %r11,$T3 # borrow $T3
  509. pand $Xi,$T3
  510. pshufb $T3,$T2 # ($Xi&7)·0xE0
  511. movq %rax,$T3
  512. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  513. pxor $Xi,$T2
  514. pslldq \$15,$T2
  515. paddd $T2,$T2 # <<(64+56+1)
  516. pxor $T2,$Xi
  517. pclmulqdq \$0x01,$T3,$Xi
  518. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  519. psrldq \$1,$T1
  520. pxor $T1,$Xhi
  521. pslldq \$7,$Xi
  522. pxor $Xhi,$Xi
  523. ___
  524. $code.=<<___;
  525. pshufb $T3,$Xi
  526. movdqu $Xi,($Xip)
  527. ret
  528. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  529. ___
  530. }
  531. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  532. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  533. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  534. $code.=<<___;
  535. .globl gcm_ghash_clmul
  536. .type gcm_ghash_clmul,\@abi-omnipotent
  537. .align 32
  538. gcm_ghash_clmul:
  539. .L_ghash_clmul:
  540. ___
  541. $code.=<<___ if ($win64);
  542. lea -0x88(%rsp),%rax
  543. .LSEH_begin_gcm_ghash_clmul:
  544. # I can't trust assembler to use specific encoding:-(
  545. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  546. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  547. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  548. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  549. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  550. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  551. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  552. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  553. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  554. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  555. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  556. ___
  557. $code.=<<___;
  558. movdqa .Lbswap_mask(%rip),$T3
  559. movdqu ($Xip),$Xi
  560. movdqu ($Htbl),$Hkey
  561. movdqu 0x20($Htbl),$HK
  562. pshufb $T3,$Xi
  563. sub \$0x10,$len
  564. jz .Lodd_tail
  565. movdqu 0x10($Htbl),$Hkey2
  566. ___
  567. if ($do4xaggr) {
  568. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  569. $code.=<<___;
  570. mov OPENSSL_ia32cap_P+4(%rip),%eax
  571. cmp \$0x30,$len
  572. jb .Lskip4x
  573. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  574. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  575. je .Lskip4x
  576. sub \$0x30,$len
  577. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  578. movdqu 0x30($Htbl),$Hkey3
  579. movdqu 0x40($Htbl),$Hkey4
  580. #######
  581. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  582. #
  583. movdqu 0x30($inp),$Xln
  584. movdqu 0x20($inp),$Xl
  585. pshufb $T3,$Xln
  586. pshufb $T3,$Xl
  587. movdqa $Xln,$Xhn
  588. pshufd \$0b01001110,$Xln,$Xmn
  589. pxor $Xln,$Xmn
  590. pclmulqdq \$0x00,$Hkey,$Xln
  591. pclmulqdq \$0x11,$Hkey,$Xhn
  592. pclmulqdq \$0x00,$HK,$Xmn
  593. movdqa $Xl,$Xh
  594. pshufd \$0b01001110,$Xl,$Xm
  595. pxor $Xl,$Xm
  596. pclmulqdq \$0x00,$Hkey2,$Xl
  597. pclmulqdq \$0x11,$Hkey2,$Xh
  598. pclmulqdq \$0x10,$HK,$Xm
  599. xorps $Xl,$Xln
  600. xorps $Xh,$Xhn
  601. movups 0x50($Htbl),$HK
  602. xorps $Xm,$Xmn
  603. movdqu 0x10($inp),$Xl
  604. movdqu 0($inp),$T1
  605. pshufb $T3,$Xl
  606. pshufb $T3,$T1
  607. movdqa $Xl,$Xh
  608. pshufd \$0b01001110,$Xl,$Xm
  609. pxor $T1,$Xi
  610. pxor $Xl,$Xm
  611. pclmulqdq \$0x00,$Hkey3,$Xl
  612. movdqa $Xi,$Xhi
  613. pshufd \$0b01001110,$Xi,$T1
  614. pxor $Xi,$T1
  615. pclmulqdq \$0x11,$Hkey3,$Xh
  616. pclmulqdq \$0x00,$HK,$Xm
  617. xorps $Xl,$Xln
  618. xorps $Xh,$Xhn
  619. lea 0x40($inp),$inp
  620. sub \$0x40,$len
  621. jc .Ltail4x
  622. jmp .Lmod4_loop
  623. .align 32
  624. .Lmod4_loop:
  625. pclmulqdq \$0x00,$Hkey4,$Xi
  626. xorps $Xm,$Xmn
  627. movdqu 0x30($inp),$Xl
  628. pshufb $T3,$Xl
  629. pclmulqdq \$0x11,$Hkey4,$Xhi
  630. xorps $Xln,$Xi
  631. movdqu 0x20($inp),$Xln
  632. movdqa $Xl,$Xh
  633. pclmulqdq \$0x10,$HK,$T1
  634. pshufd \$0b01001110,$Xl,$Xm
  635. xorps $Xhn,$Xhi
  636. pxor $Xl,$Xm
  637. pshufb $T3,$Xln
  638. movups 0x20($Htbl),$HK
  639. xorps $Xmn,$T1
  640. pclmulqdq \$0x00,$Hkey,$Xl
  641. pshufd \$0b01001110,$Xln,$Xmn
  642. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  643. movdqa $Xln,$Xhn
  644. pxor $Xhi,$T1 #
  645. pxor $Xln,$Xmn
  646. movdqa $T1,$T2 #
  647. pclmulqdq \$0x11,$Hkey,$Xh
  648. pslldq \$8,$T1
  649. psrldq \$8,$T2 #
  650. pxor $T1,$Xi
  651. movdqa .L7_mask(%rip),$T1
  652. pxor $T2,$Xhi #
  653. movq %rax,$T2
  654. pand $Xi,$T1 # 1st phase
  655. pshufb $T1,$T2 #
  656. pxor $Xi,$T2 #
  657. pclmulqdq \$0x00,$HK,$Xm
  658. psllq \$57,$T2 #
  659. movdqa $T2,$T1 #
  660. pslldq \$8,$T2
  661. pclmulqdq \$0x00,$Hkey2,$Xln
  662. psrldq \$8,$T1 #
  663. pxor $T2,$Xi
  664. pxor $T1,$Xhi #
  665. movdqu 0($inp),$T1
  666. movdqa $Xi,$T2 # 2nd phase
  667. psrlq \$1,$Xi
  668. pclmulqdq \$0x11,$Hkey2,$Xhn
  669. xorps $Xl,$Xln
  670. movdqu 0x10($inp),$Xl
  671. pshufb $T3,$Xl
  672. pclmulqdq \$0x10,$HK,$Xmn
  673. xorps $Xh,$Xhn
  674. movups 0x50($Htbl),$HK
  675. pshufb $T3,$T1
  676. pxor $T2,$Xhi #
  677. pxor $Xi,$T2
  678. psrlq \$5,$Xi
  679. movdqa $Xl,$Xh
  680. pxor $Xm,$Xmn
  681. pshufd \$0b01001110,$Xl,$Xm
  682. pxor $T2,$Xi #
  683. pxor $T1,$Xhi
  684. pxor $Xl,$Xm
  685. pclmulqdq \$0x00,$Hkey3,$Xl
  686. psrlq \$1,$Xi #
  687. pxor $Xhi,$Xi #
  688. movdqa $Xi,$Xhi
  689. pclmulqdq \$0x11,$Hkey3,$Xh
  690. xorps $Xl,$Xln
  691. pshufd \$0b01001110,$Xi,$T1
  692. pxor $Xi,$T1
  693. pclmulqdq \$0x00,$HK,$Xm
  694. xorps $Xh,$Xhn
  695. lea 0x40($inp),$inp
  696. sub \$0x40,$len
  697. jnc .Lmod4_loop
  698. .Ltail4x:
  699. pclmulqdq \$0x00,$Hkey4,$Xi
  700. pclmulqdq \$0x11,$Hkey4,$Xhi
  701. pclmulqdq \$0x10,$HK,$T1
  702. xorps $Xm,$Xmn
  703. xorps $Xln,$Xi
  704. xorps $Xhn,$Xhi
  705. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  706. pxor $Xmn,$T1
  707. pxor $Xhi,$T1 #
  708. pxor $Xi,$Xhi
  709. movdqa $T1,$T2 #
  710. psrldq \$8,$T1
  711. pslldq \$8,$T2 #
  712. pxor $T1,$Xhi
  713. pxor $T2,$Xi #
  714. ___
  715. &reduction_alg9($Xhi,$Xi);
  716. $code.=<<___;
  717. add \$0x40,$len
  718. jz .Ldone
  719. movdqu 0x20($Htbl),$HK
  720. sub \$0x10,$len
  721. jz .Lodd_tail
  722. .Lskip4x:
  723. ___
  724. }
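# The block above (emitted only when $do4xaggr is set) is the 4x path:
# every .Lmod4_loop iteration consumes 0x40 bytes of input and overlaps
# the two-phase reduction of the previous result with the next block's
# pclmulqdq triplets; short inputs and MOVBE-without-XSAVE parts (per
# the OPENSSL_ia32cap_P check) fall through to .Lskip4x and the 2x code
# that follows.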
  725. $code.=<<___;
  726. #######
  727. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  728. # [(H*Ii+1) + (H*Xi+1)] mod P =
  729. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  730. #
  731. movdqu ($inp),$T1 # Ii
  732. movdqu 16($inp),$Xln # Ii+1
  733. pshufb $T3,$T1
  734. pshufb $T3,$Xln
  735. pxor $T1,$Xi # Ii+Xi
  736. movdqa $Xln,$Xhn
  737. pshufd \$0b01001110,$Xln,$Xmn
  738. pxor $Xln,$Xmn
  739. pclmulqdq \$0x00,$Hkey,$Xln
  740. pclmulqdq \$0x11,$Hkey,$Xhn
  741. pclmulqdq \$0x00,$HK,$Xmn
  742. lea 32($inp),$inp # i+=2
  743. nop
  744. sub \$0x20,$len
  745. jbe .Leven_tail
  746. nop
  747. jmp .Lmod_loop
  748. .align 32
  749. .Lmod_loop:
  750. movdqa $Xi,$Xhi
  751. movdqa $Xmn,$T1
  752. pshufd \$0b01001110,$Xi,$Xmn #
  753. pxor $Xi,$Xmn #
  754. pclmulqdq \$0x00,$Hkey2,$Xi
  755. pclmulqdq \$0x11,$Hkey2,$Xhi
  756. pclmulqdq \$0x10,$HK,$Xmn
  757. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  758. pxor $Xhn,$Xhi
  759. movdqu ($inp),$T2 # Ii
  760. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  761. pshufb $T3,$T2
  762. movdqu 16($inp),$Xln # Ii+1
  763. pxor $Xhi,$T1
  764. pxor $T2,$Xhi # "Ii+Xi", consume early
  765. pxor $T1,$Xmn
  766. pshufb $T3,$Xln
  767. movdqa $Xmn,$T1 #
  768. psrldq \$8,$T1
  769. pslldq \$8,$Xmn #
  770. pxor $T1,$Xhi
  771. pxor $Xmn,$Xi #
  772. movdqa $Xln,$Xhn #
  773. movdqa $Xi,$T2 # 1st phase
  774. movdqa $Xi,$T1
  775. psllq \$5,$Xi
  776. pxor $Xi,$T1 #
  777. pclmulqdq \$0x00,$Hkey,$Xln #######
  778. psllq \$1,$Xi
  779. pxor $T1,$Xi #
  780. psllq \$57,$Xi #
  781. movdqa $Xi,$T1 #
  782. pslldq \$8,$Xi
  783. psrldq \$8,$T1 #
  784. pxor $T2,$Xi
  785. pshufd \$0b01001110,$Xhn,$Xmn
  786. pxor $T1,$Xhi #
  787. pxor $Xhn,$Xmn #
  788. movdqa $Xi,$T2 # 2nd phase
  789. psrlq \$1,$Xi
  790. pclmulqdq \$0x11,$Hkey,$Xhn #######
  791. pxor $T2,$Xhi #
  792. pxor $Xi,$T2
  793. psrlq \$5,$Xi
  794. pxor $T2,$Xi #
  795. lea 32($inp),$inp
  796. psrlq \$1,$Xi #
  797. pclmulqdq \$0x00,$HK,$Xmn #######
  798. pxor $Xhi,$Xi #
  799. sub \$0x20,$len
  800. ja .Lmod_loop
  801. .Leven_tail:
  802. movdqa $Xi,$Xhi
  803. movdqa $Xmn,$T1
  804. pshufd \$0b01001110,$Xi,$Xmn #
  805. pxor $Xi,$Xmn #
  806. pclmulqdq \$0x00,$Hkey2,$Xi
  807. pclmulqdq \$0x11,$Hkey2,$Xhi
  808. pclmulqdq \$0x10,$HK,$Xmn
  809. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  810. pxor $Xhn,$Xhi
  811. pxor $Xi,$T1
  812. pxor $Xhi,$T1
  813. pxor $T1,$Xmn
  814. movdqa $Xmn,$T1 #
  815. psrldq \$8,$T1
  816. pslldq \$8,$Xmn #
  817. pxor $T1,$Xhi
  818. pxor $Xmn,$Xi #
  819. ___
  820. &reduction_alg9 ($Xhi,$Xi);
  821. $code.=<<___;
  822. test $len,$len
  823. jnz .Ldone
  824. .Lodd_tail:
  825. movdqu ($inp),$T1 # Ii
  826. pshufb $T3,$T1
  827. pxor $T1,$Xi # Ii+Xi
  828. ___
  829. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  830. &reduction_alg9 ($Xhi,$Xi);
  831. $code.=<<___;
  832. .Ldone:
  833. pshufb $T3,$Xi
  834. movdqu $Xi,($Xip)
  835. ___
  836. $code.=<<___ if ($win64);
  837. movaps (%rsp),%xmm6
  838. movaps 0x10(%rsp),%xmm7
  839. movaps 0x20(%rsp),%xmm8
  840. movaps 0x30(%rsp),%xmm9
  841. movaps 0x40(%rsp),%xmm10
  842. movaps 0x50(%rsp),%xmm11
  843. movaps 0x60(%rsp),%xmm12
  844. movaps 0x70(%rsp),%xmm13
  845. movaps 0x80(%rsp),%xmm14
  846. movaps 0x90(%rsp),%xmm15
  847. lea 0xa8(%rsp),%rsp
  848. .LSEH_end_gcm_ghash_clmul:
  849. ___
  850. $code.=<<___;
  851. ret
  852. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  853. ___
  854. }
  855. $code.=<<___;
  856. .globl gcm_init_avx
  857. .type gcm_init_avx,\@abi-omnipotent
  858. .align 32
  859. gcm_init_avx:
  860. ___
  861. if ($avx) {
  862. my ($Htbl,$Xip)=@_4args;
  863. my $HK="%xmm6";
  864. $code.=<<___ if ($win64);
  865. .LSEH_begin_gcm_init_avx:
  866. # I can't trust assembler to use specific encoding:-(
  867. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  868. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  869. ___
  870. $code.=<<___;
  871. vzeroupper
  872. vmovdqu ($Xip),$Hkey
  873. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  874. # <<1 twist
  875. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  876. vpsrlq \$63,$Hkey,$T1
  877. vpsllq \$1,$Hkey,$Hkey
  878. vpxor $T3,$T3,$T3 #
  879. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  880. vpslldq \$8,$T1,$T1
  881. vpor $T1,$Hkey,$Hkey # H<<=1
  882. # magic reduction
  883. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  884. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  885. vpunpckhqdq $Hkey,$Hkey,$HK
  886. vmovdqa $Hkey,$Xi
  887. vpxor $Hkey,$HK,$HK
  888. mov \$4,%r10 # up to H^8
  889. jmp .Linit_start_avx
  890. ___
  891. sub clmul64x64_avx {
  892. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  893. if (!defined($HK)) { $HK = $T2;
  894. $code.=<<___;
  895. vpunpckhqdq $Xi,$Xi,$T1
  896. vpunpckhqdq $Hkey,$Hkey,$T2
  897. vpxor $Xi,$T1,$T1 #
  898. vpxor $Hkey,$T2,$T2
  899. ___
  900. } else {
  901. $code.=<<___;
  902. vpunpckhqdq $Xi,$Xi,$T1
  903. vpxor $Xi,$T1,$T1 #
  904. ___
  905. }
  906. $code.=<<___;
  907. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  908. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  909. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  910. vpxor $Xi,$Xhi,$T2 #
  911. vpxor $T2,$T1,$T1 #
  912. vpslldq \$8,$T1,$T2 #
  913. vpsrldq \$8,$T1,$T1
  914. vpxor $T2,$Xi,$Xi #
  915. vpxor $T1,$Xhi,$Xhi
  916. ___
  917. }
  918. sub reduction_avx {
  919. my ($Xhi,$Xi) = @_;
  920. $code.=<<___;
  921. vpsllq \$57,$Xi,$T1 # 1st phase
  922. vpsllq \$62,$Xi,$T2
  923. vpxor $T1,$T2,$T2 #
  924. vpsllq \$63,$Xi,$T1
  925. vpxor $T1,$T2,$T2 #
  926. vpslldq \$8,$T2,$T1 #
  927. vpsrldq \$8,$T2,$T2
  928. vpxor $T1,$Xi,$Xi #
  929. vpxor $T2,$Xhi,$Xhi
  930. vpsrlq \$1,$Xi,$T2 # 2nd phase
  931. vpxor $Xi,$Xhi,$Xhi
  932. vpxor $T2,$Xi,$Xi #
  933. vpsrlq \$5,$T2,$T2
  934. vpxor $T2,$Xi,$Xi #
  935. vpsrlq \$1,$Xi,$Xi #
  936. vpxor $Xhi,$Xi,$Xi #
  937. ___
  938. }
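# reduction_avx above mirrors reduction_alg9 (the same 57/62/63 and
# then 1/2/7 shift-and-xor folding), merely rewritten with the
# non-destructive three-operand AVX forms.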
  939. $code.=<<___;
  940. .align 32
  941. .Linit_loop_avx:
  942. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  943. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  944. ___
  945. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  946. &reduction_avx ($Xhi,$Xi);
  947. $code.=<<___;
  948. .Linit_start_avx:
  949. vmovdqa $Xi,$T3
  950. ___
  951. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  952. &reduction_avx ($Xhi,$Xi);
  953. $code.=<<___;
  954. vpshufd \$0b01001110,$T3,$T1
  955. vpshufd \$0b01001110,$Xi,$T2
  956. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  957. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  958. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  959. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  960. lea 0x30($Htbl),$Htbl
  961. sub \$1,%r10
  962. jnz .Linit_loop_avx
  963. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  964. vmovdqu $T3,-0x10($Htbl)
  965. vzeroupper
  966. ___
  967. $code.=<<___ if ($win64);
  968. movaps (%rsp),%xmm6
  969. lea 0x18(%rsp),%rsp
  970. .LSEH_end_gcm_init_avx:
  971. ___
  972. $code.=<<___;
  973. ret
  974. .size gcm_init_avx,.-gcm_init_avx
  975. ___
  976. } else {
  977. $code.=<<___;
  978. jmp .L_init_clmul
  979. .size gcm_init_avx,.-gcm_init_avx
  980. ___
  981. }
  982. $code.=<<___;
  983. .globl gcm_gmult_avx
  984. .type gcm_gmult_avx,\@abi-omnipotent
  985. .align 32
  986. gcm_gmult_avx:
  987. jmp .L_gmult_clmul
  988. .size gcm_gmult_avx,.-gcm_gmult_avx
  989. ___
  990. $code.=<<___;
  991. .globl gcm_ghash_avx
  992. .type gcm_ghash_avx,\@abi-omnipotent
  993. .align 32
  994. gcm_ghash_avx:
  995. ___
  996. if ($avx) {
  997. my ($Xip,$Htbl,$inp,$len)=@_4args;
  998. my ($Xlo,$Xhi,$Xmi,
  999. $Zlo,$Zhi,$Zmi,
  1000. $Hkey,$HK,$T1,$T2,
  1001. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  1002. $code.=<<___ if ($win64);
  1003. lea -0x88(%rsp),%rax
  1004. .LSEH_begin_gcm_ghash_avx:
  1005. # I can't trust assembler to use specific encoding:-(
  1006. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1007. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1008. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1009. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1010. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1011. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1012. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1013. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1014. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1015. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1016. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1017. ___
  1018. $code.=<<___;
  1019. vzeroupper
  1020. vmovdqu ($Xip),$Xi # load $Xi
  1021. lea .L0x1c2_polynomial(%rip),%r10
  1022. lea 0x40($Htbl),$Htbl # size optimization
  1023. vmovdqu .Lbswap_mask(%rip),$bswap
  1024. vpshufb $bswap,$Xi,$Xi
  1025. cmp \$0x80,$len
  1026. jb .Lshort_avx
  1027. sub \$0x80,$len
  1028. vmovdqu 0x70($inp),$Ii # I[7]
  1029. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1030. vpshufb $bswap,$Ii,$Ii
  1031. vmovdqu 0x20-0x40($Htbl),$HK
  1032. vpunpckhqdq $Ii,$Ii,$T2
  1033. vmovdqu 0x60($inp),$Ij # I[6]
  1034. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1035. vpxor $Ii,$T2,$T2
  1036. vpshufb $bswap,$Ij,$Ij
  1037. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1038. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1039. vpunpckhqdq $Ij,$Ij,$T1
  1040. vmovdqu 0x50($inp),$Ii # I[5]
  1041. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1042. vpxor $Ij,$T1,$T1
  1043. vpshufb $bswap,$Ii,$Ii
  1044. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1045. vpunpckhqdq $Ii,$Ii,$T2
  1046. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1047. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1048. vpxor $Ii,$T2,$T2
  1049. vmovdqu 0x40($inp),$Ij # I[4]
  1050. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1051. vmovdqu 0x50-0x40($Htbl),$HK
  1052. vpshufb $bswap,$Ij,$Ij
  1053. vpxor $Xlo,$Zlo,$Zlo
  1054. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1055. vpxor $Xhi,$Zhi,$Zhi
  1056. vpunpckhqdq $Ij,$Ij,$T1
  1057. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1058. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1059. vpxor $Xmi,$Zmi,$Zmi
  1060. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1061. vpxor $Ij,$T1,$T1
  1062. vmovdqu 0x30($inp),$Ii # I[3]
  1063. vpxor $Zlo,$Xlo,$Xlo
  1064. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1065. vpxor $Zhi,$Xhi,$Xhi
  1066. vpshufb $bswap,$Ii,$Ii
  1067. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1068. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1069. vpxor $Zmi,$Xmi,$Xmi
  1070. vpunpckhqdq $Ii,$Ii,$T2
  1071. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1072. vmovdqu 0x80-0x40($Htbl),$HK
  1073. vpxor $Ii,$T2,$T2
  1074. vmovdqu 0x20($inp),$Ij # I[2]
  1075. vpxor $Xlo,$Zlo,$Zlo
  1076. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1077. vpxor $Xhi,$Zhi,$Zhi
  1078. vpshufb $bswap,$Ij,$Ij
  1079. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1080. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1081. vpxor $Xmi,$Zmi,$Zmi
  1082. vpunpckhqdq $Ij,$Ij,$T1
  1083. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1084. vpxor $Ij,$T1,$T1
  1085. vmovdqu 0x10($inp),$Ii # I[1]
  1086. vpxor $Zlo,$Xlo,$Xlo
  1087. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1088. vpxor $Zhi,$Xhi,$Xhi
  1089. vpshufb $bswap,$Ii,$Ii
  1090. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1091. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1092. vpxor $Zmi,$Xmi,$Xmi
  1093. vpunpckhqdq $Ii,$Ii,$T2
  1094. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1095. vmovdqu 0xb0-0x40($Htbl),$HK
  1096. vpxor $Ii,$T2,$T2
  1097. vmovdqu ($inp),$Ij # I[0]
  1098. vpxor $Xlo,$Zlo,$Zlo
  1099. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1100. vpxor $Xhi,$Zhi,$Zhi
  1101. vpshufb $bswap,$Ij,$Ij
  1102. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1103. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1104. vpxor $Xmi,$Zmi,$Zmi
  1105. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1106. lea 0x80($inp),$inp
  1107. cmp \$0x80,$len
  1108. jb .Ltail_avx
  1109. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1110. sub \$0x80,$len
  1111. jmp .Loop8x_avx
  1112. .align 32
  1113. .Loop8x_avx:
  1114. vpunpckhqdq $Ij,$Ij,$T1
  1115. vmovdqu 0x70($inp),$Ii # I[7]
  1116. vpxor $Xlo,$Zlo,$Zlo
  1117. vpxor $Ij,$T1,$T1
  1118. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1119. vpshufb $bswap,$Ii,$Ii
  1120. vpxor $Xhi,$Zhi,$Zhi
  1121. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1122. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1123. vpunpckhqdq $Ii,$Ii,$T2
  1124. vpxor $Xmi,$Zmi,$Zmi
  1125. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1126. vmovdqu 0x20-0x40($Htbl),$HK
  1127. vpxor $Ii,$T2,$T2
  1128. vmovdqu 0x60($inp),$Ij # I[6]
  1129. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1130. vpxor $Zlo,$Xi,$Xi # collect result
  1131. vpshufb $bswap,$Ij,$Ij
  1132. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1133. vxorps $Zhi,$Xo,$Xo
  1134. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1135. vpunpckhqdq $Ij,$Ij,$T1
  1136. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1137. vpxor $Zmi,$Tred,$Tred
  1138. vxorps $Ij,$T1,$T1
  1139. vmovdqu 0x50($inp),$Ii # I[5]
  1140. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1141. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1142. vpxor $Xo,$Tred,$Tred
  1143. vpslldq \$8,$Tred,$T2
  1144. vpxor $Xlo,$Zlo,$Zlo
  1145. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1146. vpsrldq \$8,$Tred,$Tred
  1147. vpxor $T2, $Xi, $Xi
  1148. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1149. vpshufb $bswap,$Ii,$Ii
  1150. vxorps $Tred,$Xo, $Xo
  1151. vpxor $Xhi,$Zhi,$Zhi
  1152. vpunpckhqdq $Ii,$Ii,$T2
  1153. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1154. vmovdqu 0x50-0x40($Htbl),$HK
  1155. vpxor $Ii,$T2,$T2
  1156. vpxor $Xmi,$Zmi,$Zmi
  1157. vmovdqu 0x40($inp),$Ij # I[4]
  1158. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1159. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1160. vpshufb $bswap,$Ij,$Ij
  1161. vpxor $Zlo,$Xlo,$Xlo
  1162. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1163. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1164. vpunpckhqdq $Ij,$Ij,$T1
  1165. vpxor $Zhi,$Xhi,$Xhi
  1166. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1167. vxorps $Ij,$T1,$T1
  1168. vpxor $Zmi,$Xmi,$Xmi
  1169. vmovdqu 0x30($inp),$Ii # I[3]
  1170. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1171. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1172. vpshufb $bswap,$Ii,$Ii
  1173. vpxor $Xlo,$Zlo,$Zlo
  1174. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1175. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1176. vpunpckhqdq $Ii,$Ii,$T2
  1177. vpxor $Xhi,$Zhi,$Zhi
  1178. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1179. vmovdqu 0x80-0x40($Htbl),$HK
  1180. vpxor $Ii,$T2,$T2
  1181. vpxor $Xmi,$Zmi,$Zmi
  1182. vmovdqu 0x20($inp),$Ij # I[2]
  1183. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1184. vpshufb $bswap,$Ij,$Ij
  1185. vpxor $Zlo,$Xlo,$Xlo
  1186. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1187. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1188. vpunpckhqdq $Ij,$Ij,$T1
  1189. vpxor $Zhi,$Xhi,$Xhi
  1190. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1191. vpxor $Ij,$T1,$T1
  1192. vpxor $Zmi,$Xmi,$Xmi
  1193. vxorps $Tred,$Xi,$Xi
  1194. vmovdqu 0x10($inp),$Ii # I[1]
  1195. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1196. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1197. vpshufb $bswap,$Ii,$Ii
  1198. vpxor $Xlo,$Zlo,$Zlo
  1199. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1200. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1201. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1202. vxorps $Xo,$Tred,$Tred
  1203. vpunpckhqdq $Ii,$Ii,$T2
  1204. vpxor $Xhi,$Zhi,$Zhi
  1205. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1206. vmovdqu 0xb0-0x40($Htbl),$HK
  1207. vpxor $Ii,$T2,$T2
  1208. vpxor $Xmi,$Zmi,$Zmi
  1209. vmovdqu ($inp),$Ij # I[0]
  1210. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1211. vpshufb $bswap,$Ij,$Ij
  1212. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1213. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1214. vpxor $Tred,$Ij,$Ij
  1215. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1216. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1217. lea 0x80($inp),$inp
  1218. sub \$0x80,$len
  1219. jnc .Loop8x_avx
  1220. add \$0x80,$len
  1221. jmp .Ltail_no_xor_avx
  1222. .align 32
  1223. .Lshort_avx:
  1224. vmovdqu -0x10($inp,$len),$Ii # very last word
  1225. lea ($inp,$len),$inp
  1226. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1227. vmovdqu 0x20-0x40($Htbl),$HK
  1228. vpshufb $bswap,$Ii,$Ij
  1229. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1230. vmovdqa $Xhi,$Zhi # $Zhi and
  1231. vmovdqa $Xmi,$Zmi # $Zmi
  1232. sub \$0x10,$len
  1233. jz .Ltail_avx
  1234. vpunpckhqdq $Ij,$Ij,$T1
  1235. vpxor $Xlo,$Zlo,$Zlo
  1236. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1237. vpxor $Ij,$T1,$T1
  1238. vmovdqu -0x20($inp),$Ii
  1239. vpxor $Xhi,$Zhi,$Zhi
  1240. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1241. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1242. vpshufb $bswap,$Ii,$Ij
  1243. vpxor $Xmi,$Zmi,$Zmi
  1244. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1245. vpsrldq \$8,$HK,$HK
  1246. sub \$0x10,$len
  1247. jz .Ltail_avx
  1248. vpunpckhqdq $Ij,$Ij,$T1
  1249. vpxor $Xlo,$Zlo,$Zlo
  1250. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1251. vpxor $Ij,$T1,$T1
  1252. vmovdqu -0x30($inp),$Ii
  1253. vpxor $Xhi,$Zhi,$Zhi
  1254. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1255. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1256. vpshufb $bswap,$Ii,$Ij
  1257. vpxor $Xmi,$Zmi,$Zmi
  1258. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1259. vmovdqu 0x50-0x40($Htbl),$HK
  1260. sub \$0x10,$len
  1261. jz .Ltail_avx
  1262. vpunpckhqdq $Ij,$Ij,$T1
  1263. vpxor $Xlo,$Zlo,$Zlo
  1264. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1265. vpxor $Ij,$T1,$T1
  1266. vmovdqu -0x40($inp),$Ii
  1267. vpxor $Xhi,$Zhi,$Zhi
  1268. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1269. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1270. vpshufb $bswap,$Ii,$Ij
  1271. vpxor $Xmi,$Zmi,$Zmi
  1272. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1273. vpsrldq \$8,$HK,$HK
  1274. sub \$0x10,$len
  1275. jz .Ltail_avx
  1276. vpunpckhqdq $Ij,$Ij,$T1
  1277. vpxor $Xlo,$Zlo,$Zlo
  1278. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1279. vpxor $Ij,$T1,$T1
  1280. vmovdqu -0x50($inp),$Ii
  1281. vpxor $Xhi,$Zhi,$Zhi
  1282. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1283. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1284. vpshufb $bswap,$Ii,$Ij
  1285. vpxor $Xmi,$Zmi,$Zmi
  1286. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1287. vmovdqu 0x80-0x40($Htbl),$HK
  1288. sub \$0x10,$len
  1289. jz .Ltail_avx
  1290. vpunpckhqdq $Ij,$Ij,$T1
  1291. vpxor $Xlo,$Zlo,$Zlo
  1292. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1293. vpxor $Ij,$T1,$T1
  1294. vmovdqu -0x60($inp),$Ii
  1295. vpxor $Xhi,$Zhi,$Zhi
  1296. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1297. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1298. vpshufb $bswap,$Ii,$Ij
  1299. vpxor $Xmi,$Zmi,$Zmi
  1300. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1301. vpsrldq \$8,$HK,$HK
  1302. sub \$0x10,$len
  1303. jz .Ltail_avx
  1304. vpunpckhqdq $Ij,$Ij,$T1
  1305. vpxor $Xlo,$Zlo,$Zlo
  1306. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1307. vpxor $Ij,$T1,$T1
  1308. vmovdqu -0x70($inp),$Ii
  1309. vpxor $Xhi,$Zhi,$Zhi
  1310. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1311. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1312. vpshufb $bswap,$Ii,$Ij
  1313. vpxor $Xmi,$Zmi,$Zmi
  1314. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1315. vmovq 0xb8-0x40($Htbl),$HK
  1316. sub \$0x10,$len
  1317. jmp .Ltail_avx
  1318. .align 32
  1319. .Ltail_avx:
  1320. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1321. .Ltail_no_xor_avx:
  1322. vpunpckhqdq $Ij,$Ij,$T1
  1323. vpxor $Xlo,$Zlo,$Zlo
  1324. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1325. vpxor $Ij,$T1,$T1
  1326. vpxor $Xhi,$Zhi,$Zhi
  1327. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1328. vpxor $Xmi,$Zmi,$Zmi
  1329. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1330. vmovdqu (%r10),$Tred
  1331. vpxor $Xlo,$Zlo,$Xi
  1332. vpxor $Xhi,$Zhi,$Xo
  1333. vpxor $Xmi,$Zmi,$Zmi
  1334. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1335. vpxor $Xo, $Zmi,$Zmi
  1336. vpslldq \$8, $Zmi,$T2
  1337. vpsrldq \$8, $Zmi,$Zmi
  1338. vpxor $T2, $Xi, $Xi
  1339. vpxor $Zmi,$Xo, $Xo
  1340. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1341. vpalignr \$8,$Xi,$Xi,$Xi
  1342. vpxor $T2,$Xi,$Xi
  1343. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1344. vpalignr \$8,$Xi,$Xi,$Xi
  1345. vpxor $Xo,$Xi,$Xi
  1346. vpxor $T2,$Xi,$Xi
  1347. cmp \$0,$len
  1348. jne .Lshort_avx
  1349. vpshufb $bswap,$Xi,$Xi
  1350. vmovdqu $Xi,($Xip)
  1351. vzeroupper
  1352. ___
  1353. $code.=<<___ if ($win64);
  1354. movaps (%rsp),%xmm6
  1355. movaps 0x10(%rsp),%xmm7
  1356. movaps 0x20(%rsp),%xmm8
  1357. movaps 0x30(%rsp),%xmm9
  1358. movaps 0x40(%rsp),%xmm10
  1359. movaps 0x50(%rsp),%xmm11
  1360. movaps 0x60(%rsp),%xmm12
  1361. movaps 0x70(%rsp),%xmm13
  1362. movaps 0x80(%rsp),%xmm14
  1363. movaps 0x90(%rsp),%xmm15
  1364. lea 0xa8(%rsp),%rsp
  1365. .LSEH_end_gcm_ghash_avx:
  1366. ___
  1367. $code.=<<___;
  1368. ret
  1369. .size gcm_ghash_avx,.-gcm_ghash_avx
  1370. ___
  1371. } else {
  1372. $code.=<<___;
  1373. jmp .L_ghash_clmul
  1374. .size gcm_ghash_avx,.-gcm_ghash_avx
  1375. ___
  1376. }
  1377. $code.=<<___;
  1378. .align 64
  1379. .Lbswap_mask:
  1380. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1381. .L0x1c2_polynomial:
  1382. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1383. .L7_mask:
  1384. .long 7,0,7,0
  1385. .L7_mask_poly:
  1386. .long 7,0,`0xE1<<1`,0
  1387. .align 64
  1388. .type .Lrem_4bit,\@object
  1389. .Lrem_4bit:
  1390. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1391. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1392. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1393. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
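# A note on the tables (my understanding, not from the original
# comments): .Lrem_4bit[n] above is the carry-less product n*0x1C2
# positioned in the top bits of the 64-bit entry, i.e. the correction
# xor'ed in when a nibble n is shifted off the low end of Z, and
# .Lrem_8bit below is the analogous 256-entry table gcm_ghash_4bit
# uses to retire eight remainder bits at a time.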
  1394. .type .Lrem_8bit,\@object
  1395. .Lrem_8bit:
  1396. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1397. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1398. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1399. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1400. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1401. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1402. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1403. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1404. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1405. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1406. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1407. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1408. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1409. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1410. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1411. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1412. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1413. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1414. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1415. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1416. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1417. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1418. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1419. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1420. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1421. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1422. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1423. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1424. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1425. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1426. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1427. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1428. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1429. .align 64
  1430. ___
  1431. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1432. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1433. if ($win64) {
  1434. $rec="%rcx";
  1435. $frame="%rdx";
  1436. $context="%r8";
  1437. $disp="%r9";
  1438. $code.=<<___;
  1439. .extern __imp_RtlVirtualUnwind
  1440. .type se_handler,\@abi-omnipotent
  1441. .align 16
  1442. se_handler:
  1443. push %rsi
  1444. push %rdi
  1445. push %rbx
  1446. push %rbp
  1447. push %r12
  1448. push %r13
  1449. push %r14
  1450. push %r15
  1451. pushfq
  1452. sub \$64,%rsp
  1453. mov 120($context),%rax # pull context->Rax
  1454. mov 248($context),%rbx # pull context->Rip
  1455. mov 8($disp),%rsi # disp->ImageBase
  1456. mov 56($disp),%r11 # disp->HandlerData
  1457. mov 0(%r11),%r10d # HandlerData[0]
  1458. lea (%rsi,%r10),%r10 # prologue label
  1459. cmp %r10,%rbx # context->Rip<prologue label
  1460. jb .Lin_prologue
  1461. mov 152($context),%rax # pull context->Rsp
  1462. mov 4(%r11),%r10d # HandlerData[1]
  1463. lea (%rsi,%r10),%r10 # epilogue label
  1464. cmp %r10,%rbx # context->Rip>=epilogue label
  1465. jae .Lin_prologue
  1466. lea 24(%rax),%rax # adjust "rsp"
  1467. mov -8(%rax),%rbx
  1468. mov -16(%rax),%rbp
  1469. mov -24(%rax),%r12
  1470. mov %rbx,144($context) # restore context->Rbx
  1471. mov %rbp,160($context) # restore context->Rbp
  1472. mov %r12,216($context) # restore context->R12
  1473. .Lin_prologue:
  1474. mov 8(%rax),%rdi
  1475. mov 16(%rax),%rsi
  1476. mov %rax,152($context) # restore context->Rsp
  1477. mov %rsi,168($context) # restore context->Rsi
  1478. mov %rdi,176($context) # restore context->Rdi
  1479. mov 40($disp),%rdi # disp->ContextRecord
  1480. mov $context,%rsi # context
  1481. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1482. .long 0xa548f3fc # cld; rep movsq
  1483. mov $disp,%rsi
  1484. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1485. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1486. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1487. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1488. mov 40(%rsi),%r10 # disp->ContextRecord
  1489. lea 56(%rsi),%r11 # &disp->HandlerData
  1490. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1491. mov %r10,32(%rsp) # arg5
  1492. mov %r11,40(%rsp) # arg6
  1493. mov %r12,48(%rsp) # arg7
  1494. mov %rcx,56(%rsp) # arg8, (NULL)
  1495. call *__imp_RtlVirtualUnwind(%rip)
  1496. mov \$1,%eax # ExceptionContinueSearch
  1497. add \$64,%rsp
  1498. popfq
  1499. pop %r15
  1500. pop %r14
  1501. pop %r13
  1502. pop %r12
  1503. pop %rbp
  1504. pop %rbx
  1505. pop %rdi
  1506. pop %rsi
  1507. ret
  1508. .size se_handler,.-se_handler
  1509. .section .pdata
  1510. .align 4
  1511. .rva .LSEH_begin_gcm_gmult_4bit
  1512. .rva .LSEH_end_gcm_gmult_4bit
  1513. .rva .LSEH_info_gcm_gmult_4bit
  1514. .rva .LSEH_begin_gcm_ghash_4bit
  1515. .rva .LSEH_end_gcm_ghash_4bit
  1516. .rva .LSEH_info_gcm_ghash_4bit
  1517. .rva .LSEH_begin_gcm_init_clmul
  1518. .rva .LSEH_end_gcm_init_clmul
  1519. .rva .LSEH_info_gcm_init_clmul
  1520. .rva .LSEH_begin_gcm_ghash_clmul
  1521. .rva .LSEH_end_gcm_ghash_clmul
  1522. .rva .LSEH_info_gcm_ghash_clmul
  1523. ___
  1524. $code.=<<___ if ($avx);
  1525. .rva .LSEH_begin_gcm_init_avx
  1526. .rva .LSEH_end_gcm_init_avx
  1527. .rva .LSEH_info_gcm_init_clmul
  1528. .rva .LSEH_begin_gcm_ghash_avx
  1529. .rva .LSEH_end_gcm_ghash_avx
  1530. .rva .LSEH_info_gcm_ghash_clmul
  1531. ___
  1532. $code.=<<___;
  1533. .section .xdata
  1534. .align 8
  1535. .LSEH_info_gcm_gmult_4bit:
  1536. .byte 9,0,0,0
  1537. .rva se_handler
  1538. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1539. .LSEH_info_gcm_ghash_4bit:
  1540. .byte 9,0,0,0
  1541. .rva se_handler
  1542. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1543. .LSEH_info_gcm_init_clmul:
  1544. .byte 0x01,0x08,0x03,0x00
  1545. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1546. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1547. .LSEH_info_gcm_ghash_clmul:
  1548. .byte 0x01,0x33,0x16,0x00
  1549. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1550. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1551. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1552. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1553. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1554. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1555. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1556. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1557. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1558. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1559. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1560. ___
  1561. }
  1562. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1563. print $code;
  1564. close STDOUT;