#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#              gcc 3.4.x(*)  assembler
#
# P4           28.6          14.0       +100%
# Opteron      19.3          7.7        +150%
# Core2        17.8          8.1(**)    +120%
# Atom         31.6          16.8       +88%
# VIA Nano     21.8          10.1       +115%
#
# (*)  comparison is not completely fair, because C results are
#      for the vanilla "256B" implementation, while assembler results
#      are for "528B";-)
# (**) it's a mystery [to me] why the Core2 result is not the same as
#      for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl discusses that it makes less sense to
# increase the aggregate factor there. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficient), but not for Bulldozer. The latter
# is because its logical SIMD operations are twice as slow as Intel's,
# so that the critical path is longer. A CPU with a higher pclmulqdq
# issue rate would also benefit from a higher aggregate factor...
#
# Westmere      1.78(+13%)
# Sandy Bridge  1.80(+8%)
# Ivy Bridge    1.80(+7%)
# Haswell       0.55(+93%) (if system doesn't support AVX)
# Bulldozer     1.49(+27%)

# March 2013
#
# ... the 8x aggregate factor AVX code path is using the reduction
# algorithm suggested by Shay Gueron[1]. Even though contemporary
# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
# code performs sub-optimally in comparison to the above mentioned
# version. But thanks to Ilya Albrekht and Max Locktyukhin of Intel
# Corp. we knew that it performs at 0.41 cycles per byte on a Haswell
# processor.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
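
# For reference, below is a minimal bit-by-bit model of the GF(2^128)
# multiplication whose result the assembler paths in this module compute
# (they use 4-bit tables or PCLMULQDQ instead of this naive loop). It is
# an illustrative sketch only, following the bit ordering and reduction
# constant of NIST SP 800-38D; the helper name gf128_mul_ref is not part
# of the original module and it is not called anywhere in this generator.

use Math::BigInt;

sub gf128_mul_ref {
	my ($x, $y) = @_;	# 16-byte blocks as big-endian Math::BigInt values
	# R = 0xE1 followed by 15 zero bytes, the GHASH reduction constant
	my $R = Math::BigInt->from_hex("0xe1000000000000000000000000000000");
	my $z = Math::BigInt->bzero();
	my $v = $y->copy();
	for my $i (0 .. 127) {
		# bit x_i in SP 800-38D numbering is bit (127-i) of the integer
		$z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
		my $lsb = $v->copy()->band(1)->is_one();
		$v->brsft(1);			# shift the bit string right by one
		$v->bxor($R) if ($lsb);		# conditionally reduce
	}
	return $z;
}
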
  75. $flavour = shift;
  76. $output = shift;
  77. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  78. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  79. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  80. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  81. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  82. die "can't locate x86_64-xlate.pl";
  83. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  84. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  85. $avx = ($1>=2.19) + ($1>=2.22);
  86. }
  87. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  88. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  89. $avx = ($1>=2.09) + ($1>=2.10);
  90. }
  91. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  92. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  93. $avx = ($1>=10) + ($1>=11);
  94. }
  95. if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
  96. $avx = ($2>=3.0) + ($2>3.0);
  97. }
  98. open OUT,"| \"$^X\" $xlate $flavour $output";
  99. *STDOUT=*OUT;
  100. $do4xaggr=1;
  101. # common register layout
  102. $nlo="%rax";
  103. $nhi="%rbx";
  104. $Zlo="%r8";
  105. $Zhi="%r9";
  106. $tmp="%r10";
  107. $rem_4bit = "%r11";
  108. $Xi="%rdi";
  109. $Htbl="%rsi";
  110. # per-function register layout
  111. $cnt="%rcx";
  112. $rem="%rdx";
  113. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  114. $r =~ s/%[er]([sd]i)/%\1l/ or
  115. $r =~ s/%[er](bp)/%\1l/ or
  116. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
  117. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  118. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  119. my $arg = pop;
  120. $arg = "\$$arg" if ($arg*1 eq $arg);
  121. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  122. }
  123. { my $N;
  124. sub loop() {
  125. my $inp = shift;
  126. $N++;
  127. $code.=<<___;
  128. xor $nlo,$nlo
  129. xor $nhi,$nhi
  130. mov `&LB("$Zlo")`,`&LB("$nlo")`
  131. mov `&LB("$Zlo")`,`&LB("$nhi")`
  132. shl \$4,`&LB("$nlo")`
  133. mov \$14,$cnt
  134. mov 8($Htbl,$nlo),$Zlo
  135. mov ($Htbl,$nlo),$Zhi
  136. and \$0xf0,`&LB("$nhi")`
  137. mov $Zlo,$rem
  138. jmp .Loop$N
  139. .align 16
  140. .Loop$N:
  141. shr \$4,$Zlo
  142. and \$0xf,$rem
  143. mov $Zhi,$tmp
  144. mov ($inp,$cnt),`&LB("$nlo")`
  145. shr \$4,$Zhi
  146. xor 8($Htbl,$nhi),$Zlo
  147. shl \$60,$tmp
  148. xor ($Htbl,$nhi),$Zhi
  149. mov `&LB("$nlo")`,`&LB("$nhi")`
  150. xor ($rem_4bit,$rem,8),$Zhi
  151. mov $Zlo,$rem
  152. shl \$4,`&LB("$nlo")`
  153. xor $tmp,$Zlo
  154. dec $cnt
  155. js .Lbreak$N
  156. shr \$4,$Zlo
  157. and \$0xf,$rem
  158. mov $Zhi,$tmp
  159. shr \$4,$Zhi
  160. xor 8($Htbl,$nlo),$Zlo
  161. shl \$60,$tmp
  162. xor ($Htbl,$nlo),$Zhi
  163. and \$0xf0,`&LB("$nhi")`
  164. xor ($rem_4bit,$rem,8),$Zhi
  165. mov $Zlo,$rem
  166. xor $tmp,$Zlo
  167. jmp .Loop$N
  168. .align 16
  169. .Lbreak$N:
  170. shr \$4,$Zlo
  171. and \$0xf,$rem
  172. mov $Zhi,$tmp
  173. shr \$4,$Zhi
  174. xor 8($Htbl,$nlo),$Zlo
  175. shl \$60,$tmp
  176. xor ($Htbl,$nlo),$Zhi
  177. and \$0xf0,`&LB("$nhi")`
  178. xor ($rem_4bit,$rem,8),$Zhi
  179. mov $Zlo,$rem
  180. xor $tmp,$Zlo
  181. shr \$4,$Zlo
  182. and \$0xf,$rem
  183. mov $Zhi,$tmp
  184. shr \$4,$Zhi
  185. xor 8($Htbl,$nhi),$Zlo
  186. shl \$60,$tmp
  187. xor ($Htbl,$nhi),$Zhi
  188. xor $tmp,$Zlo
  189. xor ($rem_4bit,$rem,8),$Zhi
  190. bswap $Zlo
  191. bswap $Zhi
  192. ___
  193. }}
  194. $code=<<___;
  195. .text
  196. .extern OPENSSL_ia32cap_P
  197. .globl gcm_gmult_4bit
  198. .type gcm_gmult_4bit,\@function,2
  199. .align 16
  200. gcm_gmult_4bit:
  201. push %rbx
  202. push %rbp # %rbp and %r12 are pushed exclusively in
  203. push %r12 # order to reuse Win64 exception handler...
  204. .Lgmult_prologue:
  205. movzb 15($Xi),$Zlo
  206. lea .Lrem_4bit(%rip),$rem_4bit
  207. ___
  208. &loop ($Xi);
  209. $code.=<<___;
  210. mov $Zlo,8($Xi)
  211. mov $Zhi,($Xi)
  212. mov 16(%rsp),%rbx
  213. lea 24(%rsp),%rsp
  214. .Lgmult_epilogue:
  215. ret
  216. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  217. ___
  218. # per-function register layout
  219. $inp="%rdx";
  220. $len="%rcx";
  221. $rem_8bit=$rem_4bit;
  222. $code.=<<___;
  223. .globl gcm_ghash_4bit
  224. .type gcm_ghash_4bit,\@function,4
  225. .align 16
  226. gcm_ghash_4bit:
  227. push %rbx
  228. push %rbp
  229. push %r12
  230. push %r13
  231. push %r14
  232. push %r15
  233. sub \$280,%rsp
  234. .Lghash_prologue:
  235. mov $inp,%r14 # reassign couple of args
  236. mov $len,%r15
  237. ___
  238. { my $inp="%r14";
  239. my $dat="%edx";
  240. my $len="%r15";
  241. my @nhi=("%ebx","%ecx");
  242. my @rem=("%r12","%r13");
  243. my $Hshr4="%rbp";
  244. &sub ($Htbl,-128); # size optimization
  245. &lea ($Hshr4,"16+128(%rsp)");
  246. { my @lo =($nlo,$nhi);
  247. my @hi =($Zlo,$Zhi);
  248. &xor ($dat,$dat);
  249. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  250. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  251. &or ($lo[0],$tmp) if ($i>1);
  252. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  253. &shr ($lo[1],4) if ($i>0 && $i<17);
  254. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  255. &shr ($hi[1],4) if ($i>0 && $i<17);
  256. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  257. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  258. &shl (&LB($dat),4) if ($i>0 && $i<17);
  259. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  260. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  261. &shl ($tmp,60) if ($i>0 && $i<17);
  262. push (@lo,shift(@lo));
  263. push (@hi,shift(@hi));
  264. }
  265. }
  266. &add ($Htbl,-128);
  267. &mov ($Zlo,"8($Xi)");
  268. &mov ($Zhi,"0($Xi)");
  269. &add ($len,$inp); # pointer to the end of data
  270. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  271. &jmp (".Louter_loop");
  272. $code.=".align 16\n.Louter_loop:\n";
  273. &xor ($Zhi,"($inp)");
  274. &mov ("%rdx","8($inp)");
  275. &lea ($inp,"16($inp)");
  276. &xor ("%rdx",$Zlo);
  277. &mov ("($Xi)",$Zhi);
  278. &mov ("8($Xi)","%rdx");
  279. &shr ("%rdx",32);
  280. &xor ($nlo,$nlo);
  281. &rol ($dat,8);
  282. &mov (&LB($nlo),&LB($dat));
  283. &movz ($nhi[0],&LB($dat));
  284. &shl (&LB($nlo),4);
  285. &shr ($nhi[0],4);
  286. for ($j=11,$i=0;$i<15;$i++) {
  287. &rol ($dat,8);
  288. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  289. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  290. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  291. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  292. &mov (&LB($nlo),&LB($dat));
  293. &xor ($Zlo,$tmp) if ($i>0);
  294. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  295. &movz ($nhi[1],&LB($dat));
  296. &shl (&LB($nlo),4);
  297. &movzb ($rem[0],"(%rsp,$nhi[0])");
  298. &shr ($nhi[1],4) if ($i<14);
  299. &and ($nhi[1],0xf0) if ($i==14);
  300. &shl ($rem[1],48) if ($i>0);
  301. &xor ($rem[0],$Zlo);
  302. &mov ($tmp,$Zhi);
  303. &xor ($Zhi,$rem[1]) if ($i>0);
  304. &shr ($Zlo,8);
  305. &movz ($rem[0],&LB($rem[0]));
  306. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  307. &shr ($Zhi,8);
  308. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  309. &shl ($tmp,56);
  310. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  311. unshift (@nhi,pop(@nhi)); # "rotate" registers
  312. unshift (@rem,pop(@rem));
  313. }
  314. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  315. &xor ($Zlo,"8($Htbl,$nlo)");
  316. &xor ($Zhi,"($Htbl,$nlo)");
  317. &shl ($rem[1],48);
  318. &xor ($Zlo,$tmp);
  319. &xor ($Zhi,$rem[1]);
  320. &movz ($rem[0],&LB($Zlo));
  321. &shr ($Zlo,4);
  322. &mov ($tmp,$Zhi);
  323. &shl (&LB($rem[0]),4);
  324. &shr ($Zhi,4);
  325. &xor ($Zlo,"8($Htbl,$nhi[0])");
  326. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  327. &shl ($tmp,60);
  328. &xor ($Zhi,"($Htbl,$nhi[0])");
  329. &xor ($Zlo,$tmp);
  330. &shl ($rem[0],48);
  331. &bswap ($Zlo);
  332. &xor ($Zhi,$rem[0]);
  333. &bswap ($Zhi);
  334. &cmp ($inp,$len);
  335. &jb (".Louter_loop");
  336. }
  337. $code.=<<___;
  338. mov $Zlo,8($Xi)
  339. mov $Zhi,($Xi)
  340. lea 280(%rsp),%rsi
  341. mov 0(%rsi),%r15
  342. mov 8(%rsi),%r14
  343. mov 16(%rsi),%r13
  344. mov 24(%rsi),%r12
  345. mov 32(%rsi),%rbp
  346. mov 40(%rsi),%rbx
  347. lea 48(%rsi),%rsp
  348. .Lghash_epilogue:
  349. ret
  350. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  351. ___
  352. ######################################################################
  353. # PCLMULQDQ version.
  354. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  355. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  356. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  357. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  358. sub clmul64x64_T2 { # minimal register pressure
  359. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  360. if (!defined($HK)) { $HK = $T2;
  361. $code.=<<___;
  362. movdqa $Xi,$Xhi #
  363. pshufd \$0b01001110,$Xi,$T1
  364. pshufd \$0b01001110,$Hkey,$T2
  365. pxor $Xi,$T1 #
  366. pxor $Hkey,$T2
  367. ___
  368. } else {
  369. $code.=<<___;
  370. movdqa $Xi,$Xhi #
  371. pshufd \$0b01001110,$Xi,$T1
  372. pxor $Xi,$T1 #
  373. ___
  374. }
  375. $code.=<<___;
  376. pclmulqdq \$0x00,$Hkey,$Xi #######
  377. pclmulqdq \$0x11,$Hkey,$Xhi #######
  378. pclmulqdq \$0x00,$HK,$T1 #######
  379. pxor $Xi,$T1 #
  380. pxor $Xhi,$T1 #
  381. movdqa $T1,$T2 #
  382. psrldq \$8,$T1
  383. pslldq \$8,$T2 #
  384. pxor $T1,$Xhi
  385. pxor $T2,$Xi #
  386. ___
  387. }
  388. sub reduction_alg9 { # 17/11 times faster than Intel version
  389. my ($Xhi,$Xi) = @_;
  390. $code.=<<___;
  391. # 1st phase
  392. movdqa $Xi,$T2 #
  393. movdqa $Xi,$T1
  394. psllq \$5,$Xi
  395. pxor $Xi,$T1 #
  396. psllq \$1,$Xi
  397. pxor $T1,$Xi #
  398. psllq \$57,$Xi #
  399. movdqa $Xi,$T1 #
  400. pslldq \$8,$Xi
  401. psrldq \$8,$T1 #
  402. pxor $T2,$Xi
  403. pxor $T1,$Xhi #
  404. # 2nd phase
  405. movdqa $Xi,$T2
  406. psrlq \$1,$Xi
  407. pxor $T2,$Xhi #
  408. pxor $Xi,$T2
  409. psrlq \$5,$Xi
  410. pxor $T2,$Xi #
  411. psrlq \$1,$Xi #
  412. pxor $Xhi,$Xi #
  413. ___
  414. }
  415. { my ($Htbl,$Xip)=@_4args;
  416. my $HK="%xmm6";
  417. $code.=<<___;
  418. .globl gcm_init_clmul
  419. .type gcm_init_clmul,\@abi-omnipotent
  420. .align 16
  421. gcm_init_clmul:
  422. .L_init_clmul:
  423. ___
  424. $code.=<<___ if ($win64);
  425. .LSEH_begin_gcm_init_clmul:
  426. # I can't trust assembler to use specific encoding:-(
  427. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  428. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  429. ___
  430. $code.=<<___;
  431. movdqu ($Xip),$Hkey
  432. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  433. # <<1 twist
  434. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  435. movdqa $Hkey,$T1
  436. psllq \$1,$Hkey
  437. pxor $T3,$T3 #
  438. psrlq \$63,$T1
  439. pcmpgtd $T2,$T3 # broadcast carry bit
  440. pslldq \$8,$T1
  441. por $T1,$Hkey # H<<=1
  442. # magic reduction
  443. pand .L0x1c2_polynomial(%rip),$T3
  444. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  445. # calculate H^2
  446. pshufd \$0b01001110,$Hkey,$HK
  447. movdqa $Hkey,$Xi
  448. pxor $Hkey,$HK
  449. ___
  450. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  451. &reduction_alg9 ($Xhi,$Xi);
  452. $code.=<<___;
  453. pshufd \$0b01001110,$Hkey,$T1
  454. pshufd \$0b01001110,$Xi,$T2
  455. pxor $Hkey,$T1 # Karatsuba pre-processing
  456. movdqu $Hkey,0x00($Htbl) # save H
  457. pxor $Xi,$T2 # Karatsuba pre-processing
  458. movdqu $Xi,0x10($Htbl) # save H^2
  459. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  460. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  461. ___
  462. if ($do4xaggr) {
  463. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  464. &reduction_alg9 ($Xhi,$Xi);
  465. $code.=<<___;
  466. movdqa $Xi,$T3
  467. ___
  468. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  469. &reduction_alg9 ($Xhi,$Xi);
  470. $code.=<<___;
  471. pshufd \$0b01001110,$T3,$T1
  472. pshufd \$0b01001110,$Xi,$T2
  473. pxor $T3,$T1 # Karatsuba pre-processing
  474. movdqu $T3,0x30($Htbl) # save H^3
  475. pxor $Xi,$T2 # Karatsuba pre-processing
  476. movdqu $Xi,0x40($Htbl) # save H^4
  477. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  478. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  479. ___
  480. }
  481. $code.=<<___ if ($win64);
  482. movaps (%rsp),%xmm6
  483. lea 0x18(%rsp),%rsp
  484. .LSEH_end_gcm_init_clmul:
  485. ___
  486. $code.=<<___;
  487. ret
  488. .size gcm_init_clmul,.-gcm_init_clmul
  489. ___
  490. }
  491. { my ($Xip,$Htbl)=@_4args;
  492. $code.=<<___;
  493. .globl gcm_gmult_clmul
  494. .type gcm_gmult_clmul,\@abi-omnipotent
  495. .align 16
  496. gcm_gmult_clmul:
  497. .L_gmult_clmul:
  498. movdqu ($Xip),$Xi
  499. movdqa .Lbswap_mask(%rip),$T3
  500. movdqu ($Htbl),$Hkey
  501. movdqu 0x20($Htbl),$T2
  502. pshufb $T3,$Xi
  503. ___
  504. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
  505. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  506. # experimental alternative. the special thing about it is that there is
  507. # no dependency between the two multiplications...
  508. mov \$`0xE1<<1`,%eax
  509. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  510. mov \$0x07,%r11d
  511. movq %rax,$T1
  512. movq %r10,$T2
  513. movq %r11,$T3 # borrow $T3
  514. pand $Xi,$T3
  515. pshufb $T3,$T2 # ($Xi&7)·0xE0
  516. movq %rax,$T3
  517. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  518. pxor $Xi,$T2
  519. pslldq \$15,$T2
  520. paddd $T2,$T2 # <<(64+56+1)
  521. pxor $T2,$Xi
  522. pclmulqdq \$0x01,$T3,$Xi
  523. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  524. psrldq \$1,$T1
  525. pxor $T1,$Xhi
  526. pslldq \$7,$Xi
  527. pxor $Xhi,$Xi
  528. ___
  529. $code.=<<___;
  530. pshufb $T3,$Xi
  531. movdqu $Xi,($Xip)
  532. ret
  533. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  534. ___
  535. }
  536. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  537. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  538. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  539. $code.=<<___;
  540. .globl gcm_ghash_clmul
  541. .type gcm_ghash_clmul,\@abi-omnipotent
  542. .align 32
  543. gcm_ghash_clmul:
  544. .L_ghash_clmul:
  545. ___
  546. $code.=<<___ if ($win64);
  547. lea -0x88(%rsp),%rax
  548. .LSEH_begin_gcm_ghash_clmul:
  549. # I can't trust assembler to use specific encoding:-(
  550. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  551. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  552. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  553. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  554. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  555. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  556. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  557. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  558. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  559. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  560. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  561. ___
  562. $code.=<<___;
  563. movdqa .Lbswap_mask(%rip),$T3
  564. movdqu ($Xip),$Xi
  565. movdqu ($Htbl),$Hkey
  566. movdqu 0x20($Htbl),$HK
  567. pshufb $T3,$Xi
  568. sub \$0x10,$len
  569. jz .Lodd_tail
  570. movdqu 0x10($Htbl),$Hkey2
  571. ___
  572. if ($do4xaggr) {
  573. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  574. $code.=<<___;
  575. mov OPENSSL_ia32cap_P+4(%rip),%eax
  576. cmp \$0x30,$len
  577. jb .Lskip4x
  578. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  579. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  580. je .Lskip4x
  581. sub \$0x30,$len
  582. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  583. movdqu 0x30($Htbl),$Hkey3
  584. movdqu 0x40($Htbl),$Hkey4
  585. #######
  586. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  587. #
  588. movdqu 0x30($inp),$Xln
  589. movdqu 0x20($inp),$Xl
  590. pshufb $T3,$Xln
  591. pshufb $T3,$Xl
  592. movdqa $Xln,$Xhn
  593. pshufd \$0b01001110,$Xln,$Xmn
  594. pxor $Xln,$Xmn
  595. pclmulqdq \$0x00,$Hkey,$Xln
  596. pclmulqdq \$0x11,$Hkey,$Xhn
  597. pclmulqdq \$0x00,$HK,$Xmn
  598. movdqa $Xl,$Xh
  599. pshufd \$0b01001110,$Xl,$Xm
  600. pxor $Xl,$Xm
  601. pclmulqdq \$0x00,$Hkey2,$Xl
  602. pclmulqdq \$0x11,$Hkey2,$Xh
  603. pclmulqdq \$0x10,$HK,$Xm
  604. xorps $Xl,$Xln
  605. xorps $Xh,$Xhn
  606. movups 0x50($Htbl),$HK
  607. xorps $Xm,$Xmn
  608. movdqu 0x10($inp),$Xl
  609. movdqu 0($inp),$T1
  610. pshufb $T3,$Xl
  611. pshufb $T3,$T1
  612. movdqa $Xl,$Xh
  613. pshufd \$0b01001110,$Xl,$Xm
  614. pxor $T1,$Xi
  615. pxor $Xl,$Xm
  616. pclmulqdq \$0x00,$Hkey3,$Xl
  617. movdqa $Xi,$Xhi
  618. pshufd \$0b01001110,$Xi,$T1
  619. pxor $Xi,$T1
  620. pclmulqdq \$0x11,$Hkey3,$Xh
  621. pclmulqdq \$0x00,$HK,$Xm
  622. xorps $Xl,$Xln
  623. xorps $Xh,$Xhn
  624. lea 0x40($inp),$inp
  625. sub \$0x40,$len
  626. jc .Ltail4x
  627. jmp .Lmod4_loop
  628. .align 32
  629. .Lmod4_loop:
  630. pclmulqdq \$0x00,$Hkey4,$Xi
  631. xorps $Xm,$Xmn
  632. movdqu 0x30($inp),$Xl
  633. pshufb $T3,$Xl
  634. pclmulqdq \$0x11,$Hkey4,$Xhi
  635. xorps $Xln,$Xi
  636. movdqu 0x20($inp),$Xln
  637. movdqa $Xl,$Xh
  638. pclmulqdq \$0x10,$HK,$T1
  639. pshufd \$0b01001110,$Xl,$Xm
  640. xorps $Xhn,$Xhi
  641. pxor $Xl,$Xm
  642. pshufb $T3,$Xln
  643. movups 0x20($Htbl),$HK
  644. xorps $Xmn,$T1
  645. pclmulqdq \$0x00,$Hkey,$Xl
  646. pshufd \$0b01001110,$Xln,$Xmn
  647. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  648. movdqa $Xln,$Xhn
  649. pxor $Xhi,$T1 #
  650. pxor $Xln,$Xmn
  651. movdqa $T1,$T2 #
  652. pclmulqdq \$0x11,$Hkey,$Xh
  653. pslldq \$8,$T1
  654. psrldq \$8,$T2 #
  655. pxor $T1,$Xi
  656. movdqa .L7_mask(%rip),$T1
  657. pxor $T2,$Xhi #
  658. movq %rax,$T2
  659. pand $Xi,$T1 # 1st phase
  660. pshufb $T1,$T2 #
  661. pxor $Xi,$T2 #
  662. pclmulqdq \$0x00,$HK,$Xm
  663. psllq \$57,$T2 #
  664. movdqa $T2,$T1 #
  665. pslldq \$8,$T2
  666. pclmulqdq \$0x00,$Hkey2,$Xln
  667. psrldq \$8,$T1 #
  668. pxor $T2,$Xi
  669. pxor $T1,$Xhi #
  670. movdqu 0($inp),$T1
  671. movdqa $Xi,$T2 # 2nd phase
  672. psrlq \$1,$Xi
  673. pclmulqdq \$0x11,$Hkey2,$Xhn
  674. xorps $Xl,$Xln
  675. movdqu 0x10($inp),$Xl
  676. pshufb $T3,$Xl
  677. pclmulqdq \$0x10,$HK,$Xmn
  678. xorps $Xh,$Xhn
  679. movups 0x50($Htbl),$HK
  680. pshufb $T3,$T1
  681. pxor $T2,$Xhi #
  682. pxor $Xi,$T2
  683. psrlq \$5,$Xi
  684. movdqa $Xl,$Xh
  685. pxor $Xm,$Xmn
  686. pshufd \$0b01001110,$Xl,$Xm
  687. pxor $T2,$Xi #
  688. pxor $T1,$Xhi
  689. pxor $Xl,$Xm
  690. pclmulqdq \$0x00,$Hkey3,$Xl
  691. psrlq \$1,$Xi #
  692. pxor $Xhi,$Xi #
  693. movdqa $Xi,$Xhi
  694. pclmulqdq \$0x11,$Hkey3,$Xh
  695. xorps $Xl,$Xln
  696. pshufd \$0b01001110,$Xi,$T1
  697. pxor $Xi,$T1
  698. pclmulqdq \$0x00,$HK,$Xm
  699. xorps $Xh,$Xhn
  700. lea 0x40($inp),$inp
  701. sub \$0x40,$len
  702. jnc .Lmod4_loop
  703. .Ltail4x:
  704. pclmulqdq \$0x00,$Hkey4,$Xi
  705. pclmulqdq \$0x11,$Hkey4,$Xhi
  706. pclmulqdq \$0x10,$HK,$T1
  707. xorps $Xm,$Xmn
  708. xorps $Xln,$Xi
  709. xorps $Xhn,$Xhi
  710. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  711. pxor $Xmn,$T1
  712. pxor $Xhi,$T1 #
  713. pxor $Xi,$Xhi
  714. movdqa $T1,$T2 #
  715. psrldq \$8,$T1
  716. pslldq \$8,$T2 #
  717. pxor $T1,$Xhi
  718. pxor $T2,$Xi #
  719. ___
  720. &reduction_alg9($Xhi,$Xi);
  721. $code.=<<___;
  722. add \$0x40,$len
  723. jz .Ldone
  724. movdqu 0x20($Htbl),$HK
  725. sub \$0x10,$len
  726. jz .Lodd_tail
  727. .Lskip4x:
  728. ___
  729. }
  730. $code.=<<___;
  731. #######
  732. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  733. # [(H*Ii+1) + (H*Xi+1)] mod P =
  734. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  735. #
  736. movdqu ($inp),$T1 # Ii
  737. movdqu 16($inp),$Xln # Ii+1
  738. pshufb $T3,$T1
  739. pshufb $T3,$Xln
  740. pxor $T1,$Xi # Ii+Xi
  741. movdqa $Xln,$Xhn
  742. pshufd \$0b01001110,$Xln,$Xmn
  743. pxor $Xln,$Xmn
  744. pclmulqdq \$0x00,$Hkey,$Xln
  745. pclmulqdq \$0x11,$Hkey,$Xhn
  746. pclmulqdq \$0x00,$HK,$Xmn
  747. lea 32($inp),$inp # i+=2
  748. nop
  749. sub \$0x20,$len
  750. jbe .Leven_tail
  751. nop
  752. jmp .Lmod_loop
  753. .align 32
  754. .Lmod_loop:
  755. movdqa $Xi,$Xhi
  756. movdqa $Xmn,$T1
  757. pshufd \$0b01001110,$Xi,$Xmn #
  758. pxor $Xi,$Xmn #
  759. pclmulqdq \$0x00,$Hkey2,$Xi
  760. pclmulqdq \$0x11,$Hkey2,$Xhi
  761. pclmulqdq \$0x10,$HK,$Xmn
  762. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  763. pxor $Xhn,$Xhi
  764. movdqu ($inp),$T2 # Ii
  765. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  766. pshufb $T3,$T2
  767. movdqu 16($inp),$Xln # Ii+1
  768. pxor $Xhi,$T1
  769. pxor $T2,$Xhi # "Ii+Xi", consume early
  770. pxor $T1,$Xmn
  771. pshufb $T3,$Xln
  772. movdqa $Xmn,$T1 #
  773. psrldq \$8,$T1
  774. pslldq \$8,$Xmn #
  775. pxor $T1,$Xhi
  776. pxor $Xmn,$Xi #
  777. movdqa $Xln,$Xhn #
  778. movdqa $Xi,$T2 # 1st phase
  779. movdqa $Xi,$T1
  780. psllq \$5,$Xi
  781. pxor $Xi,$T1 #
  782. pclmulqdq \$0x00,$Hkey,$Xln #######
  783. psllq \$1,$Xi
  784. pxor $T1,$Xi #
  785. psllq \$57,$Xi #
  786. movdqa $Xi,$T1 #
  787. pslldq \$8,$Xi
  788. psrldq \$8,$T1 #
  789. pxor $T2,$Xi
  790. pshufd \$0b01001110,$Xhn,$Xmn
  791. pxor $T1,$Xhi #
  792. pxor $Xhn,$Xmn #
  793. movdqa $Xi,$T2 # 2nd phase
  794. psrlq \$1,$Xi
  795. pclmulqdq \$0x11,$Hkey,$Xhn #######
  796. pxor $T2,$Xhi #
  797. pxor $Xi,$T2
  798. psrlq \$5,$Xi
  799. pxor $T2,$Xi #
  800. lea 32($inp),$inp
  801. psrlq \$1,$Xi #
  802. pclmulqdq \$0x00,$HK,$Xmn #######
  803. pxor $Xhi,$Xi #
  804. sub \$0x20,$len
  805. ja .Lmod_loop
  806. .Leven_tail:
  807. movdqa $Xi,$Xhi
  808. movdqa $Xmn,$T1
  809. pshufd \$0b01001110,$Xi,$Xmn #
  810. pxor $Xi,$Xmn #
  811. pclmulqdq \$0x00,$Hkey2,$Xi
  812. pclmulqdq \$0x11,$Hkey2,$Xhi
  813. pclmulqdq \$0x10,$HK,$Xmn
  814. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  815. pxor $Xhn,$Xhi
  816. pxor $Xi,$T1
  817. pxor $Xhi,$T1
  818. pxor $T1,$Xmn
  819. movdqa $Xmn,$T1 #
  820. psrldq \$8,$T1
  821. pslldq \$8,$Xmn #
  822. pxor $T1,$Xhi
  823. pxor $Xmn,$Xi #
  824. ___
  825. &reduction_alg9 ($Xhi,$Xi);
  826. $code.=<<___;
  827. test $len,$len
  828. jnz .Ldone
  829. .Lodd_tail:
  830. movdqu ($inp),$T1 # Ii
  831. pshufb $T3,$T1
  832. pxor $T1,$Xi # Ii+Xi
  833. ___
  834. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  835. &reduction_alg9 ($Xhi,$Xi);
  836. $code.=<<___;
  837. .Ldone:
  838. pshufb $T3,$Xi
  839. movdqu $Xi,($Xip)
  840. ___
  841. $code.=<<___ if ($win64);
  842. movaps (%rsp),%xmm6
  843. movaps 0x10(%rsp),%xmm7
  844. movaps 0x20(%rsp),%xmm8
  845. movaps 0x30(%rsp),%xmm9
  846. movaps 0x40(%rsp),%xmm10
  847. movaps 0x50(%rsp),%xmm11
  848. movaps 0x60(%rsp),%xmm12
  849. movaps 0x70(%rsp),%xmm13
  850. movaps 0x80(%rsp),%xmm14
  851. movaps 0x90(%rsp),%xmm15
  852. lea 0xa8(%rsp),%rsp
  853. .LSEH_end_gcm_ghash_clmul:
  854. ___
  855. $code.=<<___;
  856. ret
  857. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  858. ___
  859. }
  860. $code.=<<___;
  861. .globl gcm_init_avx
  862. .type gcm_init_avx,\@abi-omnipotent
  863. .align 32
  864. gcm_init_avx:
  865. ___
  866. if ($avx) {
  867. my ($Htbl,$Xip)=@_4args;
  868. my $HK="%xmm6";
  869. $code.=<<___ if ($win64);
  870. .LSEH_begin_gcm_init_avx:
  871. # I can't trust assembler to use specific encoding:-(
  872. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  873. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  874. ___
  875. $code.=<<___;
  876. vzeroupper
  877. vmovdqu ($Xip),$Hkey
  878. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  879. # <<1 twist
  880. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  881. vpsrlq \$63,$Hkey,$T1
  882. vpsllq \$1,$Hkey,$Hkey
  883. vpxor $T3,$T3,$T3 #
  884. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  885. vpslldq \$8,$T1,$T1
  886. vpor $T1,$Hkey,$Hkey # H<<=1
  887. # magic reduction
  888. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  889. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  890. vpunpckhqdq $Hkey,$Hkey,$HK
  891. vmovdqa $Hkey,$Xi
  892. vpxor $Hkey,$HK,$HK
  893. mov \$4,%r10 # up to H^8
  894. jmp .Linit_start_avx
  895. ___
  896. sub clmul64x64_avx {
  897. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  898. if (!defined($HK)) { $HK = $T2;
  899. $code.=<<___;
  900. vpunpckhqdq $Xi,$Xi,$T1
  901. vpunpckhqdq $Hkey,$Hkey,$T2
  902. vpxor $Xi,$T1,$T1 #
  903. vpxor $Hkey,$T2,$T2
  904. ___
  905. } else {
  906. $code.=<<___;
  907. vpunpckhqdq $Xi,$Xi,$T1
  908. vpxor $Xi,$T1,$T1 #
  909. ___
  910. }
  911. $code.=<<___;
  912. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  913. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  914. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  915. vpxor $Xi,$Xhi,$T2 #
  916. vpxor $T2,$T1,$T1 #
  917. vpslldq \$8,$T1,$T2 #
  918. vpsrldq \$8,$T1,$T1
  919. vpxor $T2,$Xi,$Xi #
  920. vpxor $T1,$Xhi,$Xhi
  921. ___
  922. }
  923. sub reduction_avx {
  924. my ($Xhi,$Xi) = @_;
  925. $code.=<<___;
  926. vpsllq \$57,$Xi,$T1 # 1st phase
  927. vpsllq \$62,$Xi,$T2
  928. vpxor $T1,$T2,$T2 #
  929. vpsllq \$63,$Xi,$T1
  930. vpxor $T1,$T2,$T2 #
  931. vpslldq \$8,$T2,$T1 #
  932. vpsrldq \$8,$T2,$T2
  933. vpxor $T1,$Xi,$Xi #
  934. vpxor $T2,$Xhi,$Xhi
  935. vpsrlq \$1,$Xi,$T2 # 2nd phase
  936. vpxor $Xi,$Xhi,$Xhi
  937. vpxor $T2,$Xi,$Xi #
  938. vpsrlq \$5,$T2,$T2
  939. vpxor $T2,$Xi,$Xi #
  940. vpsrlq \$1,$Xi,$Xi #
  941. vpxor $Xhi,$Xi,$Xi #
  942. ___
  943. }
  944. $code.=<<___;
  945. .align 32
  946. .Linit_loop_avx:
  947. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  948. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  949. ___
  950. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  951. &reduction_avx ($Xhi,$Xi);
  952. $code.=<<___;
  953. .Linit_start_avx:
  954. vmovdqa $Xi,$T3
  955. ___
  956. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  957. &reduction_avx ($Xhi,$Xi);
  958. $code.=<<___;
  959. vpshufd \$0b01001110,$T3,$T1
  960. vpshufd \$0b01001110,$Xi,$T2
  961. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  962. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  963. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  964. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  965. lea 0x30($Htbl),$Htbl
  966. sub \$1,%r10
  967. jnz .Linit_loop_avx
  968. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  969. vmovdqu $T3,-0x10($Htbl)
  970. vzeroupper
  971. ___
  972. $code.=<<___ if ($win64);
  973. movaps (%rsp),%xmm6
  974. lea 0x18(%rsp),%rsp
  975. .LSEH_end_gcm_init_avx:
  976. ___
  977. $code.=<<___;
  978. ret
  979. .size gcm_init_avx,.-gcm_init_avx
  980. ___
  981. } else {
  982. $code.=<<___;
  983. jmp .L_init_clmul
  984. .size gcm_init_avx,.-gcm_init_avx
  985. ___
  986. }
  987. $code.=<<___;
  988. .globl gcm_gmult_avx
  989. .type gcm_gmult_avx,\@abi-omnipotent
  990. .align 32
  991. gcm_gmult_avx:
  992. jmp .L_gmult_clmul
  993. .size gcm_gmult_avx,.-gcm_gmult_avx
  994. ___
  995. $code.=<<___;
  996. .globl gcm_ghash_avx
  997. .type gcm_ghash_avx,\@abi-omnipotent
  998. .align 32
  999. gcm_ghash_avx:
  1000. ___
  1001. if ($avx) {
  1002. my ($Xip,$Htbl,$inp,$len)=@_4args;
  1003. my ($Xlo,$Xhi,$Xmi,
  1004. $Zlo,$Zhi,$Zmi,
  1005. $Hkey,$HK,$T1,$T2,
  1006. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  1007. $code.=<<___ if ($win64);
  1008. lea -0x88(%rsp),%rax
  1009. .LSEH_begin_gcm_ghash_avx:
  1010. # I can't trust assembler to use specific encoding:-(
  1011. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1012. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1013. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1014. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1015. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1016. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1017. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1018. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1019. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1020. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1021. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1022. ___
  1023. $code.=<<___;
  1024. vzeroupper
  1025. vmovdqu ($Xip),$Xi # load $Xi
  1026. lea .L0x1c2_polynomial(%rip),%r10
  1027. lea 0x40($Htbl),$Htbl # size optimization
  1028. vmovdqu .Lbswap_mask(%rip),$bswap
  1029. vpshufb $bswap,$Xi,$Xi
  1030. cmp \$0x80,$len
  1031. jb .Lshort_avx
  1032. sub \$0x80,$len
  1033. vmovdqu 0x70($inp),$Ii # I[7]
  1034. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1035. vpshufb $bswap,$Ii,$Ii
  1036. vmovdqu 0x20-0x40($Htbl),$HK
  1037. vpunpckhqdq $Ii,$Ii,$T2
  1038. vmovdqu 0x60($inp),$Ij # I[6]
  1039. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1040. vpxor $Ii,$T2,$T2
  1041. vpshufb $bswap,$Ij,$Ij
  1042. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1043. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1044. vpunpckhqdq $Ij,$Ij,$T1
  1045. vmovdqu 0x50($inp),$Ii # I[5]
  1046. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1047. vpxor $Ij,$T1,$T1
  1048. vpshufb $bswap,$Ii,$Ii
  1049. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1050. vpunpckhqdq $Ii,$Ii,$T2
  1051. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1052. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1053. vpxor $Ii,$T2,$T2
  1054. vmovdqu 0x40($inp),$Ij # I[4]
  1055. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1056. vmovdqu 0x50-0x40($Htbl),$HK
  1057. vpshufb $bswap,$Ij,$Ij
  1058. vpxor $Xlo,$Zlo,$Zlo
  1059. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1060. vpxor $Xhi,$Zhi,$Zhi
  1061. vpunpckhqdq $Ij,$Ij,$T1
  1062. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1063. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1064. vpxor $Xmi,$Zmi,$Zmi
  1065. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1066. vpxor $Ij,$T1,$T1
  1067. vmovdqu 0x30($inp),$Ii # I[3]
  1068. vpxor $Zlo,$Xlo,$Xlo
  1069. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1070. vpxor $Zhi,$Xhi,$Xhi
  1071. vpshufb $bswap,$Ii,$Ii
  1072. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1073. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1074. vpxor $Zmi,$Xmi,$Xmi
  1075. vpunpckhqdq $Ii,$Ii,$T2
  1076. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1077. vmovdqu 0x80-0x40($Htbl),$HK
  1078. vpxor $Ii,$T2,$T2
  1079. vmovdqu 0x20($inp),$Ij # I[2]
  1080. vpxor $Xlo,$Zlo,$Zlo
  1081. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1082. vpxor $Xhi,$Zhi,$Zhi
  1083. vpshufb $bswap,$Ij,$Ij
  1084. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1085. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1086. vpxor $Xmi,$Zmi,$Zmi
  1087. vpunpckhqdq $Ij,$Ij,$T1
  1088. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1089. vpxor $Ij,$T1,$T1
  1090. vmovdqu 0x10($inp),$Ii # I[1]
  1091. vpxor $Zlo,$Xlo,$Xlo
  1092. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1093. vpxor $Zhi,$Xhi,$Xhi
  1094. vpshufb $bswap,$Ii,$Ii
  1095. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1096. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1097. vpxor $Zmi,$Xmi,$Xmi
  1098. vpunpckhqdq $Ii,$Ii,$T2
  1099. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1100. vmovdqu 0xb0-0x40($Htbl),$HK
  1101. vpxor $Ii,$T2,$T2
  1102. vmovdqu ($inp),$Ij # I[0]
  1103. vpxor $Xlo,$Zlo,$Zlo
  1104. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1105. vpxor $Xhi,$Zhi,$Zhi
  1106. vpshufb $bswap,$Ij,$Ij
  1107. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1108. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1109. vpxor $Xmi,$Zmi,$Zmi
  1110. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1111. lea 0x80($inp),$inp
  1112. cmp \$0x80,$len
  1113. jb .Ltail_avx
  1114. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1115. sub \$0x80,$len
  1116. jmp .Loop8x_avx
  1117. .align 32
  1118. .Loop8x_avx:
  1119. vpunpckhqdq $Ij,$Ij,$T1
  1120. vmovdqu 0x70($inp),$Ii # I[7]
  1121. vpxor $Xlo,$Zlo,$Zlo
  1122. vpxor $Ij,$T1,$T1
  1123. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1124. vpshufb $bswap,$Ii,$Ii
  1125. vpxor $Xhi,$Zhi,$Zhi
  1126. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1127. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1128. vpunpckhqdq $Ii,$Ii,$T2
  1129. vpxor $Xmi,$Zmi,$Zmi
  1130. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1131. vmovdqu 0x20-0x40($Htbl),$HK
  1132. vpxor $Ii,$T2,$T2
  1133. vmovdqu 0x60($inp),$Ij # I[6]
  1134. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1135. vpxor $Zlo,$Xi,$Xi # collect result
  1136. vpshufb $bswap,$Ij,$Ij
  1137. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1138. vxorps $Zhi,$Xo,$Xo
  1139. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1140. vpunpckhqdq $Ij,$Ij,$T1
  1141. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1142. vpxor $Zmi,$Tred,$Tred
  1143. vxorps $Ij,$T1,$T1
  1144. vmovdqu 0x50($inp),$Ii # I[5]
  1145. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1146. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1147. vpxor $Xo,$Tred,$Tred
  1148. vpslldq \$8,$Tred,$T2
  1149. vpxor $Xlo,$Zlo,$Zlo
  1150. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1151. vpsrldq \$8,$Tred,$Tred
  1152. vpxor $T2, $Xi, $Xi
  1153. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1154. vpshufb $bswap,$Ii,$Ii
  1155. vxorps $Tred,$Xo, $Xo
  1156. vpxor $Xhi,$Zhi,$Zhi
  1157. vpunpckhqdq $Ii,$Ii,$T2
  1158. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1159. vmovdqu 0x50-0x40($Htbl),$HK
  1160. vpxor $Ii,$T2,$T2
  1161. vpxor $Xmi,$Zmi,$Zmi
  1162. vmovdqu 0x40($inp),$Ij # I[4]
  1163. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1164. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1165. vpshufb $bswap,$Ij,$Ij
  1166. vpxor $Zlo,$Xlo,$Xlo
  1167. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1168. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1169. vpunpckhqdq $Ij,$Ij,$T1
  1170. vpxor $Zhi,$Xhi,$Xhi
  1171. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1172. vxorps $Ij,$T1,$T1
  1173. vpxor $Zmi,$Xmi,$Xmi
  1174. vmovdqu 0x30($inp),$Ii # I[3]
  1175. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1176. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1177. vpshufb $bswap,$Ii,$Ii
  1178. vpxor $Xlo,$Zlo,$Zlo
  1179. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1180. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1181. vpunpckhqdq $Ii,$Ii,$T2
  1182. vpxor $Xhi,$Zhi,$Zhi
  1183. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1184. vmovdqu 0x80-0x40($Htbl),$HK
  1185. vpxor $Ii,$T2,$T2
  1186. vpxor $Xmi,$Zmi,$Zmi
  1187. vmovdqu 0x20($inp),$Ij # I[2]
  1188. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1189. vpshufb $bswap,$Ij,$Ij
  1190. vpxor $Zlo,$Xlo,$Xlo
  1191. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1192. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1193. vpunpckhqdq $Ij,$Ij,$T1
  1194. vpxor $Zhi,$Xhi,$Xhi
  1195. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1196. vpxor $Ij,$T1,$T1
  1197. vpxor $Zmi,$Xmi,$Xmi
  1198. vxorps $Tred,$Xi,$Xi
  1199. vmovdqu 0x10($inp),$Ii # I[1]
  1200. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1201. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1202. vpshufb $bswap,$Ii,$Ii
  1203. vpxor $Xlo,$Zlo,$Zlo
  1204. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1205. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1206. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1207. vxorps $Xo,$Tred,$Tred
  1208. vpunpckhqdq $Ii,$Ii,$T2
  1209. vpxor $Xhi,$Zhi,$Zhi
  1210. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1211. vmovdqu 0xb0-0x40($Htbl),$HK
  1212. vpxor $Ii,$T2,$T2
  1213. vpxor $Xmi,$Zmi,$Zmi
  1214. vmovdqu ($inp),$Ij # I[0]
  1215. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1216. vpshufb $bswap,$Ij,$Ij
  1217. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1218. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1219. vpxor $Tred,$Ij,$Ij
  1220. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1221. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1222. lea 0x80($inp),$inp
  1223. sub \$0x80,$len
  1224. jnc .Loop8x_avx
  1225. add \$0x80,$len
  1226. jmp .Ltail_no_xor_avx
  1227. .align 32
  1228. .Lshort_avx:
  1229. vmovdqu -0x10($inp,$len),$Ii # very last word
  1230. lea ($inp,$len),$inp
  1231. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1232. vmovdqu 0x20-0x40($Htbl),$HK
  1233. vpshufb $bswap,$Ii,$Ij
  1234. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1235. vmovdqa $Xhi,$Zhi # $Zhi and
  1236. vmovdqa $Xmi,$Zmi # $Zmi
  1237. sub \$0x10,$len
  1238. jz .Ltail_avx
  1239. vpunpckhqdq $Ij,$Ij,$T1
  1240. vpxor $Xlo,$Zlo,$Zlo
  1241. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1242. vpxor $Ij,$T1,$T1
  1243. vmovdqu -0x20($inp),$Ii
  1244. vpxor $Xhi,$Zhi,$Zhi
  1245. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1246. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1247. vpshufb $bswap,$Ii,$Ij
  1248. vpxor $Xmi,$Zmi,$Zmi
  1249. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1250. vpsrldq \$8,$HK,$HK
  1251. sub \$0x10,$len
  1252. jz .Ltail_avx
  1253. vpunpckhqdq $Ij,$Ij,$T1
  1254. vpxor $Xlo,$Zlo,$Zlo
  1255. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1256. vpxor $Ij,$T1,$T1
  1257. vmovdqu -0x30($inp),$Ii
  1258. vpxor $Xhi,$Zhi,$Zhi
  1259. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1260. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1261. vpshufb $bswap,$Ii,$Ij
  1262. vpxor $Xmi,$Zmi,$Zmi
  1263. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1264. vmovdqu 0x50-0x40($Htbl),$HK
  1265. sub \$0x10,$len
  1266. jz .Ltail_avx
  1267. vpunpckhqdq $Ij,$Ij,$T1
  1268. vpxor $Xlo,$Zlo,$Zlo
  1269. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1270. vpxor $Ij,$T1,$T1
  1271. vmovdqu -0x40($inp),$Ii
  1272. vpxor $Xhi,$Zhi,$Zhi
  1273. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1274. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1275. vpshufb $bswap,$Ii,$Ij
  1276. vpxor $Xmi,$Zmi,$Zmi
  1277. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1278. vpsrldq \$8,$HK,$HK
  1279. sub \$0x10,$len
  1280. jz .Ltail_avx
  1281. vpunpckhqdq $Ij,$Ij,$T1
  1282. vpxor $Xlo,$Zlo,$Zlo
  1283. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1284. vpxor $Ij,$T1,$T1
  1285. vmovdqu -0x50($inp),$Ii
  1286. vpxor $Xhi,$Zhi,$Zhi
  1287. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1288. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1289. vpshufb $bswap,$Ii,$Ij
  1290. vpxor $Xmi,$Zmi,$Zmi
  1291. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1292. vmovdqu 0x80-0x40($Htbl),$HK
  1293. sub \$0x10,$len
  1294. jz .Ltail_avx
  1295. vpunpckhqdq $Ij,$Ij,$T1
  1296. vpxor $Xlo,$Zlo,$Zlo
  1297. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1298. vpxor $Ij,$T1,$T1
  1299. vmovdqu -0x60($inp),$Ii
  1300. vpxor $Xhi,$Zhi,$Zhi
  1301. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1302. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1303. vpshufb $bswap,$Ii,$Ij
  1304. vpxor $Xmi,$Zmi,$Zmi
  1305. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1306. vpsrldq \$8,$HK,$HK
  1307. sub \$0x10,$len
  1308. jz .Ltail_avx
  1309. vpunpckhqdq $Ij,$Ij,$T1
  1310. vpxor $Xlo,$Zlo,$Zlo
  1311. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1312. vpxor $Ij,$T1,$T1
  1313. vmovdqu -0x70($inp),$Ii
  1314. vpxor $Xhi,$Zhi,$Zhi
  1315. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1316. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1317. vpshufb $bswap,$Ii,$Ij
  1318. vpxor $Xmi,$Zmi,$Zmi
  1319. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1320. vmovq 0xb8-0x40($Htbl),$HK
  1321. sub \$0x10,$len
  1322. jmp .Ltail_avx
  1323. .align 32
  1324. .Ltail_avx:
  1325. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1326. .Ltail_no_xor_avx:
  1327. vpunpckhqdq $Ij,$Ij,$T1
  1328. vpxor $Xlo,$Zlo,$Zlo
  1329. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1330. vpxor $Ij,$T1,$T1
  1331. vpxor $Xhi,$Zhi,$Zhi
  1332. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1333. vpxor $Xmi,$Zmi,$Zmi
  1334. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1335. vmovdqu (%r10),$Tred
  1336. vpxor $Xlo,$Zlo,$Xi
  1337. vpxor $Xhi,$Zhi,$Xo
  1338. vpxor $Xmi,$Zmi,$Zmi
  1339. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1340. vpxor $Xo, $Zmi,$Zmi
  1341. vpslldq \$8, $Zmi,$T2
  1342. vpsrldq \$8, $Zmi,$Zmi
  1343. vpxor $T2, $Xi, $Xi
  1344. vpxor $Zmi,$Xo, $Xo
  1345. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1346. vpalignr \$8,$Xi,$Xi,$Xi
  1347. vpxor $T2,$Xi,$Xi
  1348. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1349. vpalignr \$8,$Xi,$Xi,$Xi
  1350. vpxor $Xo,$Xi,$Xi
  1351. vpxor $T2,$Xi,$Xi
  1352. cmp \$0,$len
  1353. jne .Lshort_avx
  1354. vpshufb $bswap,$Xi,$Xi
  1355. vmovdqu $Xi,($Xip)
  1356. vzeroupper
  1357. ___
  1358. $code.=<<___ if ($win64);
  1359. movaps (%rsp),%xmm6
  1360. movaps 0x10(%rsp),%xmm7
  1361. movaps 0x20(%rsp),%xmm8
  1362. movaps 0x30(%rsp),%xmm9
  1363. movaps 0x40(%rsp),%xmm10
  1364. movaps 0x50(%rsp),%xmm11
  1365. movaps 0x60(%rsp),%xmm12
  1366. movaps 0x70(%rsp),%xmm13
  1367. movaps 0x80(%rsp),%xmm14
  1368. movaps 0x90(%rsp),%xmm15
  1369. lea 0xa8(%rsp),%rsp
  1370. .LSEH_end_gcm_ghash_avx:
  1371. ___
  1372. $code.=<<___;
  1373. ret
  1374. .size gcm_ghash_avx,.-gcm_ghash_avx
  1375. ___
  1376. } else {
  1377. $code.=<<___;
  1378. jmp .L_ghash_clmul
  1379. .size gcm_ghash_avx,.-gcm_ghash_avx
  1380. ___
  1381. }
  1382. $code.=<<___;
  1383. .align 64
  1384. .Lbswap_mask:
  1385. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1386. .L0x1c2_polynomial:
  1387. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1388. .L7_mask:
  1389. .long 7,0,7,0
  1390. .L7_mask_poly:
  1391. .long 7,0,`0xE1<<1`,0
  1392. .align 64
  1393. .type .Lrem_4bit,\@object
  1394. .Lrem_4bit:
  1395. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1396. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1397. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1398. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1399. .type .Lrem_8bit,\@object
  1400. .Lrem_8bit:
  1401. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1402. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1403. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1404. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1405. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1406. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1407. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1408. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1409. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1410. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1411. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1412. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1413. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1414. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1415. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1416. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1417. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1418. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1419. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1420. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1421. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1422. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1423. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1424. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1425. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1426. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1427. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1428. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1429. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1430. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1431. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1432. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1433. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1434. .align 64
  1435. ___
  1436. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1437. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1438. if ($win64) {
  1439. $rec="%rcx";
  1440. $frame="%rdx";
  1441. $context="%r8";
  1442. $disp="%r9";
  1443. $code.=<<___;
  1444. .extern __imp_RtlVirtualUnwind
  1445. .type se_handler,\@abi-omnipotent
  1446. .align 16
  1447. se_handler:
  1448. push %rsi
  1449. push %rdi
  1450. push %rbx
  1451. push %rbp
  1452. push %r12
  1453. push %r13
  1454. push %r14
  1455. push %r15
  1456. pushfq
  1457. sub \$64,%rsp
  1458. mov 120($context),%rax # pull context->Rax
  1459. mov 248($context),%rbx # pull context->Rip
  1460. mov 8($disp),%rsi # disp->ImageBase
  1461. mov 56($disp),%r11 # disp->HandlerData
  1462. mov 0(%r11),%r10d # HandlerData[0]
  1463. lea (%rsi,%r10),%r10 # prologue label
  1464. cmp %r10,%rbx # context->Rip<prologue label
  1465. jb .Lin_prologue
  1466. mov 152($context),%rax # pull context->Rsp
  1467. mov 4(%r11),%r10d # HandlerData[1]
  1468. lea (%rsi,%r10),%r10 # epilogue label
  1469. cmp %r10,%rbx # context->Rip>=epilogue label
  1470. jae .Lin_prologue
  1471. lea 24(%rax),%rax # adjust "rsp"
  1472. mov -8(%rax),%rbx
  1473. mov -16(%rax),%rbp
  1474. mov -24(%rax),%r12
  1475. mov %rbx,144($context) # restore context->Rbx
  1476. mov %rbp,160($context) # restore context->Rbp
  1477. mov %r12,216($context) # restore context->R12
  1478. .Lin_prologue:
  1479. mov 8(%rax),%rdi
  1480. mov 16(%rax),%rsi
  1481. mov %rax,152($context) # restore context->Rsp
  1482. mov %rsi,168($context) # restore context->Rsi
  1483. mov %rdi,176($context) # restore context->Rdi
  1484. mov 40($disp),%rdi # disp->ContextRecord
  1485. mov $context,%rsi # context
  1486. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1487. .long 0xa548f3fc # cld; rep movsq
  1488. mov $disp,%rsi
  1489. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1490. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1491. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1492. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1493. mov 40(%rsi),%r10 # disp->ContextRecord
  1494. lea 56(%rsi),%r11 # &disp->HandlerData
  1495. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1496. mov %r10,32(%rsp) # arg5
  1497. mov %r11,40(%rsp) # arg6
  1498. mov %r12,48(%rsp) # arg7
  1499. mov %rcx,56(%rsp) # arg8, (NULL)
  1500. call *__imp_RtlVirtualUnwind(%rip)
  1501. mov \$1,%eax # ExceptionContinueSearch
  1502. add \$64,%rsp
  1503. popfq
  1504. pop %r15
  1505. pop %r14
  1506. pop %r13
  1507. pop %r12
  1508. pop %rbp
  1509. pop %rbx
  1510. pop %rdi
  1511. pop %rsi
  1512. ret
  1513. .size se_handler,.-se_handler
  1514. .section .pdata
  1515. .align 4
  1516. .rva .LSEH_begin_gcm_gmult_4bit
  1517. .rva .LSEH_end_gcm_gmult_4bit
  1518. .rva .LSEH_info_gcm_gmult_4bit
  1519. .rva .LSEH_begin_gcm_ghash_4bit
  1520. .rva .LSEH_end_gcm_ghash_4bit
  1521. .rva .LSEH_info_gcm_ghash_4bit
  1522. .rva .LSEH_begin_gcm_init_clmul
  1523. .rva .LSEH_end_gcm_init_clmul
  1524. .rva .LSEH_info_gcm_init_clmul
  1525. .rva .LSEH_begin_gcm_ghash_clmul
  1526. .rva .LSEH_end_gcm_ghash_clmul
  1527. .rva .LSEH_info_gcm_ghash_clmul
  1528. ___
  1529. $code.=<<___ if ($avx);
  1530. .rva .LSEH_begin_gcm_init_avx
  1531. .rva .LSEH_end_gcm_init_avx
  1532. .rva .LSEH_info_gcm_init_clmul
  1533. .rva .LSEH_begin_gcm_ghash_avx
  1534. .rva .LSEH_end_gcm_ghash_avx
  1535. .rva .LSEH_info_gcm_ghash_clmul
  1536. ___
  1537. $code.=<<___;
  1538. .section .xdata
  1539. .align 8
  1540. .LSEH_info_gcm_gmult_4bit:
  1541. .byte 9,0,0,0
  1542. .rva se_handler
  1543. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1544. .LSEH_info_gcm_ghash_4bit:
  1545. .byte 9,0,0,0
  1546. .rva se_handler
  1547. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1548. .LSEH_info_gcm_init_clmul:
  1549. .byte 0x01,0x08,0x03,0x00
  1550. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1551. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1552. .LSEH_info_gcm_ghash_clmul:
  1553. .byte 0x01,0x33,0x16,0x00
  1554. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1555. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1556. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1557. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1558. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1559. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1560. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1561. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1562. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1563. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1564. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1565. ___
  1566. }
  1567. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1568. print $code;
  1569. close STDOUT;