  1. #!/usr/bin/env perl
  2. # Copyright (c) 2015, CloudFlare Ltd.
  3. #
  4. # Permission to use, copy, modify, and/or distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  11. # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
  13. # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  14. # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
  15. ##############################################################################
  16. #                                                                            #
  17. # Author: Vlad Krasnov                                                       #
  18. #                                                                            #
  19. ##############################################################################
  20. $flavour = shift;
  21. $output = shift;
  22. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  23. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  24. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  25. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  26. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  27. die "can't locate x86_64-xlate.pl";
  28. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  29. *STDOUT=*OUT;
  30. $avx = 2;
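# Data section emitted below: .chacha20_consts is the "expand 32-byte k" sigma
# row, .rol8/.rol16 are pshufb masks that rotate every 32-bit lane left by 8/16
# bits, .sse_inc/.avx2_inc step the block counter (by 1 per xmm block, by 2 per
# ymm pair), .clamp clears the Poly1305 r bits that must be zero, and
# .and_masks keep the first 1..16 bytes of a partial block and zero the rest.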
  31. $code.=<<___;
  32. .text
  33. .extern OPENSSL_ia32cap_P
  34. chacha20_poly1305_constants:
  35. .align 64
  36. .chacha20_consts:
  37. .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
  38. .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
  39. .rol8:
  40. .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
  41. .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
  42. .rol16:
  43. .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
  44. .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
  45. .avx2_init:
  46. .long 0,0,0,0
  47. .sse_inc:
  48. .long 1,0,0,0
  49. .avx2_inc:
  50. .long 2,0,0,0,2,0,0,0
  51. .clamp:
  52. .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
  53. .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
  54. .align 16
  55. .and_masks:
  56. .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  57. .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  58. .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  59. .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  60. .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  61. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  62. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  63. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  64. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  65. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
  66. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
  67. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
  68. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
  69. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
  70. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
  71. .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  72. ___
  73. my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
  74. my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
  75. my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
  76. my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
  77. my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
  78. my $r_store="0*16(%rbp)";
  79. my $s_store="1*16(%rbp)";
  80. my $len_store="2*16(%rbp)";
  81. my $state1_store="3*16(%rbp)";
  82. my $state2_store="4*16(%rbp)";
  83. my $tmp_store="5*16(%rbp)";
  84. my $ctr0_store="6*16(%rbp)";
  85. my $ctr1_store="7*16(%rbp)";
  86. my $ctr2_store="8*16(%rbp)";
  87. my $ctr3_store="9*16(%rbp)";
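# Scratch layout: %rbp points into the aligned stack area. The first two slots
# cache the clamped Poly1305 key halves r and s, $len_store keeps the AD and
# ciphertext lengths for the final length block, $state1_store/$state2_store
# hold the two key rows of the ChaCha20 input state, and the $ctrN_store slots
# hold one counter/nonce row per parallel block. chacha_qr below applies a full
# ChaCha20 quarter-round to all four columns of one block at once, with the
# palignr steps rotating rows to move between column and diagonal rounds.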
  88. sub chacha_qr {
  89. my ($a,$b,$c,$d,$t,$dir)=@_;
  90. $code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
  91. $code.="paddd $b, $a
  92. pxor $a, $d
  93. pshufb .rol16(%rip), $d
  94. paddd $d, $c
  95. pxor $c, $b
  96. movdqa $b, $t
  97. pslld \$12, $t
  98. psrld \$20, $b
  99. pxor $t, $b
  100. paddd $b, $a
  101. pxor $a, $d
  102. pshufb .rol8(%rip), $d
  103. paddd $d, $c
  104. pxor $c, $b
  105. movdqa $b, $t
  106. pslld \$7, $t
  107. psrld \$25, $b
  108. pxor $t, $b\n";
  109. $code.="palignr \$4, $b, $b
  110. palignr \$8, $c, $c
  111. palignr \$12, $d, $d\n" if ($dir =~ /left/);
  112. $code.="palignr \$12, $b, $b
  113. palignr \$8, $c, $c
  114. palignr \$4, $d, $d\n" if ($dir =~ /right/);
  115. $code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
  116. }
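# Poly1305 helpers. The accumulator lives in $acc0:$acc1:$acc2 as two 64-bit
# limbs plus a small high limb. poly_add absorbs a 16-byte block together with
# its 2^128 pad bit; poly_stage1/2/3 do the schoolbook multiplication of the
# accumulator by the clamped key r cached in $r_store (clamping keeps r small
# enough for the product to fit in the $t0..$t3 temporaries).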
  117. sub poly_add {
  118. my ($src)=@_;
  119. $code.="add $src, $acc0
  120. adc 8+$src, $acc1
  121. adc \$1, $acc2\n";
  122. }
  123. sub poly_stage1 {
  124. $code.="mov 0+$r_store, %rax
  125. mov %rax, $t2
  126. mul $acc0
  127. mov %rax, $t0
  128. mov %rdx, $t1
  129. mov 0+$r_store, %rax
  130. mul $acc1
  131. imulq $acc2, $t2
  132. add %rax, $t1
  133. adc %rdx, $t2\n";
  134. }
  135. sub poly_stage2 {
  136. $code.="mov 8+$r_store, %rax
  137. mov %rax, $t3
  138. mul $acc0
  139. add %rax, $t1
  140. adc \$0, %rdx
  141. mov %rdx, $acc0
  142. mov 8+$r_store, %rax
  143. mul $acc1
  144. add %rax, $t2
  145. adc \$0, %rdx\n";
  146. }
  147. sub poly_stage3 {
  148. $code.="imulq $acc2, $t3
  149. add $acc0, $t2
  150. adc %rdx, $t3\n";
  151. }
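# poly_reduce_stage folds the product back using 2^130 = 5 mod p: everything
# at and above bit 130 is multiplied by 5, computed as 4*x + x via the
# "and \$-4" and the shrd/shr-by-2 pair below, and added back in. The result
# is only partially reduced; the canonical reduction happens once, at the end.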
  152. sub poly_reduce_stage {
  153. $code.="mov $t0, $acc0
  154. mov $t1, $acc1
  155. mov $t2, $acc2
  156. and \$3, $acc2
  157. mov $t2, $t0
  158. and \$-4, $t0
  159. mov $t3, $t1
  160. shrd \$2, $t3, $t2
  161. shr \$2, $t3
  162. add $t0, $acc0
  163. adc $t1, $acc1
  164. adc \$0, $acc2
  165. add $t2, $acc0
  166. adc $t3, $acc1
  167. adc \$0, $acc2\n";
  168. }
  169. sub poly_mul {
  170. &poly_stage1();
  171. &poly_stage2();
  172. &poly_stage3();
  173. &poly_reduce_stage();
  174. }
  175. sub prep_state {
  176. my ($n)=@_;
  177. $code.="movdqa .chacha20_consts(%rip), $A0
  178. movdqa $state1_store, $B0
  179. movdqa $state2_store, $C0\n";
  180. $code.="movdqa $A0, $A1
  181. movdqa $B0, $B1
  182. movdqa $C0, $C1\n" if ($n ge 2);
  183. $code.="movdqa $A0, $A2
  184. movdqa $B0, $B2
  185. movdqa $C0, $C2\n" if ($n ge 3);
  186. $code.="movdqa $A0, $A3
  187. movdqa $B0, $B3
  188. movdqa $C0, $C3\n" if ($n ge 4);
  189. $code.="movdqa $ctr0_store, $D0
  190. paddd .sse_inc(%rip), $D0
  191. movdqa $D0, $ctr0_store\n" if ($n eq 1);
  192. $code.="movdqa $ctr0_store, $D1
  193. paddd .sse_inc(%rip), $D1
  194. movdqa $D1, $D0
  195. paddd .sse_inc(%rip), $D0
  196. movdqa $D0, $ctr0_store
  197. movdqa $D1, $ctr1_store\n" if ($n eq 2);
  198. $code.="movdqa $ctr0_store, $D2
  199. paddd .sse_inc(%rip), $D2
  200. movdqa $D2, $D1
  201. paddd .sse_inc(%rip), $D1
  202. movdqa $D1, $D0
  203. paddd .sse_inc(%rip), $D0
  204. movdqa $D0, $ctr0_store
  205. movdqa $D1, $ctr1_store
  206. movdqa $D2, $ctr2_store\n" if ($n eq 3);
  207. $code.="movdqa $ctr0_store, $D3
  208. paddd .sse_inc(%rip), $D3
  209. movdqa $D3, $D2
  210. paddd .sse_inc(%rip), $D2
  211. movdqa $D2, $D1
  212. paddd .sse_inc(%rip), $D1
  213. movdqa $D1, $D0
  214. paddd .sse_inc(%rip), $D0
  215. movdqa $D0, $ctr0_store
  216. movdqa $D1, $ctr1_store
  217. movdqa $D2, $ctr2_store
  218. movdqa $D3, $ctr3_store\n" if ($n eq 4);
  219. }
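# prep_state(n) broadcasts the constant row and the cached key rows into n
# register sets and gives every block a distinct counter by repeatedly adding
# .sse_inc to the last saved counter row; finalize_state(n) adds the original
# input words back into the permuted state, as the ChaCha20 definition requires.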
  220. sub finalize_state {
  221. my ($n)=@_;
  222. $code.="paddd .chacha20_consts(%rip), $A3
  223. paddd $state1_store, $B3
  224. paddd $state2_store, $C3
  225. paddd $ctr3_store, $D3\n" if ($n eq 4);
  226. $code.="paddd .chacha20_consts(%rip), $A2
  227. paddd $state1_store, $B2
  228. paddd $state2_store, $C2
  229. paddd $ctr2_store, $D2\n" if ($n ge 3);
  230. $code.="paddd .chacha20_consts(%rip), $A1
  231. paddd $state1_store, $B1
  232. paddd $state2_store, $C1
  233. paddd $ctr1_store, $D1\n" if ($n ge 2);
  234. $code.="paddd .chacha20_consts(%rip), $A0
  235. paddd $state1_store, $B0
  236. paddd $state2_store, $C0
  237. paddd $ctr0_store, $D0\n";
  238. }
  239. sub xor_stream {
  240. my ($A, $B, $C, $D, $offset)=@_;
  241. $code.="movdqu 0*16 + $offset($inp), $A3
  242. movdqu 1*16 + $offset($inp), $B3
  243. movdqu 2*16 + $offset($inp), $C3
  244. movdqu 3*16 + $offset($inp), $D3
  245. pxor $A3, $A
  246. pxor $B3, $B
  247. pxor $C3, $C
  248. pxor $D, $D3
  249. movdqu $A, 0*16 + $offset($oup)
  250. movdqu $B, 1*16 + $offset($oup)
  251. movdqu $C, 2*16 + $offset($oup)
  252. movdqu $D3, 3*16 + $offset($oup)\n";
  253. }
  254. sub xor_stream_using_temp {
  255. my ($A, $B, $C, $D, $offset, $temp)=@_;
  256. $code.="movdqa $temp, $tmp_store
  257. movdqu 0*16 + $offset($inp), $temp
  258. pxor $A, $temp
  259. movdqu $temp, 0*16 + $offset($oup)
  260. movdqu 1*16 + $offset($inp), $temp
  261. pxor $B, $temp
  262. movdqu $temp, 1*16 + $offset($oup)
  263. movdqu 2*16 + $offset($inp), $temp
  264. pxor $C, $temp
  265. movdqu $temp, 2*16 + $offset($oup)
  266. movdqu 3*16 + $offset($inp), $temp
  267. pxor $D, $temp
  268. movdqu $temp, 3*16 + $offset($oup)\n";
  269. }
  270. sub gen_chacha_round {
  271. my ($rot1, $rot2, $shift)=@_;
  272. my $round="";
  273. $round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
  274. $round.="movdqa $rot2, $C0
  275. paddd $B3, $A3
  276. paddd $B2, $A2
  277. paddd $B1, $A1
  278. paddd $B0, $A0
  279. pxor $A3, $D3
  280. pxor $A2, $D2
  281. pxor $A1, $D1
  282. pxor $A0, $D0
  283. pshufb $C0, $D3
  284. pshufb $C0, $D2
  285. pshufb $C0, $D1
  286. pshufb $C0, $D0
  287. movdqa $tmp_store, $C0
  288. paddd $D3, $C3
  289. paddd $D2, $C2
  290. paddd $D1, $C1
  291. paddd $D0, $C0
  292. pxor $C3, $B3
  293. pxor $C2, $B2
  294. pxor $C1, $B1
  295. pxor $C0, $B0
  296. movdqa $C0, $tmp_store
  297. movdqa $B3, $C0
  298. psrld \$$rot1, $C0
  299. pslld \$32-$rot1, $B3
  300. pxor $C0, $B3
  301. movdqa $B2, $C0
  302. psrld \$$rot1, $C0
  303. pslld \$32-$rot1, $B2
  304. pxor $C0, $B2
  305. movdqa $B1, $C0
  306. psrld \$$rot1, $C0
  307. pslld \$32-$rot1, $B1
  308. pxor $C0, $B1
  309. movdqa $B0, $C0
  310. psrld \$$rot1, $C0
  311. pslld \$32-$rot1, $B0
  312. pxor $C0, $B0\n";
  313. ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
  314. ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
  315. $round.="movdqa $tmp_store, $C0
  316. palignr \$$s1, $B3, $B3
  317. palignr \$$s2, $C3, $C3
  318. palignr \$$s3, $D3, $D3
  319. palignr \$$s1, $B2, $B2
  320. palignr \$$s2, $C2, $C2
  321. palignr \$$s3, $D2, $D2
  322. palignr \$$s1, $B1, $B1
  323. palignr \$$s2, $C1, $C1
  324. palignr \$$s3, $D1, $D1
  325. palignr \$$s1, $B0, $B0
  326. palignr \$$s2, $C0, $C0
  327. palignr \$$s3, $D0, $D0\n"
  328. if (($shift =~ /left/) || ($shift =~ /right/));
  329. return $round;
  330. };
  331. $chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
  332. &gen_chacha_round(25, ".rol8(%rip)", "left") .
  333. &gen_chacha_round(20, ".rol16(%rip)") .
  334. &gen_chacha_round(25, ".rol8(%rip)", "right");
  335. my @loop_body = split /\n/, $chacha_body;
  336. sub emit_body {
  337. my ($n)=@_;
  338. for (my $i=0; $i < $n; $i++) {
  339. $code=$code.shift(@loop_body)."\n";
  340. };
  341. }
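# $chacha_body is one full ChaCha20 double round (column round, diagonalize,
# diagonal round, restore) for four blocks at a time, split into individual
# instructions. emit_body(n) drips n of those instructions into the output
# between Poly1305 stages so that the scalar multiply chain and the SIMD work
# overlap and hide each other's latency.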
  342. {
  343. ################################################################################
  344. # void poly_hash_ad_internal();
  345. $code.="
  346. .type poly_hash_ad_internal,\@function,2
  347. .align 64
  348. poly_hash_ad_internal:
  349. .cfi_startproc
  350. xor $acc0, $acc0
  351. xor $acc1, $acc1
  352. xor $acc2, $acc2
  353. cmp \$13, $itr2
  354. jne hash_ad_loop
  355. poly_fast_tls_ad:
  356. # Special treatment for the TLS case of 13 bytes
  357. mov ($adp), $acc0
  358. mov 5($adp), $acc1
  359. shr \$24, $acc1
  360. mov \$1, $acc2\n";
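# The AEAD construction zero-pads the AD to a 16-byte boundary, so this
# 13-byte TLS header is still hashed as a full block: bytes 8-12 are isolated
# by the shift above, the padding is implicit zeros, and the 2^128 pad bit is
# supplied by setting $acc2 to 1.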
  361. &poly_mul(); $code.="
  362. ret
  363. hash_ad_loop:
  364. # Hash in 16 byte chunk
  365. cmp \$16, $itr2
  366. jb hash_ad_tail\n";
  367. &poly_add("0($adp)");
  368. &poly_mul(); $code.="
  369. lea 1*16($adp), $adp
  370. sub \$16, $itr2
  371. jmp hash_ad_loop
  372. hash_ad_tail:
  373. cmp \$0, $itr2
  374. je 1f
  375. # Hash last < 16 byte tail
  376. xor $t0, $t0
  377. xor $t1, $t1
  378. xor $t2, $t2
  379. add $itr2, $adp
  380. hash_ad_tail_loop:
  381. shld \$8, $t0, $t1
  382. shl \$8, $t0
  383. movzxb -1($adp), $t2
  384. xor $t2, $t0
  385. dec $adp
  386. dec $itr2
  387. jne hash_ad_tail_loop
  388. add $t0, $acc0
  389. adc $t1, $acc1
  390. adc \$1, $acc2\n";
  391. &poly_mul(); $code.="
  392. # Finished AD
  393. 1:
  394. ret
  395. .cfi_endproc
  396. .size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
  397. }
  398. {
  399. ################################################################################
  400. # void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
  401. $code.="
  402. .globl chacha20_poly1305_open
  403. .type chacha20_poly1305_open,\@function,2
  404. .align 64
  405. chacha20_poly1305_open:
  406. .cfi_startproc
  407. push %rbp
  408. .cfi_adjust_cfa_offset 8
  409. push %rbx
  410. .cfi_adjust_cfa_offset 8
  411. push %r12
  412. .cfi_adjust_cfa_offset 8
  413. push %r13
  414. .cfi_adjust_cfa_offset 8
  415. push %r14
  416. .cfi_adjust_cfa_offset 8
  417. push %r15
  418. .cfi_adjust_cfa_offset 8
  419. # We write the calculated authenticator back to keyp at the end, so save
  420. # the pointer on the stack too.
  421. push $keyp
  422. .cfi_adjust_cfa_offset 8
  423. sub \$288 + 32, %rsp
  424. .cfi_adjust_cfa_offset 288 + 32
  425. .cfi_offset rbp, -16
  426. .cfi_offset rbx, -24
  427. .cfi_offset r12, -32
  428. .cfi_offset r13, -40
  429. .cfi_offset r14, -48
  430. .cfi_offset r15, -56
  431. lea 32(%rsp), %rbp
  432. and \$-32, %rbp
  433. mov %rdx, 8+$len_store
  434. mov %r8, 0+$len_store
  435. mov %rdx, $inl\n"; $code.="
  436. mov OPENSSL_ia32cap_P+8(%rip), %eax
  437. and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
  438. xor \$`(1<<5) + (1<<8)`, %eax
  439. jz chacha20_poly1305_open_avx2\n" if ($avx>1);
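# In OpenSSL's capability vector the dword at OPENSSL_ia32cap_P+8 mirrors
# CPUID.(EAX=7).EBX: bit 5 is AVX2 and bit 8 is BMI2 (needed for mulx), so the
# AVX2 code path is entered only when both are reported.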
  440. $code.="
  441. 1:
  442. cmp \$128, $inl
  443. jbe open_sse_128
  444. # For long buffers, prepare the poly key first
  445. movdqa .chacha20_consts(%rip), $A0
  446. movdqu 0*16($keyp), $B0
  447. movdqu 1*16($keyp), $C0
  448. movdqu 2*16($keyp), $D0
  449. movdqa $D0, $T1
  450. # Store on stack, to free keyp
  451. movdqa $B0, $state1_store
  452. movdqa $C0, $state2_store
  453. movdqa $D0, $ctr0_store
  454. mov \$10, $acc0
  455. 1: \n";
  456. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  457. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
  458. dec $acc0
  459. jne 1b
  460. # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  461. paddd .chacha20_consts(%rip), $A0
  462. paddd $state1_store, $B0
  463. # Clamp and store the key
  464. pand .clamp(%rip), $A0
  465. movdqa $A0, $r_store
  466. movdqa $B0, $s_store
  467. # Hash
  468. mov %r8, $itr2
  469. call poly_hash_ad_internal
  470. open_sse_main_loop:
  471. cmp \$16*16, $inl
  472. jb 2f
  473. # Load state, increment counter blocks\n";
  474. &prep_state(4); $code.="
  475. # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
  476. # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
  477. mov \$4, $itr1
  478. mov $inp, $itr2
  479. 1: \n";
  480. &emit_body(20);
  481. &poly_add("0($itr2)"); $code.="
  482. lea 2*8($itr2), $itr2\n";
  483. &emit_body(20);
  484. &poly_stage1();
  485. &emit_body(20);
  486. &poly_stage2();
  487. &emit_body(20);
  488. &poly_stage3();
  489. &emit_body(20);
  490. &poly_reduce_stage();
  491. foreach $l (@loop_body) {$code.=$l."\n";}
  492. @loop_body = split /\n/, $chacha_body; $code.="
  493. dec $itr1
  494. jge 1b\n";
  495. &poly_add("0($itr2)");
  496. &poly_mul(); $code.="
  497. lea 2*8($itr2), $itr2
  498. cmp \$-6, $itr1
  499. jg 1b\n";
  500. &finalize_state(4);
  501. &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
  502. &xor_stream($A2, $B2, $C2, $D2, "4*16");
  503. &xor_stream($A1, $B1, $C1, $D1, "8*16");
  504. &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
  505. lea 16*16($inp), $inp
  506. lea 16*16($oup), $oup
  507. sub \$16*16, $inl
  508. jmp open_sse_main_loop
  509. 2:
  510. # Handle the various tail sizes efficiently
  511. test $inl, $inl
  512. jz open_sse_finalize
  513. cmp \$4*16, $inl
  514. ja 3f\n";
  515. ###############################################################################
  516. # At most 64 bytes are left
  517. &prep_state(1); $code.="
  518. xor $itr2, $itr2
  519. mov $inl, $itr1
  520. cmp \$16, $itr1
  521. jb 2f
  522. 1: \n";
  523. &poly_add("0($inp, $itr2)");
  524. &poly_mul(); $code.="
  525. sub \$16, $itr1
  526. 2:
  527. add \$16, $itr2\n";
  528. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  529. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
  530. cmp \$16, $itr1
  531. jae 1b
  532. cmp \$10*16, $itr2
  533. jne 2b\n";
  534. &finalize_state(1); $code.="
  535. jmp open_sse_tail_64_dec_loop
  536. 3:
  537. cmp \$8*16, $inl
  538. ja 3f\n";
  539. ###############################################################################
  540. # 65 - 128 bytes are left
  541. &prep_state(2); $code.="
  542. mov $inl, $itr1
  543. and \$-16, $itr1
  544. xor $itr2, $itr2
  545. 1: \n";
  546. &poly_add("0($inp, $itr2)");
  547. &poly_mul(); $code.="
  548. 2:
  549. add \$16, $itr2\n";
  550. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  551. &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
  552. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  553. &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
  554. cmp $itr1, $itr2
  555. jb 1b
  556. cmp \$10*16, $itr2
  557. jne 2b\n";
  558. &finalize_state(2);
  559. &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
  560. sub \$4*16, $inl
  561. lea 4*16($inp), $inp
  562. lea 4*16($oup), $oup
  563. jmp open_sse_tail_64_dec_loop
  564. 3:
  565. cmp \$12*16, $inl
  566. ja 3f\n";
  567. ###############################################################################
  568. # 129 - 192 bytes are left
  569. &prep_state(3); $code.="
  570. mov $inl, $itr1
  571. mov \$10*16, $itr2
  572. cmp \$10*16, $itr1
  573. cmovg $itr2, $itr1
  574. and \$-16, $itr1
  575. xor $itr2, $itr2
  576. 1: \n";
  577. &poly_add("0($inp, $itr2)");
  578. &poly_mul(); $code.="
  579. 2:
  580. add \$16, $itr2\n";
  581. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  582. &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
  583. &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
  584. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  585. &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
  586. &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  587. cmp $itr1, $itr2
  588. jb 1b
  589. cmp \$10*16, $itr2
  590. jne 2b
  591. cmp \$11*16, $inl
  592. jb 1f\n";
  593. &poly_add("10*16($inp)");
  594. &poly_mul(); $code.="
  595. cmp \$12*16, $inl
  596. jb 1f\n";
  597. &poly_add("11*16($inp)");
  598. &poly_mul(); $code.="
  599. 1: \n";
  600. &finalize_state(3);
  601. &xor_stream($A2, $B2, $C2, $D2, "0*16");
  602. &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
  603. sub \$8*16, $inl
  604. lea 8*16($inp), $inp
  605. lea 8*16($oup), $oup
  606. jmp open_sse_tail_64_dec_loop
  607. 3:
  608. ###############################################################################\n";
  609. # 193 - 255 bytes are left
  610. &prep_state(4); $code.="
  611. xor $itr2, $itr2
  612. 1: \n";
  613. &poly_add("0($inp, $itr2)");
  614. &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
  615. &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
  616. &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
  617. &poly_stage1();
  618. &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
  619. &poly_stage2();
  620. &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
  621. &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
  622. &poly_stage3();
  623. &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
  624. &poly_reduce_stage();
  625. &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
  626. add \$16, $itr2
  627. cmp \$10*16, $itr2
  628. jb 1b
  629. mov $inl, $itr1
  630. and \$-16, $itr1
  631. 1: \n";
  632. &poly_add("0($inp, $itr2)");
  633. &poly_mul(); $code.="
  634. add \$16, $itr2
  635. cmp $itr1, $itr2
  636. jb 1b\n";
  637. &finalize_state(4);
  638. &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
  639. &xor_stream($A2, $B2, $C2, $D2, "4*16");
  640. &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
  641. movdqa $tmp_store, $D0
  642. sub \$12*16, $inl
  643. lea 12*16($inp), $inp
  644. lea 12*16($oup), $oup
  645. ###############################################################################
  646. # Decrypt the remaining data, 16B at a time, using existing stream
  647. open_sse_tail_64_dec_loop:
  648. cmp \$16, $inl
  649. jb 1f
  650. sub \$16, $inl
  651. movdqu ($inp), $T0
  652. pxor $T0, $A0
  653. movdqu $A0, ($oup)
  654. lea 16($inp), $inp
  655. lea 16($oup), $oup
  656. movdqa $B0, $A0
  657. movdqa $C0, $B0
  658. movdqa $D0, $C0
  659. jmp open_sse_tail_64_dec_loop
  660. 1:
  661. movdqa $A0, $A1
  662. # Decrypt up to 16 bytes at the end.
  663. open_sse_tail_16:
  664. test $inl, $inl
  665. jz open_sse_finalize
  666. # Read the final bytes into $T0. They need to be read in reverse order so
  667. # that they end up in the correct order in $T0.
  668. pxor $T0, $T0
  669. lea -1($inp, $inl), $inp
  670. movq $inl, $itr2
  671. 2:
  672. pslldq \$1, $T0
  673. pinsrb \$0, ($inp), $T0
  674. sub \$1, $inp
  675. sub \$1, $itr2
  676. jnz 2b
  677. 3:
  678. movq $T0, $t0
  679. pextrq \$1, $T0, $t1
  680. # The final bytes of keystream are in $A1.
  681. pxor $A1, $T0
  682. # Copy the plaintext bytes out.
  683. 2:
  684. pextrb \$0, $T0, ($oup)
  685. psrldq \$1, $T0
  686. add \$1, $oup
  687. sub \$1, $inl
  688. jne 2b
  689. add $t0, $acc0
  690. adc $t1, $acc1
  691. adc \$1, $acc2\n";
  692. &poly_mul(); $code.="
  693. open_sse_finalize:\n";
  694. &poly_add($len_store);
  695. &poly_mul(); $code.="
  696. # Final reduce
  697. mov $acc0, $t0
  698. mov $acc1, $t1
  699. mov $acc2, $t2
  700. sub \$-5, $acc0
  701. sbb \$-1, $acc1
  702. sbb \$3, $acc2
  703. cmovc $t0, $acc0
  704. cmovc $t1, $acc1
  705. cmovc $t2, $acc2
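# The sub/sbb sequence above subtracts p = 2^130 - 5 (written as the limbs
# -5, -1, 3); if that borrows, the cmovc instructions keep the original
# accumulator, giving the canonical value h mod p.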
  706. # Add in s part of the key
  707. add 0+$s_store, $acc0
  708. adc 8+$s_store, $acc1
  709. add \$288 + 32, %rsp
  710. .cfi_adjust_cfa_offset -(288 + 32)
  711. pop $keyp
  712. .cfi_adjust_cfa_offset -8
  713. movq $acc0, ($keyp)
  714. movq $acc1, 8($keyp)
  715. pop %r15
  716. .cfi_adjust_cfa_offset -8
  717. pop %r14
  718. .cfi_adjust_cfa_offset -8
  719. pop %r13
  720. .cfi_adjust_cfa_offset -8
  721. pop %r12
  722. .cfi_adjust_cfa_offset -8
  723. pop %rbx
  724. .cfi_adjust_cfa_offset -8
  725. pop %rbp
  726. .cfi_adjust_cfa_offset -8
  727. ret
  728. .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
  729. ###############################################################################
  730. open_sse_128:
  731. movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
  732. movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
  733. movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
  734. movdqu 2*16($keyp), $D0
  735. movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
  736. movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
  737. movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
  738. mov \$10, $acc0
  739. 1: \n";
  740. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  741. &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
  742. &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
  743. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  744. &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
  745. &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  746. dec $acc0
  747. jnz 1b
  748. paddd .chacha20_consts(%rip), $A0
  749. paddd .chacha20_consts(%rip), $A1
  750. paddd .chacha20_consts(%rip), $A2
  751. paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
  752. paddd $T2, $C1\npaddd $T2, $C2
  753. paddd $T3, $D1
  754. paddd .sse_inc(%rip), $T3
  755. paddd $T3, $D2
  756. # Clamp and store the key
  757. pand .clamp(%rip), $A0
  758. movdqa $A0, $r_store
  759. movdqa $B0, $s_store
  760. # Hash
  761. mov %r8, $itr2
  762. call poly_hash_ad_internal
  763. 1:
  764. cmp \$16, $inl
  765. jb open_sse_tail_16
  766. sub \$16, $inl\n";
  767. # Load for hashing
  768. &poly_add("0*8($inp)"); $code.="
  769. # Load for decryption
  770. movdqu 0*16($inp), $T0
  771. pxor $T0, $A1
  772. movdqu $A1, 0*16($oup)
  773. lea 1*16($inp), $inp
  774. lea 1*16($oup), $oup\n";
  775. &poly_mul(); $code.="
  776. # Shift the stream left
  777. movdqa $B1, $A1
  778. movdqa $C1, $B1
  779. movdqa $D1, $C1
  780. movdqa $A2, $D1
  781. movdqa $B2, $A2
  782. movdqa $C2, $B2
  783. movdqa $D2, $C2
  784. jmp 1b
  785. jmp open_sse_tail_16
  786. .size chacha20_poly1305_open, .-chacha20_poly1305_open
  787. .cfi_endproc
  788. ################################################################################
  789. ################################################################################
  790. # void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
  791. .globl chacha20_poly1305_seal
  792. .type chacha20_poly1305_seal,\@function,2
  793. .align 64
  794. chacha20_poly1305_seal:
  795. .cfi_startproc
  796. push %rbp
  797. .cfi_adjust_cfa_offset 8
  798. push %rbx
  799. .cfi_adjust_cfa_offset 8
  800. push %r12
  801. .cfi_adjust_cfa_offset 8
  802. push %r13
  803. .cfi_adjust_cfa_offset 8
  804. push %r14
  805. .cfi_adjust_cfa_offset 8
  806. push %r15
  807. .cfi_adjust_cfa_offset 8
  808. # We write the calculated authenticator back to keyp at the end, so save
  809. # the pointer on the stack too.
  810. push $keyp
  811. .cfi_adjust_cfa_offset 8
  812. sub \$288 + 32, %rsp
  813. .cfi_adjust_cfa_offset 288 + 32
  814. .cfi_offset rbp, -16
  815. .cfi_offset rbx, -24
  816. .cfi_offset r12, -32
  817. .cfi_offset r13, -40
  818. .cfi_offset r14, -48
  819. .cfi_offset r15, -56
  820. lea 32(%rsp), %rbp
  821. and \$-32, %rbp
  822. mov 56($keyp), $inl # extra_in_len
  823. addq %rdx, $inl
  824. mov $inl, 8+$len_store
  825. mov %r8, 0+$len_store
  826. mov %rdx, $inl\n"; $code.="
  827. mov OPENSSL_ia32cap_P+8(%rip), %eax
  828. and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
  829. xor \$`(1<<5) + (1<<8)`, %eax
  830. jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
  831. $code.="
  832. cmp \$128, $inl
  833. jbe seal_sse_128
  834. # For longer buffers, prepare the poly key + some stream
  835. movdqa .chacha20_consts(%rip), $A0
  836. movdqu 0*16($keyp), $B0
  837. movdqu 1*16($keyp), $C0
  838. movdqu 2*16($keyp), $D0
  839. movdqa $A0, $A1
  840. movdqa $A0, $A2
  841. movdqa $A0, $A3
  842. movdqa $B0, $B1
  843. movdqa $B0, $B2
  844. movdqa $B0, $B3
  845. movdqa $C0, $C1
  846. movdqa $C0, $C2
  847. movdqa $C0, $C3
  848. movdqa $D0, $D3
  849. paddd .sse_inc(%rip), $D0
  850. movdqa $D0, $D2
  851. paddd .sse_inc(%rip), $D0
  852. movdqa $D0, $D1
  853. paddd .sse_inc(%rip), $D0
  854. # Store on stack
  855. movdqa $B0, $state1_store
  856. movdqa $C0, $state2_store
  857. movdqa $D0, $ctr0_store
  858. movdqa $D1, $ctr1_store
  859. movdqa $D2, $ctr2_store
  860. movdqa $D3, $ctr3_store
  861. mov \$10, $acc0
  862. 1: \n";
  863. foreach $l (@loop_body) {$code.=$l."\n";}
  864. @loop_body = split /\n/, $chacha_body; $code.="
  865. dec $acc0
  866. jnz 1b\n";
  867. &finalize_state(4); $code.="
  868. # Clamp and store the key
  869. pand .clamp(%rip), $A3
  870. movdqa $A3, $r_store
  871. movdqa $B3, $s_store
  872. # Hash
  873. mov %r8, $itr2
  874. call poly_hash_ad_internal\n";
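# Sealing must feed ciphertext, not plaintext, into Poly1305, so the first
# 128 or 192 bytes of keystream are XORed into the output before any message
# hashing starts, and the hashing in the loops below reads from $oup
# (already-encrypted data) while encryption runs ahead of it.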
  875. &xor_stream($A2,$B2,$C2,$D2,"0*16");
  876. &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
  877. cmp \$12*16, $inl
  878. ja 1f
  879. mov \$8*16, $itr1
  880. sub \$8*16, $inl
  881. lea 8*16($inp), $inp
  882. jmp seal_sse_128_seal_hash
  883. 1: \n";
  884. &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
  885. mov \$12*16, $itr1
  886. sub \$12*16, $inl
  887. lea 12*16($inp), $inp
  888. mov \$2, $itr1
  889. mov \$8, $itr2
  890. cmp \$4*16, $inl
  891. jbe seal_sse_tail_64
  892. cmp \$8*16, $inl
  893. jbe seal_sse_tail_128
  894. cmp \$12*16, $inl
  895. jbe seal_sse_tail_192
  896. 1: \n";
  897. # The main loop
  898. &prep_state(4); $code.="
  899. 2: \n";
  900. &emit_body(20);
  901. &poly_add("0($oup)");
  902. &emit_body(20);
  903. &poly_stage1();
  904. &emit_body(20);
  905. &poly_stage2();
  906. &emit_body(20);
  907. &poly_stage3();
  908. &emit_body(20);
  909. &poly_reduce_stage();
  910. foreach $l (@loop_body) {$code.=$l."\n";}
  911. @loop_body = split /\n/, $chacha_body; $code.="
  912. lea 16($oup), $oup
  913. dec $itr2
  914. jge 2b\n";
  915. &poly_add("0*8($oup)");
  916. &poly_mul(); $code.="
  917. lea 16($oup), $oup
  918. dec $itr1
  919. jg 2b\n";
  920. &finalize_state(4);$code.="
  921. movdqa $D2, $tmp_store\n";
  922. &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
  923. movdqa $tmp_store, $D2\n";
  924. &xor_stream($A2,$B2,$C2,$D2, 4*16);
  925. &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
  926. cmp \$16*16, $inl
  927. ja 3f
  928. mov \$12*16, $itr1
  929. sub \$12*16, $inl
  930. lea 12*16($inp), $inp
  931. jmp seal_sse_128_seal_hash
  932. 3: \n";
  933. &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
  934. lea 16*16($inp), $inp
  935. sub \$16*16, $inl
  936. mov \$6, $itr1
  937. mov \$4, $itr2
  938. cmp \$12*16, $inl
  939. jg 1b
  940. mov $inl, $itr1
  941. test $inl, $inl
  942. je seal_sse_128_seal_hash
  943. mov \$6, $itr1
  944. cmp \$4*16, $inl
  945. jg 3f
  946. ###############################################################################
  947. seal_sse_tail_64:\n";
  948. &prep_state(1); $code.="
  949. 1: \n";
  950. &poly_add("0($oup)");
  951. &poly_mul(); $code.="
  952. lea 16($oup), $oup
  953. 2: \n";
  954. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  955. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  956. &poly_add("0($oup)");
  957. &poly_mul(); $code.="
  958. lea 16($oup), $oup
  959. dec $itr1
  960. jg 1b
  961. dec $itr2
  962. jge 2b\n";
  963. &finalize_state(1); $code.="
  964. jmp seal_sse_128_seal
  965. 3:
  966. cmp \$8*16, $inl
  967. jg 3f
  968. ###############################################################################
  969. seal_sse_tail_128:\n";
  970. &prep_state(2); $code.="
  971. 1: \n";
  972. &poly_add("0($oup)");
  973. &poly_mul(); $code.="
  974. lea 16($oup), $oup
  975. 2: \n";
  976. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  977. &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
  978. &poly_add("0($oup)");
  979. &poly_mul();
  980. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  981. &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
  982. lea 16($oup), $oup
  983. dec $itr1
  984. jg 1b
  985. dec $itr2
  986. jge 2b\n";
  987. &finalize_state(2);
  988. &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
  989. mov \$4*16, $itr1
  990. sub \$4*16, $inl
  991. lea 4*16($inp), $inp
  992. jmp seal_sse_128_seal_hash
  993. 3:
  994. ###############################################################################
  995. seal_sse_tail_192:\n";
  996. &prep_state(3); $code.="
  997. 1: \n";
  998. &poly_add("0($oup)");
  999. &poly_mul(); $code.="
  1000. lea 16($oup), $oup
  1001. 2: \n";
  1002. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  1003. &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
  1004. &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
  1005. &poly_add("0($oup)");
  1006. &poly_mul();
  1007. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  1008. &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
  1009. &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  1010. lea 16($oup), $oup
  1011. dec $itr1
  1012. jg 1b
  1013. dec $itr2
  1014. jge 2b\n";
  1015. &finalize_state(3);
  1016. &xor_stream($A2,$B2,$C2,$D2,0*16);
  1017. &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
  1018. mov \$8*16, $itr1
  1019. sub \$8*16, $inl
  1020. lea 8*16($inp), $inp
  1021. ###############################################################################
  1022. seal_sse_128_seal_hash:
  1023. cmp \$16, $itr1
  1024. jb seal_sse_128_seal\n";
  1025. &poly_add("0($oup)");
  1026. &poly_mul(); $code.="
  1027. sub \$16, $itr1
  1028. lea 16($oup), $oup
  1029. jmp seal_sse_128_seal_hash
  1030. seal_sse_128_seal:
  1031. cmp \$16, $inl
  1032. jb seal_sse_tail_16
  1033. sub \$16, $inl
  1034. # Load for decryption
  1035. movdqu 0*16($inp), $T0
  1036. pxor $T0, $A0
  1037. movdqu $A0, 0*16($oup)
  1038. # Then hash
  1039. add 0*8($oup), $acc0
  1040. adc 1*8($oup), $acc1
  1041. adc \$1, $acc2
  1042. lea 1*16($inp), $inp
  1043. lea 1*16($oup), $oup\n";
  1044. &poly_mul(); $code.="
  1045. # Shift the stream left
  1046. movdqa $B0, $A0
  1047. movdqa $C0, $B0
  1048. movdqa $D0, $C0
  1049. movdqa $A1, $D0
  1050. movdqa $B1, $A1
  1051. movdqa $C1, $B1
  1052. movdqa $D1, $C1
  1053. jmp seal_sse_128_seal
  1054. seal_sse_tail_16:
  1055. test $inl, $inl
  1056. jz process_blocks_of_extra_in
  1057. # We can only load the PT one byte at a time to avoid buffer overread
  1058. mov $inl, $itr2
  1059. mov $inl, $itr1
  1060. lea -1($inp, $inl), $inp
  1061. pxor $T3, $T3
  1062. 1:
  1063. pslldq \$1, $T3
  1064. pinsrb \$0, ($inp), $T3
  1065. lea -1($inp), $inp
  1066. dec $itr1
  1067. jne 1b
  1068. # XOR the keystream with the plaintext.
  1069. pxor $A0, $T3
  1070. # Write ciphertext out, byte-by-byte.
  1071. movq $inl, $itr1
  1072. movdqu $T3, $A0
  1073. 2:
  1074. pextrb \$0, $A0, ($oup)
  1075. psrldq \$1, $A0
  1076. add \$1, $oup
  1077. sub \$1, $itr1
  1078. jnz 2b
  1079. # $T3 contains the final (partial, non-empty) block of ciphertext which
  1080. # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
  1081. # are valid. We need to fill it with extra_in bytes until full, or until we
  1082. # run out of bytes.
  1083. #
  1084. # $keyp points to the tag output, which is actually a struct with the
  1085. # extra_in pointer and length at offset 48.
  1086. movq 288+32(%rsp), $keyp
  1087. movq 56($keyp), $t1 # extra_in_len
  1088. movq 48($keyp), $t0 # extra_in
  1089. test $t1, $t1
  1090. jz process_partial_block # Common case: no bytes of extra_in
  1091. movq \$16, $t2
  1092. subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3.
  1093. cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len
  1094. # (note that AT&T syntax reverses the arguments)
  1095. jge load_extra_in
  1096. movq $t1, $t2
  1097. load_extra_in:
  1098. # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
  1099. # into $T3. They are loaded in reverse order.
  1100. leaq -1($t0, $t2), $inp
  1101. # Update extra_in and extra_in_len to reflect the bytes that are about to
  1102. # be read.
  1103. addq $t2, $t0
  1104. subq $t2, $t1
  1105. movq $t0, 48($keyp)
  1106. movq $t1, 56($keyp)
  1107. # Update $itr2, which is used to select the mask later on, to reflect the
  1108. # extra bytes about to be added.
  1109. addq $t2, $itr2
  1110. # Load $t2 bytes of extra_in into $T2.
  1111. pxor $T2, $T2
  1112. 3:
  1113. pslldq \$1, $T2
  1114. pinsrb \$0, ($inp), $T2
  1115. lea -1($inp), $inp
  1116. sub \$1, $t2
  1117. jnz 3b
  1118. # Shift $T2 up the length of the remainder from the main encryption. Sadly,
  1119. # the shift for an XMM register has to be a constant, thus we loop to do
  1120. # this.
  1121. movq $inl, $t2
  1122. 4:
  1123. pslldq \$1, $T2
  1124. sub \$1, $t2
  1125. jnz 4b
  1126. # Mask $T3 (the remainder from the main encryption) so that superfluous
  1127. # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
  1128. # disjoint and so we can merge them with an OR.
  1129. lea .and_masks(%rip), $t2
  1130. shl \$4, $inl
  1131. pand -16($t2, $inl), $T3
  1132. # Merge $T2 into $T3, forming the remainder block.
  1133. por $T2, $T3
  1134. # The block of ciphertext + extra_in is ready to be included in the
  1135. # Poly1305 state.
  1136. movq $T3, $t0
  1137. pextrq \$1, $T3, $t1
  1138. add $t0, $acc0
  1139. adc $t1, $acc1
  1140. adc \$1, $acc2\n";
  1141. &poly_mul(); $code.="
  1142. process_blocks_of_extra_in:
  1143. # There may be additional bytes of extra_in to process.
  1144. movq 288+32(%rsp), $keyp
  1145. movq 48($keyp), $inp # extra_in
  1146. movq 56($keyp), $itr2 # extra_in_len
  1147. movq $itr2, $itr1
  1148. shr \$4, $itr2 # number of blocks
  1149. 5:
  1150. jz process_extra_in_trailer\n";
  1151. &poly_add("0($inp)");
  1152. &poly_mul(); $code.="
  1153. leaq 16($inp), $inp
  1154. subq \$1, $itr2
  1155. jmp 5b
  1156. process_extra_in_trailer:
  1157. andq \$15, $itr1 # remaining num bytes (<16) of extra_in
  1158. movq $itr1, $inl
  1159. jz do_length_block
  1160. leaq -1($inp, $itr1), $inp
  1161. 6:
  1162. pslldq \$1, $T3
  1163. pinsrb \$0, ($inp), $T3
  1164. lea -1($inp), $inp
  1165. sub \$1, $itr1
  1166. jnz 6b
  1167. process_partial_block:
  1168. # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
  1169. lea .and_masks(%rip), $t2
  1170. shl \$4, $inl
  1171. pand -16($t2, $inl), $T3
  1172. movq $T3, $t0
  1173. pextrq \$1, $T3, $t1
  1174. add $t0, $acc0
  1175. adc $t1, $acc1
  1176. adc \$1, $acc2\n";
  1177. &poly_mul(); $code.="
  1178. do_length_block:\n";
  1179. &poly_add($len_store);
  1180. &poly_mul(); $code.="
  1181. # Final reduce
  1182. mov $acc0, $t0
  1183. mov $acc1, $t1
  1184. mov $acc2, $t2
  1185. sub \$-5, $acc0
  1186. sbb \$-1, $acc1
  1187. sbb \$3, $acc2
  1188. cmovc $t0, $acc0
  1189. cmovc $t1, $acc1
  1190. cmovc $t2, $acc2
  1191. # Add in s part of the key
  1192. add 0+$s_store, $acc0
  1193. adc 8+$s_store, $acc1
  1194. add \$288 + 32, %rsp
  1195. .cfi_adjust_cfa_offset -(288 + 32)
  1196. pop $keyp
  1197. .cfi_adjust_cfa_offset -8
  1198. mov $acc0, 0*8($keyp)
  1199. mov $acc1, 1*8($keyp)
  1200. pop %r15
  1201. .cfi_adjust_cfa_offset -8
  1202. pop %r14
  1203. .cfi_adjust_cfa_offset -8
  1204. pop %r13
  1205. .cfi_adjust_cfa_offset -8
  1206. pop %r12
  1207. .cfi_adjust_cfa_offset -8
  1208. pop %rbx
  1209. .cfi_adjust_cfa_offset -8
  1210. pop %rbp
  1211. .cfi_adjust_cfa_offset -8
  1212. ret
  1213. .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
  1214. ################################################################################
  1215. seal_sse_128:
  1216. movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
  1217. movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
  1218. movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
  1219. movdqu 2*16($keyp), $D2
  1220. movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
  1221. movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
  1222. movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
  1223. mov \$10, $acc0
  1224. 1:\n";
  1225. &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
  1226. &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
  1227. &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
  1228. &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
  1229. &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
  1230. &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  1231. dec $acc0
  1232. jnz 1b
  1233. paddd .chacha20_consts(%rip), $A0
  1234. paddd .chacha20_consts(%rip), $A1
  1235. paddd .chacha20_consts(%rip), $A2
  1236. paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
  1237. paddd $T2, $C0\npaddd $T2, $C1
  1238. paddd $T3, $D0
  1239. paddd .sse_inc(%rip), $T3
  1240. paddd $T3, $D1
  1241. # Clamp and store the key
  1242. pand .clamp(%rip), $A2
  1243. movdqa $A2, $r_store
  1244. movdqa $B2, $s_store
  1245. # Hash
  1246. mov %r8, $itr2
  1247. call poly_hash_ad_internal
  1248. jmp seal_sse_128_seal
  1249. .size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
  1250. }
  1251. # There should have been a cfi_endproc at the end of that function, but the two
  1252. # following blocks of code are jumped to without a stack frame and the CFI
  1253. # context which they are used in happens to match the CFI context at the end of
  1254. # the previous function. So the CFI table is just extended to the end of them.
  1255. if ($avx>1) {
  1256. ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
  1257. my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
  1258. ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
  1259. $state1_store="2*32(%rbp)";
  1260. $state2_store="3*32(%rbp)";
  1261. $tmp_store="4*32(%rbp)";
  1262. $ctr0_store="5*32(%rbp)";
  1263. $ctr1_store="6*32(%rbp)";
  1264. $ctr2_store="7*32(%rbp)";
  1265. $ctr3_store="8*32(%rbp)";
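# AVX2 flavour: the same symbolic names are remapped to ymm registers, each
# holding one state row for two ChaCha20 blocks (one per 128-bit lane), so the
# counters advance by 2 via .avx2_inc and the stack slots widen to 32 bytes.
# The %xmm aliases of the same registers are kept for the sub-32-byte tails.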
  1266. sub chacha_qr_avx2 {
  1267. my ($a,$b,$c,$d,$t,$dir)=@_;
  1268. $code.=<<___ if ($dir =~ /store/);
  1269. vmovdqa $t, $tmp_store
  1270. ___
  1271. $code.=<<___;
  1272. vpaddd $b, $a, $a
  1273. vpxor $a, $d, $d
  1274. vpshufb .rol16(%rip), $d, $d
  1275. vpaddd $d, $c, $c
  1276. vpxor $c, $b, $b
  1277. vpsrld \$20, $b, $t
  1278. vpslld \$12, $b, $b
  1279. vpxor $t, $b, $b
  1280. vpaddd $b, $a, $a
  1281. vpxor $a, $d, $d
  1282. vpshufb .rol8(%rip), $d, $d
  1283. vpaddd $d, $c, $c
  1284. vpxor $c, $b, $b
  1285. vpslld \$7, $b, $t
  1286. vpsrld \$25, $b, $b
  1287. vpxor $t, $b, $b
  1288. ___
  1289. $code.=<<___ if ($dir =~ /left/);
  1290. vpalignr \$12, $d, $d, $d
  1291. vpalignr \$8, $c, $c, $c
  1292. vpalignr \$4, $b, $b, $b
  1293. ___
  1294. $code.=<<___ if ($dir =~ /right/);
  1295. vpalignr \$4, $d, $d, $d
  1296. vpalignr \$8, $c, $c, $c
  1297. vpalignr \$12, $b, $b, $b
  1298. ___
  1299. $code.=<<___ if ($dir =~ /load/);
  1300. vmovdqa $tmp_store, $t
  1301. ___
  1302. }
  1303. sub prep_state_avx2 {
  1304. my ($n)=@_;
  1305. $code.=<<___;
  1306. vmovdqa .chacha20_consts(%rip), $A0
  1307. vmovdqa $state1_store, $B0
  1308. vmovdqa $state2_store, $C0
  1309. ___
  1310. $code.=<<___ if ($n ge 2);
  1311. vmovdqa $A0, $A1
  1312. vmovdqa $B0, $B1
  1313. vmovdqa $C0, $C1
  1314. ___
  1315. $code.=<<___ if ($n ge 3);
  1316. vmovdqa $A0, $A2
  1317. vmovdqa $B0, $B2
  1318. vmovdqa $C0, $C2
  1319. ___
  1320. $code.=<<___ if ($n ge 4);
  1321. vmovdqa $A0, $A3
  1322. vmovdqa $B0, $B3
  1323. vmovdqa $C0, $C3
  1324. ___
  1325. $code.=<<___ if ($n eq 1);
  1326. vmovdqa .avx2_inc(%rip), $D0
  1327. vpaddd $ctr0_store, $D0, $D0
  1328. vmovdqa $D0, $ctr0_store
  1329. ___
  1330. $code.=<<___ if ($n eq 2);
  1331. vmovdqa .avx2_inc(%rip), $D0
  1332. vpaddd $ctr0_store, $D0, $D1
  1333. vpaddd $D1, $D0, $D0
  1334. vmovdqa $D0, $ctr0_store
  1335. vmovdqa $D1, $ctr1_store
  1336. ___
  1337. $code.=<<___ if ($n eq 3);
  1338. vmovdqa .avx2_inc(%rip), $D0
  1339. vpaddd $ctr0_store, $D0, $D2
  1340. vpaddd $D2, $D0, $D1
  1341. vpaddd $D1, $D0, $D0
  1342. vmovdqa $D0, $ctr0_store
  1343. vmovdqa $D1, $ctr1_store
  1344. vmovdqa $D2, $ctr2_store
  1345. ___
  1346. $code.=<<___ if ($n eq 4);
  1347. vmovdqa .avx2_inc(%rip), $D0
  1348. vpaddd $ctr0_store, $D0, $D3
  1349. vpaddd $D3, $D0, $D2
  1350. vpaddd $D2, $D0, $D1
  1351. vpaddd $D1, $D0, $D0
  1352. vmovdqa $D3, $ctr3_store
  1353. vmovdqa $D2, $ctr2_store
  1354. vmovdqa $D1, $ctr1_store
  1355. vmovdqa $D0, $ctr0_store
  1356. ___
  1357. }
  1358. sub finalize_state_avx2 {
  1359. my ($n)=@_;
  1360. $code.=<<___ if ($n eq 4);
  1361. vpaddd .chacha20_consts(%rip), $A3, $A3
  1362. vpaddd $state1_store, $B3, $B3
  1363. vpaddd $state2_store, $C3, $C3
  1364. vpaddd $ctr3_store, $D3, $D3
  1365. ___
  1366. $code.=<<___ if ($n ge 3);
  1367. vpaddd .chacha20_consts(%rip), $A2, $A2
  1368. vpaddd $state1_store, $B2, $B2
  1369. vpaddd $state2_store, $C2, $C2
  1370. vpaddd $ctr2_store, $D2, $D2
  1371. ___
  1372. $code.=<<___ if ($n ge 2);
  1373. vpaddd .chacha20_consts(%rip), $A1, $A1
  1374. vpaddd $state1_store, $B1, $B1
  1375. vpaddd $state2_store, $C1, $C1
  1376. vpaddd $ctr1_store, $D1, $D1
  1377. ___
  1378. $code.=<<___;
  1379. vpaddd .chacha20_consts(%rip), $A0, $A0
  1380. vpaddd $state1_store, $B0, $B0
  1381. vpaddd $state2_store, $C0, $C0
  1382. vpaddd $ctr0_store, $D0, $D0
  1383. ___
  1384. }
  1385. sub xor_stream_avx2 {
  1386. my ($A, $B, $C, $D, $offset, $hlp)=@_;
  1387. $code.=<<___;
  1388. vperm2i128 \$0x02, $A, $B, $hlp
  1389. vperm2i128 \$0x13, $A, $B, $B
  1390. vperm2i128 \$0x02, $C, $D, $A
  1391. vperm2i128 \$0x13, $C, $D, $C
  1392. vpxor 0*32+$offset($inp), $hlp, $hlp
  1393. vpxor 1*32+$offset($inp), $A, $A
  1394. vpxor 2*32+$offset($inp), $B, $B
  1395. vpxor 3*32+$offset($inp), $C, $C
  1396. vmovdqu $hlp, 0*32+$offset($oup)
  1397. vmovdqu $A, 1*32+$offset($oup)
  1398. vmovdqu $B, 2*32+$offset($oup)
  1399. vmovdqu $C, 3*32+$offset($oup)
  1400. ___
  1401. }
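# Because every ymm register carries one row of two consecutive blocks, the
# vperm2i128 shuffles above regroup the 128-bit lanes before the XOR: the low
# lanes of A,B,C,D form the first 64 bytes of keystream and the high lanes the
# next 64. finish_stream_avx2 below does the same regrouping without touching
# memory, leaving the keystream in registers for a partial tail.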
  1402. sub finish_stream_avx2 {
  1403. my ($A, $B, $C, $D, $hlp)=@_;
  1404. $code.=<<___;
  1405. vperm2i128 \$0x13, $A, $B, $hlp
  1406. vperm2i128 \$0x02, $A, $B, $A
  1407. vperm2i128 \$0x02, $C, $D, $B
  1408. vperm2i128 \$0x13, $C, $D, $D
  1409. vmovdqa $hlp, $C
  1410. ___
  1411. }
  1412. sub poly_stage1_mulx {
  1413. $code.=<<___;
  1414. mov 0+$r_store, %rdx
  1415. mov %rdx, $t2
  1416. mulx $acc0, $t0, $t1
  1417. mulx $acc1, %rax, %rdx
  1418. imulq $acc2, $t2
  1419. add %rax, $t1
  1420. adc %rdx, $t2
  1421. ___
  1422. }
  1423. sub poly_stage2_mulx {
  1424. $code.=<<___;
  1425. mov 8+$r_store, %rdx
  1426. mulx $acc0, $acc0, %rax
  1427. add $acc0, $t1
  1428. mulx $acc1, $acc1, $t3
  1429. adc $acc1, $t2
  1430. adc \$0, $t3
  1431. imulq $acc2, %rdx
  1432. ___
  1433. }
  1434. sub poly_stage3_mulx {
  1435. $code.=<<___;
  1436. add %rax, $t2
  1437. adc %rdx, $t3
  1438. ___
  1439. }
  1440. sub poly_mul_mulx {
  1441. &poly_stage1_mulx();
  1442. &poly_stage2_mulx();
  1443. &poly_stage3_mulx();
  1444. &poly_reduce_stage();
  1445. }
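# The *_mulx variants rely on BMI2 MULX, which takes one factor implicitly in
# %rdx, writes the 128-bit product to two arbitrary registers and leaves the
# flags untouched, so the add/adc carry chains can be interleaved with the
# multiplies more freely than in the mul-based helpers used by the SSE path.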
  1446. sub gen_chacha_round_avx2 {
  1447. my ($rot1, $rot2, $shift)=@_;
  1448. my $round="";
  1449. $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
  1450. $round=$round ."vmovdqa $rot2, $C0
  1451. vpaddd $B3, $A3, $A3
  1452. vpaddd $B2, $A2, $A2
  1453. vpaddd $B1, $A1, $A1
  1454. vpaddd $B0, $A0, $A0
  1455. vpxor $A3, $D3, $D3
  1456. vpxor $A2, $D2, $D2
  1457. vpxor $A1, $D1, $D1
  1458. vpxor $A0, $D0, $D0
  1459. vpshufb $C0, $D3, $D3
  1460. vpshufb $C0, $D2, $D2
  1461. vpshufb $C0, $D1, $D1
  1462. vpshufb $C0, $D0, $D0
  1463. vmovdqa $tmp_store, $C0
  1464. vpaddd $D3, $C3, $C3
  1465. vpaddd $D2, $C2, $C2
  1466. vpaddd $D1, $C1, $C1
  1467. vpaddd $D0, $C0, $C0
  1468. vpxor $C3, $B3, $B3
  1469. vpxor $C2, $B2, $B2
  1470. vpxor $C1, $B1, $B1
  1471. vpxor $C0, $B0, $B0
  1472. vmovdqa $C0, $tmp_store
  1473. vpsrld \$$rot1, $B3, $C0
  1474. vpslld \$32-$rot1, $B3, $B3
  1475. vpxor $C0, $B3, $B3
  1476. vpsrld \$$rot1, $B2, $C0
  1477. vpslld \$32-$rot1, $B2, $B2
  1478. vpxor $C0, $B2, $B2
  1479. vpsrld \$$rot1, $B1, $C0
  1480. vpslld \$32-$rot1, $B1, $B1
  1481. vpxor $C0, $B1, $B1
  1482. vpsrld \$$rot1, $B0, $C0
  1483. vpslld \$32-$rot1, $B0, $B0
  1484. vpxor $C0, $B0, $B0\n";
  1485. ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
  1486. ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
  1487. $round=$round ."vmovdqa $tmp_store, $C0
  1488. vpalignr \$$s1, $B3, $B3, $B3
  1489. vpalignr \$$s2, $C3, $C3, $C3
  1490. vpalignr \$$s3, $D3, $D3, $D3
  1491. vpalignr \$$s1, $B2, $B2, $B2
  1492. vpalignr \$$s2, $C2, $C2, $C2
  1493. vpalignr \$$s3, $D2, $D2, $D2
  1494. vpalignr \$$s1, $B1, $B1, $B1
  1495. vpalignr \$$s2, $C1, $C1, $C1
  1496. vpalignr \$$s3, $D1, $D1, $D1
  1497. vpalignr \$$s1, $B0, $B0, $B0
  1498. vpalignr \$$s2, $C0, $C0, $C0
  1499. vpalignr \$$s3, $D0, $D0, $D0\n"
  1500. if (($shift =~ /left/) || ($shift =~ /right/));
  1501. return $round;
  1502. };
  1503. $chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
  1504. &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
  1505. &gen_chacha_round_avx2(20, ".rol16(%rip)") .
  1506. &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
  1507. @loop_body = split /\n/, $chacha_body;
  1508. $code.="
  1509. ###############################################################################
  1510. .type chacha20_poly1305_open_avx2,\@function,2
  1511. .align 64
  1512. chacha20_poly1305_open_avx2:
  1513. vzeroupper
  1514. vmovdqa .chacha20_consts(%rip), $A0
  1515. vbroadcasti128 0*16($keyp), $B0
  1516. vbroadcasti128 1*16($keyp), $C0
  1517. vbroadcasti128 2*16($keyp), $D0
  1518. vpaddd .avx2_init(%rip), $D0, $D0
  1519. cmp \$6*32, $inl
  1520. jbe open_avx2_192
  1521. cmp \$10*32, $inl
  1522. jbe open_avx2_320
  1523. vmovdqa $B0, $state1_store
  1524. vmovdqa $C0, $state2_store
  1525. vmovdqa $D0, $ctr0_store
  1526. mov \$10, $acc0
  1527. 1: \n";
  1528. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  1529. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
  1530. dec $acc0
  1531. jne 1b
  1532. vpaddd .chacha20_consts(%rip), $A0, $A0
  1533. vpaddd $state1_store, $B0, $B0
  1534. vpaddd $state2_store, $C0, $C0
  1535. vpaddd $ctr0_store, $D0, $D0
  1536. vperm2i128 \$0x02, $A0, $B0, $T0
  1537. # Clamp and store key
  1538. vpand .clamp(%rip), $T0, $T0
  1539. vmovdqa $T0, $r_store
  1540. # Stream for the first 64 bytes
  1541. vperm2i128 \$0x13, $A0, $B0, $A0
  1542. vperm2i128 \$0x13, $C0, $D0, $B0
  1543. # Hash AD + first 64 bytes
  1544. mov %r8, $itr2
  1545. call poly_hash_ad_internal
  1546. xor $itr1, $itr1
  1547. # Hash first 64 bytes
  1548. 1: \n";
  1549. &poly_add("0($inp, $itr1)");
  1550. &poly_mul(); $code.="
  1551. add \$16, $itr1
  1552. cmp \$2*32, $itr1
  1553. jne 1b
  1554. # Decrypt first 64 bytes
  1555. vpxor 0*32($inp), $A0, $A0
  1556. vpxor 1*32($inp), $B0, $B0
  1557. vmovdqu $A0, 0*32($oup)
  1558. vmovdqu $B0, 1*32($oup)
  1559. lea 2*32($inp), $inp
  1560. lea 2*32($oup), $oup
  1561. sub \$2*32, $inl
  1562. 1:
  1563. # Hash and decrypt 512 bytes each iteration
  1564. cmp \$16*32, $inl
  1565. jb 3f\n";
  1566. &prep_state_avx2(4); $code.="
  1567. xor $itr1, $itr1
  1568. 2: \n";
  1569. &poly_add("0*8($inp, $itr1)");
  1570. &emit_body(10);
  1571. &poly_stage1_mulx();
  1572. &emit_body(9);
  1573. &poly_stage2_mulx();
  1574. &emit_body(12);
  1575. &poly_stage3_mulx();
  1576. &emit_body(10);
  1577. &poly_reduce_stage();
  1578. &emit_body(9);
  1579. &poly_add("2*8($inp, $itr1)");
  1580. &emit_body(8);
  1581. &poly_stage1_mulx();
  1582. &emit_body(18);
  1583. &poly_stage2_mulx();
  1584. &emit_body(18);
  1585. &poly_stage3_mulx();
  1586. &emit_body(9);
  1587. &poly_reduce_stage();
  1588. &emit_body(8);
  1589. &poly_add("4*8($inp, $itr1)"); $code.="
  1590. lea 6*8($itr1), $itr1\n";
  1591. &emit_body(18);
  1592. &poly_stage1_mulx();
  1593. &emit_body(8);
  1594. &poly_stage2_mulx();
  1595. &emit_body(8);
  1596. &poly_stage3_mulx();
  1597. &emit_body(18);
  1598. &poly_reduce_stage();
  1599. foreach $l (@loop_body) {$code.=$l."\n";}
  1600. @loop_body = split /\n/, $chacha_body; $code.="
  1601. cmp \$10*6*8, $itr1
  1602. jne 2b\n";
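# Each pass of the loop above emits one ChaCha20 double round for the four
# 2-block ymm states while absorbing three 16-byte Poly1305 blocks (6*8 bytes
# of input per pass); ten passes plus the two trailing blocks handled next
# cover all 512 bytes processed by this iteration.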
  1603. &finalize_state_avx2(4); $code.="
  1604. vmovdqa $A0, $tmp_store\n";
  1605. &poly_add("10*6*8($inp)");
  1606. &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
  1607. vmovdqa $tmp_store, $A0\n";
  1608. &poly_mul();
  1609. &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
  1610. &poly_add("10*6*8+2*8($inp)");
  1611. &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
  1612. &poly_mul();
  1613. &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
  1614. lea 16*32($inp), $inp
  1615. lea 16*32($oup), $oup
  1616. sub \$16*32, $inl
  1617. jmp 1b
  1618. 3:
  1619. test $inl, $inl
  1620. vzeroupper
  1621. je open_sse_finalize
  1622. 3:
  1623. cmp \$4*32, $inl
  1624. ja 3f\n";
  1625. ###############################################################################
  1626. # 1-128 bytes left
  1627. &prep_state_avx2(1); $code.="
  1628. xor $itr2, $itr2
  1629. mov $inl, $itr1
  1630. and \$-16, $itr1
  1631. test $itr1, $itr1
  1632. je 2f
  1633. 1: \n";
  1634. &poly_add("0*8($inp, $itr2)");
  1635. &poly_mul(); $code.="
  1636. 2:
  1637. add \$16, $itr2\n";
  1638. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  1639. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
  1640. cmp $itr1, $itr2
  1641. jb 1b
  1642. cmp \$160, $itr2
  1643. jne 2b\n";
  1644. &finalize_state_avx2(1);
  1645. &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
  1646. jmp open_avx2_tail_loop
  1647. 3:
  1648. cmp \$8*32, $inl
  1649. ja 3f\n";
  1650. ###############################################################################
  1651. # 129-256 bytes left
  1652. &prep_state_avx2(2); $code.="
  1653. mov $inl, $tmp_store
  1654. mov $inl, $itr1
  1655. sub \$4*32, $itr1
  1656. shr \$4, $itr1
  1657. mov \$10, $itr2
  1658. cmp \$10, $itr1
  1659. cmovg $itr2, $itr1
  1660. mov $inp, $inl
  1661. xor $itr2, $itr2
  1662. 1: \n";
  1663. &poly_add("0*8($inl)");
  1664. &poly_mul_mulx(); $code.="
  1665. lea 16($inl), $inl
  1666. 2: \n";
  1667. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  1668. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
  1669. inc $itr2\n";
  1670. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  1671. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
  1672. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  1673. cmp $itr1, $itr2
  1674. jb 1b
  1675. cmp \$10, $itr2
  1676. jne 2b
  1677. mov $inl, $itr2
  1678. sub $inp, $inl
  1679. mov $inl, $itr1
  1680. mov $tmp_store, $inl
  1681. 1:
  1682. add \$16, $itr1
  1683. cmp $inl, $itr1
  1684. jg 1f\n";
  1685. &poly_add("0*8($itr2)");
  1686. &poly_mul_mulx(); $code.="
  1687. lea 16($itr2), $itr2
  1688. jmp 1b
  1689. 1: \n";
  1690. &finalize_state_avx2(2);
  1691. &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
  1692. &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
  1693. lea 4*32($inp), $inp
  1694. lea 4*32($oup), $oup
  1695. sub \$4*32, $inl
  1696. jmp open_avx2_tail_loop
  1697. 3:
  1698. cmp \$12*32, $inl
  1699. ja 3f\n";
  1700. ###############################################################################
  1701. # 257-383 bytes left
  1702. &prep_state_avx2(3); $code.="
  1703. mov $inl, $tmp_store
  1704. mov $inl, $itr1
  1705. sub \$8*32, $itr1
  1706. shr \$4, $itr1
  1707. add \$6, $itr1
  1708. mov \$10, $itr2
  1709. cmp \$10, $itr1
  1710. cmovg $itr2, $itr1
  1711. mov $inp, $inl
  1712. xor $itr2, $itr2
  1713. 1: \n";
  1714. &poly_add("0*8($inl)");
  1715. &poly_mul_mulx(); $code.="
  1716. lea 16($inl), $inl
  1717. 2: \n";
  1718. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
  1719. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  1720. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  1721. &poly_add("0*8($inl)");
  1722. &poly_mul(); $code.="
  1723. lea 16($inl), $inl
  1724. inc $itr2\n";
  1725. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
  1726. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
  1727. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
  1728. cmp $itr1, $itr2
  1729. jb 1b
  1730. cmp \$10, $itr2
  1731. jne 2b
  1732. mov $inl, $itr2
  1733. sub $inp, $inl
  1734. mov $inl, $itr1
  1735. mov $tmp_store, $inl
  1736. 1:
  1737. add \$16, $itr1
  1738. cmp $inl, $itr1
  1739. jg 1f\n";
  1740. &poly_add("0*8($itr2)");
  1741. &poly_mul_mulx(); $code.="
  1742. lea 16($itr2), $itr2
  1743. jmp 1b
  1744. 1: \n";
  1745. &finalize_state_avx2(3);
  1746. &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
  1747. &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
  1748. &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
  1749. lea 8*32($inp), $inp
  1750. lea 8*32($oup), $oup
  1751. sub \$8*32, $inl
  1752. jmp open_avx2_tail_loop
  1753. 3: \n";
  1754. ###############################################################################
  1755. # 384-512 bytes left
  1756. &prep_state_avx2(4); $code.="
  1757. xor $itr1, $itr1
  1758. mov $inp, $itr2
  1759. 1: \n";
  1760. &poly_add("0*8($itr2)");
  1761. &poly_mul(); $code.="
  1762. lea 2*8($itr2), $itr2
  1763. 2: \n";
  1764. &emit_body(37);
  1765. &poly_add("0*8($itr2)");
  1766. &poly_mul_mulx();
  1767. &emit_body(48);
  1768. &poly_add("2*8($itr2)");
  1769. &poly_mul_mulx(); $code.="
  1770. lea 4*8($itr2), $itr2\n";
  1771. foreach $l (@loop_body) {$code.=$l."\n";}
  1772. @loop_body = split /\n/, $chacha_body; $code.="
  1773. inc $itr1
  1774. cmp \$4, $itr1
  1775. jl 1b
  1776. cmp \$10, $itr1
  1777. jne 2b
  1778. mov $inl, $itr1
  1779. sub \$12*32, $itr1
  1780. and \$-16, $itr1
  1781. 1:
  1782. test $itr1, $itr1
  1783. je 1f\n";
  1784. &poly_add("0*8($itr2)");
  1785. &poly_mul_mulx(); $code.="
  1786. lea 2*8($itr2), $itr2
  1787. sub \$2*8, $itr1
  1788. jmp 1b
  1789. 1: \n";
  1790. &finalize_state_avx2(4); $code.="
  1791. vmovdqa $A0, $tmp_store\n";
  1792. &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
  1793. vmovdqa $tmp_store, $A0\n";
  1794. &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
  1795. &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
  1796. &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
  1797. lea 12*32($inp), $inp
  1798. lea 12*32($oup), $oup
  1799. sub \$12*32, $inl
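# At most 128 bytes are left and the key stream for them sits in four ymm registers.
# Consume 32 bytes per iteration, rotating the registers down after each store, then
# handle a possible 16-byte chunk below and leave the final partial block to
# open_sse_tail_16.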
  1800. open_avx2_tail_loop:
  1801. cmp \$32, $inl
  1802. jb open_avx2_tail
  1803. sub \$32, $inl
  1804. vpxor ($inp), $A0, $A0
  1805. vmovdqu $A0, ($oup)
  1806. lea 1*32($inp), $inp
  1807. lea 1*32($oup), $oup
  1808. vmovdqa $B0, $A0
  1809. vmovdqa $C0, $B0
  1810. vmovdqa $D0, $C0
  1811. jmp open_avx2_tail_loop
  1812. open_avx2_tail:
  1813. cmp \$16, $inl
  1814. vmovdqa $A0x, $A1x
  1815. jb 1f
  1816. sub \$16, $inl
1817. # Load + decrypt
  1818. vpxor ($inp), $A0x, $A1x
  1819. vmovdqu $A1x, ($oup)
  1820. lea 1*16($inp), $inp
  1821. lea 1*16($oup), $oup
  1822. vperm2i128 \$0x11, $A0, $A0, $A0
  1823. vmovdqa $A0x, $A1x
  1824. 1:
  1825. vzeroupper
  1826. jmp open_sse_tail_16
  1827. ###############################################################################
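# Special path for inputs of at most 192 bytes: two AVX2 states (four 64-byte
# ChaCha20 blocks) are computed, the first 32 bytes are clamped into the Poly1305
# key, the rest of that block is dropped and the other three blocks provide up to
# 192 bytes of key stream.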
  1828. open_avx2_192:
  1829. vmovdqa $A0, $A1
  1830. vmovdqa $A0, $A2
  1831. vmovdqa $B0, $B1
  1832. vmovdqa $B0, $B2
  1833. vmovdqa $C0, $C1
  1834. vmovdqa $C0, $C2
  1835. vpaddd .avx2_inc(%rip), $D0, $D1
  1836. vmovdqa $D0, $T2
  1837. vmovdqa $D1, $T3
  1838. mov \$10, $acc0
  1839. 1: \n";
  1840. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  1841. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  1842. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  1843. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
  1844. dec $acc0
  1845. jne 1b
  1846. vpaddd $A2, $A0, $A0
  1847. vpaddd $A2, $A1, $A1
  1848. vpaddd $B2, $B0, $B0
  1849. vpaddd $B2, $B1, $B1
  1850. vpaddd $C2, $C0, $C0
  1851. vpaddd $C2, $C1, $C1
  1852. vpaddd $T2, $D0, $D0
  1853. vpaddd $T3, $D1, $D1
  1854. vperm2i128 \$0x02, $A0, $B0, $T0
1855. # Clamp and store the Poly1305 key
  1856. vpand .clamp(%rip), $T0, $T0
  1857. vmovdqa $T0, $r_store
  1858. # Stream for up to 192 bytes
  1859. vperm2i128 \$0x13, $A0, $B0, $A0
  1860. vperm2i128 \$0x13, $C0, $D0, $B0
  1861. vperm2i128 \$0x02, $A1, $B1, $C0
  1862. vperm2i128 \$0x02, $C1, $D1, $D0
  1863. vperm2i128 \$0x13, $A1, $B1, $A1
  1864. vperm2i128 \$0x13, $C1, $D1, $B1
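# The whole message fits into the key stream prepared above: hash the additional
# data, then hash and decrypt 32 bytes per iteration, rotating the stream registers
# down as they are consumed.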
  1865. open_avx2_short:
  1866. mov %r8, $itr2
  1867. call poly_hash_ad_internal
  1868. open_avx2_hash_and_xor_loop:
  1869. cmp \$32, $inl
  1870. jb open_avx2_short_tail_32
  1871. sub \$32, $inl\n";
  1872. # Load + hash
  1873. &poly_add("0*8($inp)");
  1874. &poly_mul();
  1875. &poly_add("2*8($inp)");
  1876. &poly_mul(); $code.="
  1877. # Load + decrypt
  1878. vpxor ($inp), $A0, $A0
  1879. vmovdqu $A0, ($oup)
  1880. lea 1*32($inp), $inp
  1881. lea 1*32($oup), $oup
  1882. # Shift stream
  1883. vmovdqa $B0, $A0
  1884. vmovdqa $C0, $B0
  1885. vmovdqa $D0, $C0
  1886. vmovdqa $A1, $D0
  1887. vmovdqa $B1, $A1
  1888. vmovdqa $C1, $B1
  1889. vmovdqa $D1, $C1
  1890. vmovdqa $A2, $D1
  1891. vmovdqa $B2, $A2
  1892. jmp open_avx2_hash_and_xor_loop
  1893. open_avx2_short_tail_32:
  1894. cmp \$16, $inl
  1895. vmovdqa $A0x, $A1x
  1896. jb 1f
  1897. sub \$16, $inl\n";
  1898. &poly_add("0*8($inp)");
  1899. &poly_mul(); $code.="
  1900. vpxor ($inp), $A0x, $A3x
  1901. vmovdqu $A3x, ($oup)
  1902. lea 1*16($inp), $inp
  1903. lea 1*16($oup), $oup
  1904. vextracti128 \$1, $A0, $A1x
  1905. 1:
  1906. vzeroupper
  1907. jmp open_sse_tail_16
  1908. ###############################################################################
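# Special path for 193 to 320 bytes: three AVX2 states (six 64-byte ChaCha20 blocks)
# are computed, the first 32 bytes are clamped into the Poly1305 key, the rest of
# that block is dropped and the other five blocks provide up to 320 bytes of key
# stream.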
  1909. open_avx2_320:
  1910. vmovdqa $A0, $A1
  1911. vmovdqa $A0, $A2
  1912. vmovdqa $B0, $B1
  1913. vmovdqa $B0, $B2
  1914. vmovdqa $C0, $C1
  1915. vmovdqa $C0, $C2
  1916. vpaddd .avx2_inc(%rip), $D0, $D1
  1917. vpaddd .avx2_inc(%rip), $D1, $D2
  1918. vmovdqa $B0, $T1
  1919. vmovdqa $C0, $T2
  1920. vmovdqa $D0, $ctr0_store
  1921. vmovdqa $D1, $ctr1_store
  1922. vmovdqa $D2, $ctr2_store
  1923. mov \$10, $acc0
  1924. 1: \n";
  1925. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  1926. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  1927. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
  1928. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  1929. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
  1930. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  1931. dec $acc0
  1932. jne 1b
  1933. vpaddd .chacha20_consts(%rip), $A0, $A0
  1934. vpaddd .chacha20_consts(%rip), $A1, $A1
  1935. vpaddd .chacha20_consts(%rip), $A2, $A2
  1936. vpaddd $T1, $B0, $B0
  1937. vpaddd $T1, $B1, $B1
  1938. vpaddd $T1, $B2, $B2
  1939. vpaddd $T2, $C0, $C0
  1940. vpaddd $T2, $C1, $C1
  1941. vpaddd $T2, $C2, $C2
  1942. vpaddd $ctr0_store, $D0, $D0
  1943. vpaddd $ctr1_store, $D1, $D1
  1944. vpaddd $ctr2_store, $D2, $D2
  1945. vperm2i128 \$0x02, $A0, $B0, $T0
1946. # Clamp and store the Poly1305 key
  1947. vpand .clamp(%rip), $T0, $T0
  1948. vmovdqa $T0, $r_store
  1949. # Stream for up to 320 bytes
  1950. vperm2i128 \$0x13, $A0, $B0, $A0
  1951. vperm2i128 \$0x13, $C0, $D0, $B0
  1952. vperm2i128 \$0x02, $A1, $B1, $C0
  1953. vperm2i128 \$0x02, $C1, $D1, $D0
  1954. vperm2i128 \$0x13, $A1, $B1, $A1
  1955. vperm2i128 \$0x13, $C1, $D1, $B1
  1956. vperm2i128 \$0x02, $A2, $B2, $C1
  1957. vperm2i128 \$0x02, $C2, $D2, $D1
  1958. vperm2i128 \$0x13, $A2, $B2, $A2
  1959. vperm2i128 \$0x13, $C2, $D2, $B2
  1960. jmp open_avx2_short
  1961. .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
  1962. ###############################################################################
  1963. ###############################################################################
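# chacha20_poly1305_seal_avx2 encrypts first and hashes the resulting ciphertext
# afterwards, so throughout the function the Poly1305 state trails the bytes that
# have already been written to the output.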
  1964. .type chacha20_poly1305_seal_avx2,\@function,2
  1965. .align 64
  1966. chacha20_poly1305_seal_avx2:
  1967. vzeroupper
  1968. vmovdqa .chacha20_consts(%rip), $A0
  1969. vbroadcasti128 0*16($keyp), $B0
  1970. vbroadcasti128 1*16($keyp), $C0
  1971. vbroadcasti128 2*16($keyp), $D0
  1972. vpaddd .avx2_init(%rip), $D0, $D0
  1973. cmp \$6*32, $inl
  1974. jbe seal_avx2_192
  1975. cmp \$10*32, $inl
  1976. jbe seal_avx2_320
  1977. vmovdqa $A0, $A1
  1978. vmovdqa $A0, $A2
  1979. vmovdqa $A0, $A3
  1980. vmovdqa $B0, $B1
  1981. vmovdqa $B0, $B2
  1982. vmovdqa $B0, $B3
  1983. vmovdqa $B0, $state1_store
  1984. vmovdqa $C0, $C1
  1985. vmovdqa $C0, $C2
  1986. vmovdqa $C0, $C3
  1987. vmovdqa $C0, $state2_store
  1988. vmovdqa $D0, $D3
  1989. vpaddd .avx2_inc(%rip), $D3, $D2
  1990. vpaddd .avx2_inc(%rip), $D2, $D1
  1991. vpaddd .avx2_inc(%rip), $D1, $D0
  1992. vmovdqa $D0, $ctr0_store
  1993. vmovdqa $D1, $ctr1_store
  1994. vmovdqa $D2, $ctr2_store
  1995. vmovdqa $D3, $ctr3_store
  1996. mov \$10, $acc0
  1997. 1: \n";
  1998. foreach $l (@loop_body) {$code.=$l."\n";}
  1999. @loop_body = split /\n/, $chacha_body; $code.="
  2000. dec $acc0
  2001. jnz 1b\n";
  2002. &finalize_state_avx2(4); $code.="
  2003. vperm2i128 \$0x13, $C3, $D3, $C3
  2004. vperm2i128 \$0x02, $A3, $B3, $D3
  2005. vperm2i128 \$0x13, $A3, $B3, $A3
  2006. vpand .clamp(%rip), $D3, $D3
  2007. vmovdqa $D3, $r_store
  2008. mov %r8, $itr2
  2009. call poly_hash_ad_internal
2010. # With more than 320 bytes of input it is safe to write a full 320 bytes of ciphertext here; shorter inputs were already routed to the 192- and 320-byte paths above
  2011. vpxor 0*32($inp), $A3, $A3
  2012. vpxor 1*32($inp), $C3, $C3
  2013. vmovdqu $A3, 0*32($oup)
  2014. vmovdqu $C3, 1*32($oup)\n";
  2015. &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
  2016. &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
  2017. &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
  2018. lea 10*32($inp), $inp
  2019. sub \$10*32, $inl
  2020. mov \$10*32, $itr1
  2021. cmp \$4*32, $inl
  2022. jbe seal_avx2_hash
  2023. vpxor 0*32($inp), $A0, $A0
  2024. vpxor 1*32($inp), $B0, $B0
  2025. vpxor 2*32($inp), $C0, $C0
  2026. vpxor 3*32($inp), $D0, $D0
  2027. vmovdqu $A0, 10*32($oup)
  2028. vmovdqu $B0, 11*32($oup)
  2029. vmovdqu $C0, 12*32($oup)
  2030. vmovdqu $D0, 13*32($oup)
  2031. lea 4*32($inp), $inp
  2032. sub \$4*32, $inl
  2033. mov \$8, $itr1
  2034. mov \$2, $itr2
  2035. cmp \$4*32, $inl
  2036. jbe seal_avx2_tail_128
  2037. cmp \$8*32, $inl
  2038. jbe seal_avx2_tail_256
  2039. cmp \$12*32, $inl
  2040. jbe seal_avx2_tail_384
  2041. cmp \$16*32, $inl
  2042. jbe seal_avx2_tail_512\n";
2043. # 448 bytes of ciphertext have been written but not yet hashed, while the main loop below hashes 512 bytes per iteration - run part of the first iteration's rounds here so the hashing keeps pace once the loop is entered
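# In the steady-state loop that follows, every iteration encrypts 16*32 = 512 bytes
# and absorbs the 512 bytes of ciphertext produced by the previous iteration, so the
# hash input pointer always trails the encryption output.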
  2044. &prep_state_avx2(4);
  2045. foreach $l (@loop_body) {$code.=$l."\n";}
  2046. @loop_body = split /\n/, $chacha_body;
  2047. &emit_body(41);
  2048. @loop_body = split /\n/, $chacha_body; $code.="
  2049. sub \$16, $oup
  2050. mov \$9, $itr1
  2051. jmp 4f
  2052. 1: \n";
  2053. &prep_state_avx2(4); $code.="
  2054. mov \$10, $itr1
  2055. 2: \n";
  2056. &poly_add("0*8($oup)");
  2057. &emit_body(10);
  2058. &poly_stage1_mulx();
  2059. &emit_body(9);
  2060. &poly_stage2_mulx();
  2061. &emit_body(12);
  2062. &poly_stage3_mulx();
  2063. &emit_body(10);
  2064. &poly_reduce_stage(); $code.="
  2065. 4: \n";
  2066. &emit_body(9);
  2067. &poly_add("2*8($oup)");
  2068. &emit_body(8);
  2069. &poly_stage1_mulx();
  2070. &emit_body(18);
  2071. &poly_stage2_mulx();
  2072. &emit_body(18);
  2073. &poly_stage3_mulx();
  2074. &emit_body(9);
  2075. &poly_reduce_stage();
  2076. &emit_body(8);
  2077. &poly_add("4*8($oup)"); $code.="
  2078. lea 6*8($oup), $oup\n";
  2079. &emit_body(18);
  2080. &poly_stage1_mulx();
  2081. &emit_body(8);
  2082. &poly_stage2_mulx();
  2083. &emit_body(8);
  2084. &poly_stage3_mulx();
  2085. &emit_body(18);
  2086. &poly_reduce_stage();
  2087. foreach $l (@loop_body) {$code.=$l."\n";}
  2088. @loop_body = split /\n/, $chacha_body; $code.="
  2089. dec $itr1
  2090. jne 2b\n";
  2091. &finalize_state_avx2(4); $code.="
  2092. lea 4*8($oup), $oup
  2093. vmovdqa $A0, $tmp_store\n";
  2094. &poly_add("-4*8($oup)");
  2095. &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
  2096. vmovdqa $tmp_store, $A0\n";
  2097. &poly_mul();
  2098. &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
  2099. &poly_add("-2*8($oup)");
  2100. &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
  2101. &poly_mul();
  2102. &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
  2103. lea 16*32($inp), $inp
  2104. sub \$16*32, $inl
  2105. cmp \$16*32, $inl
  2106. jg 1b\n";
  2107. &poly_add("0*8($oup)");
  2108. &poly_mul();
  2109. &poly_add("2*8($oup)");
  2110. &poly_mul(); $code.="
  2111. lea 4*8($oup), $oup
  2112. mov \$10, $itr1
  2113. xor $itr2, $itr2
  2114. cmp \$4*32, $inl
  2115. ja 3f
  2116. ###############################################################################
  2117. seal_avx2_tail_128:\n";
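# The seal_avx2_tail_* blocks finish the pending hashing while the last key stream is
# generated: $itr1 counts round iterations that absorb 48 bytes of ciphertext each and
# $itr2 counts extra iterations that absorb 32 bytes each. The two entry points set
# them to (10, 0) and (8, 2), matching the 480 or 448 bytes that are still unhashed.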
  2118. &prep_state_avx2(1); $code.="
  2119. 1: \n";
  2120. &poly_add("0($oup)");
  2121. &poly_mul(); $code.="
  2122. lea 2*8($oup), $oup
  2123. 2: \n";
  2124. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  2125. &poly_add("0*8($oup)");
  2126. &poly_mul();
  2127. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  2128. &poly_add("2*8($oup)");
  2129. &poly_mul(); $code.="
  2130. lea 4*8($oup), $oup
  2131. dec $itr1
  2132. jg 1b
  2133. dec $itr2
  2134. jge 2b\n";
  2135. &finalize_state_avx2(1);
  2136. &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
  2137. jmp seal_avx2_short_loop
  2138. 3:
  2139. cmp \$8*32, $inl
  2140. ja 3f
  2141. ###############################################################################
  2142. seal_avx2_tail_256:\n";
  2143. &prep_state_avx2(2); $code.="
  2144. 1: \n";
  2145. &poly_add("0($oup)");
  2146. &poly_mul(); $code.="
  2147. lea 2*8($oup), $oup
  2148. 2: \n";
  2149. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  2150. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  2151. &poly_add("0*8($oup)");
  2152. &poly_mul();
  2153. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  2154. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
  2155. &poly_add("2*8($oup)");
  2156. &poly_mul(); $code.="
  2157. lea 4*8($oup), $oup
  2158. dec $itr1
  2159. jg 1b
  2160. dec $itr2
  2161. jge 2b\n";
  2162. &finalize_state_avx2(2);
  2163. &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
  2164. &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
  2165. mov \$4*32, $itr1
  2166. lea 4*32($inp), $inp
  2167. sub \$4*32, $inl
  2168. jmp seal_avx2_hash
  2169. 3:
  2170. cmp \$12*32, $inl
  2171. ja seal_avx2_tail_512
  2172. ###############################################################################
  2173. seal_avx2_tail_384:\n";
  2174. &prep_state_avx2(3); $code.="
  2175. 1: \n";
  2176. &poly_add("0($oup)");
  2177. &poly_mul(); $code.="
  2178. lea 2*8($oup), $oup
  2179. 2: \n";
  2180. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  2181. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  2182. &poly_add("0*8($oup)");
  2183. &poly_mul();
  2184. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
  2185. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  2186. &poly_add("2*8($oup)");
  2187. &poly_mul();
  2188. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
  2189. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  2190. lea 4*8($oup), $oup
  2191. dec $itr1
  2192. jg 1b
  2193. dec $itr2
  2194. jge 2b\n";
  2195. &finalize_state_avx2(3);
  2196. &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
  2197. &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
  2198. &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
  2199. mov \$8*32, $itr1
  2200. lea 8*32($inp), $inp
  2201. sub \$8*32, $inl
  2202. jmp seal_avx2_hash
  2203. ###############################################################################
  2204. seal_avx2_tail_512:\n";
  2205. &prep_state_avx2(4); $code.="
  2206. 1: \n";
  2207. &poly_add("0($oup)");
  2208. &poly_mul_mulx(); $code.="
  2209. lea 2*8($oup), $oup
  2210. 2: \n";
  2211. &emit_body(20);
  2212. &poly_add("0*8($oup)");
  2213. &emit_body(20);
  2214. &poly_stage1_mulx();
  2215. &emit_body(20);
  2216. &poly_stage2_mulx();
  2217. &emit_body(20);
  2218. &poly_stage3_mulx();
  2219. &emit_body(20);
  2220. &poly_reduce_stage();
  2221. &emit_body(20);
  2222. &poly_add("2*8($oup)");
  2223. &emit_body(20);
  2224. &poly_stage1_mulx();
  2225. &emit_body(20);
  2226. &poly_stage2_mulx();
  2227. &emit_body(20);
  2228. &poly_stage3_mulx();
  2229. &emit_body(20);
  2230. &poly_reduce_stage();
  2231. foreach $l (@loop_body) {$code.=$l."\n";}
  2232. @loop_body = split /\n/, $chacha_body; $code.="
  2233. lea 4*8($oup), $oup
  2234. dec $itr1
  2235. jg 1b
  2236. dec $itr2
  2237. jge 2b\n";
  2238. &finalize_state_avx2(4); $code.="
  2239. vmovdqa $A0, $tmp_store\n";
  2240. &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
  2241. vmovdqa $tmp_store, $A0\n";
  2242. &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
  2243. &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
  2244. &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
  2245. mov \$12*32, $itr1
  2246. lea 12*32($inp), $inp
  2247. sub \$12*32, $inl
  2248. jmp seal_avx2_hash
  2249. ################################################################################
  2250. seal_avx2_320:
  2251. vmovdqa $A0, $A1
  2252. vmovdqa $A0, $A2
  2253. vmovdqa $B0, $B1
  2254. vmovdqa $B0, $B2
  2255. vmovdqa $C0, $C1
  2256. vmovdqa $C0, $C2
  2257. vpaddd .avx2_inc(%rip), $D0, $D1
  2258. vpaddd .avx2_inc(%rip), $D1, $D2
  2259. vmovdqa $B0, $T1
  2260. vmovdqa $C0, $T2
  2261. vmovdqa $D0, $ctr0_store
  2262. vmovdqa $D1, $ctr1_store
  2263. vmovdqa $D2, $ctr2_store
  2264. mov \$10, $acc0
  2265. 1: \n";
  2266. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  2267. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  2268. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
  2269. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  2270. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
  2271. &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
  2272. dec $acc0
  2273. jne 1b
  2274. vpaddd .chacha20_consts(%rip), $A0, $A0
  2275. vpaddd .chacha20_consts(%rip), $A1, $A1
  2276. vpaddd .chacha20_consts(%rip), $A2, $A2
  2277. vpaddd $T1, $B0, $B0
  2278. vpaddd $T1, $B1, $B1
  2279. vpaddd $T1, $B2, $B2
  2280. vpaddd $T2, $C0, $C0
  2281. vpaddd $T2, $C1, $C1
  2282. vpaddd $T2, $C2, $C2
  2283. vpaddd $ctr0_store, $D0, $D0
  2284. vpaddd $ctr1_store, $D1, $D1
  2285. vpaddd $ctr2_store, $D2, $D2
  2286. vperm2i128 \$0x02, $A0, $B0, $T0
2287. # Clamp and store the Poly1305 key
  2288. vpand .clamp(%rip), $T0, $T0
  2289. vmovdqa $T0, $r_store
  2290. # Stream for up to 320 bytes
  2291. vperm2i128 \$0x13, $A0, $B0, $A0
  2292. vperm2i128 \$0x13, $C0, $D0, $B0
  2293. vperm2i128 \$0x02, $A1, $B1, $C0
  2294. vperm2i128 \$0x02, $C1, $D1, $D0
  2295. vperm2i128 \$0x13, $A1, $B1, $A1
  2296. vperm2i128 \$0x13, $C1, $D1, $B1
  2297. vperm2i128 \$0x02, $A2, $B2, $C1
  2298. vperm2i128 \$0x02, $C2, $D2, $D1
  2299. vperm2i128 \$0x13, $A2, $B2, $A2
  2300. vperm2i128 \$0x13, $C2, $D2, $B2
  2301. jmp seal_avx2_short
  2302. ################################################################################
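# Seal path for inputs of at most 192 bytes, the mirror image of open_avx2_192:
# four 64-byte blocks of key stream are produced, of which the first 32 bytes are
# clamped into the Poly1305 key and up to 192 bytes encrypt the data.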
  2303. seal_avx2_192:
  2304. vmovdqa $A0, $A1
  2305. vmovdqa $A0, $A2
  2306. vmovdqa $B0, $B1
  2307. vmovdqa $B0, $B2
  2308. vmovdqa $C0, $C1
  2309. vmovdqa $C0, $C2
  2310. vpaddd .avx2_inc(%rip), $D0, $D1
  2311. vmovdqa $D0, $T2
  2312. vmovdqa $D1, $T3
  2313. mov \$10, $acc0
  2314. 1: \n";
  2315. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
  2316. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
  2317. &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
  2318. &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
  2319. dec $acc0
  2320. jne 1b
  2321. vpaddd $A2, $A0, $A0
  2322. vpaddd $A2, $A1, $A1
  2323. vpaddd $B2, $B0, $B0
  2324. vpaddd $B2, $B1, $B1
  2325. vpaddd $C2, $C0, $C0
  2326. vpaddd $C2, $C1, $C1
  2327. vpaddd $T2, $D0, $D0
  2328. vpaddd $T3, $D1, $D1
  2329. vperm2i128 \$0x02, $A0, $B0, $T0
2330. # Clamp and store the Poly1305 key
  2331. vpand .clamp(%rip), $T0, $T0
  2332. vmovdqa $T0, $r_store
  2333. # Stream for up to 192 bytes
  2334. vperm2i128 \$0x13, $A0, $B0, $A0
  2335. vperm2i128 \$0x13, $C0, $D0, $B0
  2336. vperm2i128 \$0x02, $A1, $B1, $C0
  2337. vperm2i128 \$0x02, $C1, $D1, $D0
  2338. vperm2i128 \$0x13, $A1, $B1, $A1
  2339. vperm2i128 \$0x13, $C1, $D1, $B1
  2340. seal_avx2_short:
  2341. mov %r8, $itr2
  2342. call poly_hash_ad_internal
  2343. xor $itr1, $itr1
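# seal_avx2_hash absorbs ciphertext that has already been written but not hashed yet
# (the remaining count is carried in itr1), then falls through to the short
# encrypt-and-hash loop.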
  2344. seal_avx2_hash:
  2345. cmp \$16, $itr1
  2346. jb seal_avx2_short_loop\n";
  2347. &poly_add("0($oup)");
  2348. &poly_mul(); $code.="
  2349. sub \$16, $itr1
  2350. add \$16, $oup
  2351. jmp seal_avx2_hash
  2352. seal_avx2_short_loop:
  2353. cmp \$32, $inl
  2354. jb seal_avx2_short_tail
  2355. sub \$32, $inl
  2356. # Encrypt
  2357. vpxor ($inp), $A0, $A0
  2358. vmovdqu $A0, ($oup)
  2359. lea 1*32($inp), $inp
  2360. # Load + hash\n";
  2361. &poly_add("0*8($oup)");
  2362. &poly_mul();
  2363. &poly_add("2*8($oup)");
  2364. &poly_mul(); $code.="
  2365. lea 1*32($oup), $oup
  2366. # Shift stream
  2367. vmovdqa $B0, $A0
  2368. vmovdqa $C0, $B0
  2369. vmovdqa $D0, $C0
  2370. vmovdqa $A1, $D0
  2371. vmovdqa $B1, $A1
  2372. vmovdqa $C1, $B1
  2373. vmovdqa $D1, $C1
  2374. vmovdqa $A2, $D1
  2375. vmovdqa $B2, $A2
  2376. jmp seal_avx2_short_loop
  2377. seal_avx2_short_tail:
  2378. cmp \$16, $inl
  2379. jb 1f
  2380. sub \$16, $inl
  2381. vpxor ($inp), $A0x, $A3x
  2382. vmovdqu $A3x, ($oup)
  2383. lea 1*16($inp), $inp\n";
  2384. &poly_add("0*8($oup)");
  2385. &poly_mul(); $code.="
  2386. lea 1*16($oup), $oup
  2387. vextracti128 \$1, $A0, $A0x
  2388. 1:
  2389. vzeroupper
  2390. jmp seal_sse_tail_16
  2391. .cfi_endproc
  2392. ";
  2393. }
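# The Win64 calling convention is not implemented by this code, so for Windows builds
# only a dummy symbol is emitted in place of the real functions.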
  2394. if (!$win64) {
  2395. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  2396. print $code;
  2397. } else {
  2398. print <<___;
  2399. .globl dummy_chacha20_poly1305_asm
  2400. .type dummy_chacha20_poly1305_asm,\@abi-omnipotent
  2401. dummy_chacha20_poly1305_asm:
  2402. ret
  2403. ___
  2404. }
  2405. close STDOUT;