  1. #!/usr/bin/env perl
  2. # Copyright (c) 2017, Shay Gueron.
  3. # Copyright (c) 2017, Google Inc.
  4. #
  5. # Permission to use, copy, modify, and/or distribute this software for any
  6. # purpose with or without fee is hereby granted, provided that the above
  7. # copyright notice and this permission notice appear in all copies.
  8. #
  9. # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10. # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11. # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  12. # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13. # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
  14. # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  15. # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
  16. use warnings FATAL => 'all';
  17. $flavour = shift;
  18. $output = shift;
  19. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  20. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  21. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  22. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  23. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  24. die "can't locate x86_64-xlate.pl";
  25. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  26. *STDOUT=*OUT;
  27. $code.=<<___;
  28. .data
  29. .align 16
  30. one:
  31. .quad 1,0
  32. two:
  33. .quad 2,0
  34. three:
  35. .quad 3,0
  36. four:
  37. .quad 4,0
  38. five:
  39. .quad 5,0
  40. six:
  41. .quad 6,0
  42. seven:
  43. .quad 7,0
  44. eight:
  45. .quad 8,0
  46. OR_MASK:
  47. .long 0x00000000,0x00000000,0x00000000,0x80000000
  48. poly:
  49. .quad 0x1, 0xc200000000000000
  50. mask:
  51. .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
  52. con1:
  53. .long 1,1,1,1
  54. con2:
  55. .long 0x1b,0x1b,0x1b,0x1b
  56. con3:
  57. .byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
  58. and_mask:
  59. .long 0,0xffffffff, 0xffffffff, 0xffffffff
  60. ___
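# Notes on the constants above: one..eight hold 32-bit increments for the
# little-endian counter lane used by the CTR loops; OR_MASK sets the most
# significant bit of a tag to form the initial counter block; poly encodes the
# POLYVAL reduction polynomial x^128 + x^127 + x^126 + x^121 + 1 used by the
# vpclmulqdq folding steps; mask, con1, con2 and con3 are the byte-shuffle
# pattern and round constants for the AES-NI key-expansion sequences; and
# and_mask clears the counter lane of the nonce block in the KDF routine.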
  61. $code.=<<___;
  62. .text
  63. ___
  64. sub gfmul {
  65. #########################
  66. # a = T
  67. # b = TMP0 - remains unchanged
  68. # res = T
  69. # uses also TMP1,TMP2,TMP3,TMP4
  70. # __m128i GFMUL(__m128i A, __m128i B);
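# GFMUL implements the POLYVAL field multiplication of RFC 8452 (the "dot"
# operation):
#
#   GFMUL(A, B) = A * B * x^-128  mod  (x^128 + x^127 + x^126 + x^121 + 1)
#
# over GF(2)[x], with each 16-byte block read as a little-endian polynomial.
# The four vpclmulqdq instructions build the 256-bit schoolbook product and
# the two folds against poly(%rip) perform the division by x^128 modulo the
# reduction polynomial.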
  71. my $T = "%xmm0";
  72. my $TMP0 = "%xmm1";
  73. my $TMP1 = "%xmm2";
  74. my $TMP2 = "%xmm3";
  75. my $TMP3 = "%xmm4";
  76. my $TMP4 = "%xmm5";
  77. $code.=<<___;
  78. .type GFMUL,\@abi-omnipotent
  79. .align 16
  80. GFMUL:
  81. .cfi_startproc
  82. vpclmulqdq \$0x00, $TMP0, $T, $TMP1
  83. vpclmulqdq \$0x11, $TMP0, $T, $TMP4
  84. vpclmulqdq \$0x10, $TMP0, $T, $TMP2
  85. vpclmulqdq \$0x01, $TMP0, $T, $TMP3
  86. vpxor $TMP3, $TMP2, $TMP2
  87. vpslldq \$8, $TMP2, $TMP3
  88. vpsrldq \$8, $TMP2, $TMP2
  89. vpxor $TMP3, $TMP1, $TMP1
  90. vpxor $TMP2, $TMP4, $TMP4
  91. vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
  92. vpshufd \$78, $TMP1, $TMP3
  93. vpxor $TMP3, $TMP2, $TMP1
  94. vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
  95. vpshufd \$78, $TMP1, $TMP3
  96. vpxor $TMP3, $TMP2, $TMP1
  97. vpxor $TMP4, $TMP1, $T
  98. ret
  99. .cfi_endproc
  100. .size GFMUL, .-GFMUL
  101. ___
  102. }
  103. gfmul();
  104. sub aesgcmsiv_htable_init {
  105. # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
  106. # |out_htable|.
  107. # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
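# The table layout is Htbl[i] = H^(i+1) under the dot operation above; a
# sketch of the same construction in Perl (gfmul() standing in for GFMUL):
#
#   $htbl[0] = $H;
#   for my $i (1 .. 7) { $htbl[$i] = gfmul($htbl[$i - 1], $H); }   # Htbl[i] = H^(i+1)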
  108. my $Htbl = "%rdi";
  109. my $H = "%rsi";
  110. my $T = "%xmm0";
  111. my $TMP0 = "%xmm1";
  112. $code.=<<___;
  113. .globl aesgcmsiv_htable_init
  114. .type aesgcmsiv_htable_init,\@function,2
  115. .align 16
  116. aesgcmsiv_htable_init:
  117. .cfi_startproc
  118. vmovdqa ($H), $T
  119. vmovdqa $T, $TMP0
  120. vmovdqa $T, ($Htbl) # H
  121. call GFMUL
  122. vmovdqa $T, 16($Htbl) # H^2
  123. call GFMUL
  124. vmovdqa $T, 32($Htbl) # H^3
  125. call GFMUL
  126. vmovdqa $T, 48($Htbl) # H^4
  127. call GFMUL
  128. vmovdqa $T, 64($Htbl) # H^5
  129. call GFMUL
  130. vmovdqa $T, 80($Htbl) # H^6
  131. call GFMUL
  132. vmovdqa $T, 96($Htbl) # H^7
  133. call GFMUL
  134. vmovdqa $T, 112($Htbl) # H^8
  135. ret
  136. .cfi_endproc
  137. .size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
  138. ___
  139. }
  140. aesgcmsiv_htable_init();
  141. sub aesgcmsiv_htable6_init {
  142. # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
  143. # |out_htable|.
  144. # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
  145. #
  146. my $Htbl = "%rdi";
  147. my $H = "%rsi";
  148. my $T = "%xmm0";
  149. my $TMP0 = "%xmm1";
  150. $code.=<<___;
  151. .globl aesgcmsiv_htable6_init
  152. .type aesgcmsiv_htable6_init,\@function,2
  153. .align 16
  154. aesgcmsiv_htable6_init:
  155. .cfi_startproc
  156. vmovdqa ($H), $T
  157. vmovdqa $T, $TMP0
  158. vmovdqa $T, ($Htbl) # H
  159. call GFMUL
  160. vmovdqa $T, 16($Htbl) # H^2
  161. call GFMUL
  162. vmovdqa $T, 32($Htbl) # H^3
  163. call GFMUL
  164. vmovdqa $T, 48($Htbl) # H^4
  165. call GFMUL
  166. vmovdqa $T, 64($Htbl) # H^5
  167. call GFMUL
  168. vmovdqa $T, 80($Htbl) # H^6
  169. ret
  170. .cfi_endproc
  171. .size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
  172. ___
  173. }
  174. aesgcmsiv_htable6_init();
  175. sub aesgcmsiv_htable_polyval {
  176. # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
  177. # parameter 1: %rdi Htable - pointer to Htable
  178. # parameter 2: %rsi INp - pointer to input
  179. # parameter 3: %rdx LEN - length of BUFFER in bytes
  180. # parameter 4: %rcx T - pointer to POLYVAL output
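# Functionally this updates the POLYVAL accumulator at T over LEN bytes of
# input, exactly like the Horner loop in aesgcmsiv_polyval_horner below, but
# it aggregates up to eight blocks per reduction using the table of powers:
#
#   T = dot(T xor X_0, H^8) xor dot(X_1, H^7) xor ... xor dot(X_7, H^1)
#
# for each group of eight blocks X_0..X_7; the leading n%8 blocks are folded
# in first with the matching lower powers.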
  181. my $DATA = "%xmm0";
  182. my $hlp0 = "%r11";
  183. my $Htbl = "%rdi";
  184. my $inp = "%rsi";
  185. my $len = "%rdx";
  186. my $TMP0 = "%xmm3";
  187. my $TMP1 = "%xmm4";
  188. my $TMP2 = "%xmm5";
  189. my $TMP3 = "%xmm6";
  190. my $TMP4 = "%xmm7";
  191. my $Tp = "%rcx";
  192. my $T = "%xmm1";
  193. my $Xhi = "%xmm9";
  194. my $SCHOOLBOOK_AAD = sub {
  195. my ($i)=@_;
  196. return <<___;
  197. vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  198. vpxor $TMP3, $TMP2, $TMP2
  199. vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  200. vpxor $TMP3, $TMP0, $TMP0
  201. vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  202. vpxor $TMP3, $TMP1, $TMP1
  203. vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  204. vpxor $TMP3, $TMP2, $TMP2
  205. ___
  206. };
  207. $code.=<<___;
  208. .globl aesgcmsiv_htable_polyval
  209. .type aesgcmsiv_htable_polyval,\@function,4
  210. .align 16
  211. aesgcmsiv_htable_polyval:
  212. .cfi_startproc
  213. test $len, $len
  214. jnz .Lhtable_polyval_start
  215. ret
  216. .Lhtable_polyval_start:
  217. vzeroall
  218. # We hash 8 blocks each iteration. If the total number of blocks is not a
  219. # multiple of 8, we first hash the leading n%8 blocks.
  220. movq $len, $hlp0
  221. andq \$127, $hlp0
  222. jz .Lhtable_polyval_no_prefix
  223. vpxor $Xhi, $Xhi, $Xhi
  224. vmovdqa ($Tp), $T
  225. sub $hlp0, $len
  226. sub \$16, $hlp0
  227. # hash first prefix block
  228. vmovdqu ($inp), $DATA
  229. vpxor $T, $DATA, $DATA
  230. vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
  231. vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
  232. vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
  233. vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
  234. vpxor $TMP3, $TMP2, $TMP2
  235. lea 16($inp), $inp
  236. test $hlp0, $hlp0
  237. jnz .Lhtable_polyval_prefix_loop
  238. jmp .Lhtable_polyval_prefix_complete
  239. # hash remaining prefix blocks (up to 7 total prefix blocks)
  240. .align 64
  241. .Lhtable_polyval_prefix_loop:
  242. sub \$16, $hlp0
  243. vmovdqu ($inp), $DATA # next data block
  244. vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
  245. vpxor $TMP3, $TMP0, $TMP0
  246. vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
  247. vpxor $TMP3, $TMP1, $TMP1
  248. vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
  249. vpxor $TMP3, $TMP2, $TMP2
  250. vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
  251. vpxor $TMP3, $TMP2, $TMP2
  252. test $hlp0, $hlp0
  253. lea 16($inp), $inp
  254. jnz .Lhtable_polyval_prefix_loop
  255. .Lhtable_polyval_prefix_complete:
  256. vpsrldq \$8, $TMP2, $TMP3
  257. vpslldq \$8, $TMP2, $TMP2
  258. vpxor $TMP3, $TMP1, $Xhi
  259. vpxor $TMP2, $TMP0, $T
  260. jmp .Lhtable_polyval_main_loop
  261. .Lhtable_polyval_no_prefix:
  262. # At this point we know the number of blocks is a multiple of 8. However,
  263. # the reduction in the main loop includes a multiplication by x^(-128). In
  264. # order to counter this, the existing tag needs to be multiplied by x^128.
  265. # In practice, this just means that it is loaded into $Xhi, not $T.
  266. vpxor $T, $T, $T
  267. vmovdqa ($Tp), $Xhi
  268. .align 64
  269. .Lhtable_polyval_main_loop:
  270. sub \$0x80, $len
  271. jb .Lhtable_polyval_out
  272. vmovdqu 16*7($inp), $DATA # Ii
  273. vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
  274. vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
  275. vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
  276. vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
  277. vpxor $TMP3, $TMP2, $TMP2
  278. #########################################################
  279. vmovdqu 16*6($inp), $DATA
  280. ${\$SCHOOLBOOK_AAD->(1)}
  281. #########################################################
  282. vmovdqu 16*5($inp), $DATA
  283. vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
  284. vpalignr \$8, $T, $T, $T
  285. ${\$SCHOOLBOOK_AAD->(2)}
  286. vpxor $TMP4, $T, $T # reduction stage 1b
  287. #########################################################
  288. vmovdqu 16*4($inp), $DATA
  289. ${\$SCHOOLBOOK_AAD->(3)}
  290. #########################################################
  291. vmovdqu 16*3($inp), $DATA
  292. vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
  293. vpalignr \$8, $T, $T, $T
  294. ${\$SCHOOLBOOK_AAD->(4)}
  295. vpxor $TMP4, $T, $T # reduction stage 2b
  296. #########################################################
  297. vmovdqu 16*2($inp), $DATA
  298. ${\$SCHOOLBOOK_AAD->(5)}
  299. vpxor $Xhi, $T, $T # reduction finalize
  300. #########################################################
  301. vmovdqu 16*1($inp), $DATA
  302. ${\$SCHOOLBOOK_AAD->(6)}
  303. #########################################################
  304. vmovdqu 16*0($inp), $DATA
  305. vpxor $T, $DATA, $DATA
  306. ${\$SCHOOLBOOK_AAD->(7)}
  307. #########################################################
  308. vpsrldq \$8, $TMP2, $TMP3
  309. vpslldq \$8, $TMP2, $TMP2
  310. vpxor $TMP3, $TMP1, $Xhi
  311. vpxor $TMP2, $TMP0, $T
  312. lea 16*8($inp), $inp
  313. jmp .Lhtable_polyval_main_loop
  314. #########################################################
  315. .Lhtable_polyval_out:
  316. vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
  317. vpalignr \$8, $T, $T, $T
  318. vpxor $TMP3, $T, $T
  319. vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
  320. vpalignr \$8, $T, $T, $T
  321. vpxor $TMP3, $T, $T
  322. vpxor $Xhi, $T, $T
  323. vmovdqu $T, ($Tp)
  324. vzeroupper
  325. ret
  326. .cfi_endproc
  327. .size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
  328. ___
  329. }
  330. aesgcmsiv_htable_polyval();
  331. sub aesgcmsiv_polyval_horner {
  332. #void aesgcmsiv_polyval_horner(unsigned char T[16], // output
  333. # const unsigned char* H, // H
  334. # unsigned char* BUF, // Buffer
  335. # unsigned int blocks); // Len2
  336. #
  337. # parameter 1: %rdi T - pointer to POLYVAL output
  338. # parameter 2: %rsi Hp - pointer to H (user key)
  339. # parameter 3: %rdx INp - pointer to input
  340. # parameter 4: %rcx L - total number of blocks in input BUFFER
  341. #
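# A scalar sketch of the same computation (gfmul() stands in for GFMUL above;
# load16/xor16/store16 are hypothetical helpers over 16-byte blocks):
#
#   my $res = load16($T);
#   my $h   = load16($Hp);
#   for my $i (0 .. $blocks - 1) {
#       $res = gfmul(xor16($res, load16($INp, 16 * $i)), $h);
#   }
#   store16($T, $res);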
  342. my $T = "%rdi";
  343. my $Hp = "%rsi";
  344. my $INp = "%rdx";
  345. my $L = "%rcx";
  346. my $LOC = "%r10";
  347. my $LEN = "%eax";
  348. my $H = "%xmm1";
  349. my $RES = "%xmm0";
  350. $code.=<<___;
  351. .globl aesgcmsiv_polyval_horner
  352. .type aesgcmsiv_polyval_horner,\@function,4
  353. .align 16
  354. aesgcmsiv_polyval_horner:
  355. .cfi_startproc
  356. test $L, $L
  357. jnz .Lpolyval_horner_start
  358. ret
  359. .Lpolyval_horner_start:
  360. # Evaluate POLYVAL over the buffer with one GFMUL per 16-byte block (Horner):
  361. # RES = GFMUL(RES xor Xi, H) for each block Xi
  362. xorq $LOC, $LOC
  363. shlq \$4, $L # L contains number of bytes to process
  364. vmovdqa ($Hp), $H
  365. vmovdqa ($T), $RES
  366. .Lpolyval_horner_loop:
  367. vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi
  368. call GFMUL # RES = RES * H
  369. add \$16, $LOC
  370. cmp $LOC, $L
  371. jne .Lpolyval_horner_loop
  372. # calculation of T is complete. RES=T
  373. vmovdqa $RES, ($T)
  374. ret
  375. .cfi_endproc
  376. .size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
  377. ___
  378. }
  379. aesgcmsiv_polyval_horner();
  380. # void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
  381. # parameter 1: %rdi
  382. # parameter 2: %rsi
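# The expansion uses the standard AES-NI key-schedule trick: vpshufb with the
# mask constant broadcasts the rotated last word of the previous round key
# into all four lanes, vaesenclast against the round constant in %xmm0 then
# applies SubBytes and XORs in Rcon (its ShiftRows step is a no-op because the
# four lanes are identical), and the vpslldq/vpxor chains propagate the
# earlier words of the round key; the round constant is doubled with vpslld
# each iteration and switched to con2 (0x1b) for round 9, doubling again to
# 0x36 for round 10.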
  383. $code.=<<___;
  384. .globl aes128gcmsiv_aes_ks
  385. .type aes128gcmsiv_aes_ks,\@function,2
  386. .align 16
  387. aes128gcmsiv_aes_ks:
  388. .cfi_startproc
  389. vmovdqu (%rdi), %xmm1 # xmm1 = user key
  390. vmovdqa %xmm1, (%rsi) # rsi points to output
  391. vmovdqa con1(%rip), %xmm0
  392. vmovdqa mask(%rip), %xmm15
  393. movq \$8, %rax
  394. .Lks128_loop:
  395. addq \$16, %rsi # advance rsi to the next round key
  396. subq \$1, %rax
  397. vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
  398. vaesenclast %xmm0, %xmm2, %xmm2
  399. vpslld \$1, %xmm0, %xmm0
  400. vpslldq \$4, %xmm1, %xmm3
  401. vpxor %xmm3, %xmm1, %xmm1
  402. vpslldq \$4, %xmm3, %xmm3
  403. vpxor %xmm3, %xmm1, %xmm1
  404. vpslldq \$4, %xmm3, %xmm3
  405. vpxor %xmm3, %xmm1, %xmm1
  406. vpxor %xmm2, %xmm1, %xmm1
  407. vmovdqa %xmm1, (%rsi)
  408. jne .Lks128_loop
  409. vmovdqa con2(%rip), %xmm0
  410. vpshufb %xmm15, %xmm1, %xmm2
  411. vaesenclast %xmm0, %xmm2, %xmm2
  412. vpslld \$1, %xmm0, %xmm0
  413. vpslldq \$4, %xmm1, %xmm3
  414. vpxor %xmm3, %xmm1, %xmm1
  415. vpslldq \$4, %xmm3, %xmm3
  416. vpxor %xmm3, %xmm1, %xmm1
  417. vpslldq \$4, %xmm3, %xmm3
  418. vpxor %xmm3, %xmm1, %xmm1
  419. vpxor %xmm2, %xmm1, %xmm1
  420. vmovdqa %xmm1, 16(%rsi)
  421. vpshufb %xmm15, %xmm1, %xmm2
  422. vaesenclast %xmm0, %xmm2, %xmm2
  423. vpslldq \$4, %xmm1, %xmm3
  424. vpxor %xmm3, %xmm1, %xmm1
  425. vpslldq \$4, %xmm3, %xmm3
  426. vpxor %xmm3, %xmm1, %xmm1
  427. vpslldq \$4, %xmm3, %xmm3
  428. vpxor %xmm3, %xmm1, %xmm1
  429. vpxor %xmm2, %xmm1, %xmm1
  430. vmovdqa %xmm1, 32(%rsi)
  431. ret
  432. .cfi_endproc
  433. .size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
  434. ___
  435. # void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
  436. # parameter 1: %rdi
  437. # parameter 2: %rsi
  438. $code.=<<___;
  439. .globl aes256gcmsiv_aes_ks
  440. .type aes256gcmsiv_aes_ks,\@function,2
  441. .align 16
  442. aes256gcmsiv_aes_ks:
  443. .cfi_startproc
  444. vmovdqu (%rdi), %xmm1
  445. vmovdqu 16(%rdi), %xmm3
  446. vmovdqa %xmm1, (%rsi)
  447. vmovdqa %xmm3, 16(%rsi)
  448. vmovdqa con1(%rip), %xmm0
  449. vmovdqa mask(%rip), %xmm15
  450. vpxor %xmm14, %xmm14, %xmm14
  451. mov \$6, %rax
  452. .Lks256_loop:
  453. add \$32, %rsi
  454. subq \$1, %rax
  455. vpshufb %xmm15, %xmm3, %xmm2
  456. vaesenclast %xmm0, %xmm2, %xmm2
  457. vpslld \$1, %xmm0, %xmm0
  458. vpsllq \$32, %xmm1, %xmm4
  459. vpxor %xmm4, %xmm1, %xmm1
  460. vpshufb con3(%rip), %xmm1, %xmm4
  461. vpxor %xmm4, %xmm1, %xmm1
  462. vpxor %xmm2, %xmm1, %xmm1
  463. vmovdqa %xmm1, (%rsi)
  464. vpshufd \$0xff, %xmm1, %xmm2
  465. vaesenclast %xmm14, %xmm2, %xmm2
  466. vpsllq \$32, %xmm3, %xmm4
  467. vpxor %xmm4, %xmm3, %xmm3
  468. vpshufb con3(%rip), %xmm3, %xmm4
  469. vpxor %xmm4, %xmm3, %xmm3
  470. vpxor %xmm2, %xmm3, %xmm3
  471. vmovdqa %xmm3, 16(%rsi)
  472. jne .Lks256_loop
  473. vpshufb %xmm15, %xmm3, %xmm2
  474. vaesenclast %xmm0, %xmm2, %xmm2
  475. vpsllq \$32, %xmm1, %xmm4
  476. vpxor %xmm4, %xmm1, %xmm1
  477. vpshufb con3(%rip), %xmm1, %xmm4
  478. vpxor %xmm4, %xmm1, %xmm1
  479. vpxor %xmm2, %xmm1, %xmm1
  480. vmovdqa %xmm1, 32(%rsi)
  481. ret
  482. .cfi_endproc
  483. ___
  484. sub aes128gcmsiv_aes_ks_enc_x1 {
  485. my $KS1_REGA = "%xmm1";
  486. my $KS1_REGB = "%xmm2";
  487. my $BLOCK1 = "%xmm4";
  488. my $AUXREG = "%xmm3";
  489. my $KS_BLOCK = sub {
  490. my ($reg, $reg2, $auxReg) = @_;
  491. return <<___;
  492. vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
  493. vpxor $auxReg, $reg, $reg
  494. vpshufb con3(%rip), $reg, $auxReg
  495. vpxor $auxReg, $reg, $reg
  496. vpxor $reg2, $reg, $reg
  497. ___
  498. };
  499. my $round = sub {
  500. my ($i, $j) = @_;
  501. return <<___;
  502. vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
  503. vaesenclast %xmm0, %xmm2, %xmm2
  504. vpslld \$1, %xmm0, %xmm0
  505. ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
  506. vaesenc %xmm1, $BLOCK1, $BLOCK1
  507. vmovdqa %xmm1, ${\eval(16*$i)}($j)
  508. ___
  509. };
  510. my $roundlast = sub {
  511. my ($i, $j) = @_;
  512. return <<___;
  513. vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
  514. vaesenclast %xmm0, %xmm2, %xmm2
  515. ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
  516. vaesenclast %xmm1, $BLOCK1, $BLOCK1
  517. vmovdqa %xmm1, ${\eval(16*$i)}($j)
  518. ___
  519. };
  520. # parameter 1: %rdi Pointer to PT
  521. # parameter 2: %rsi Pointer to CT
  522. # parameter 3: %rdx Pointer to keys
  523. # parameter 4: %rcx Pointer to initial key
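# This fuses the key expansion with a single-block encryption: it writes the
# same schedule as aes128gcmsiv_aes_ks and the same ciphertext as
# aes128gcmsiv_ecb_enc_block, i.e. conceptually
#
#   aes128gcmsiv_aes_ks(key, ks);
#   aes128gcmsiv_ecb_enc_block(pt, ct, ks);
#
# done in a single pass over the schedule.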
  524. $code.=<<___;
  525. .globl aes128gcmsiv_aes_ks_enc_x1
  526. .type aes128gcmsiv_aes_ks_enc_x1,\@function,4
  527. .align 16
  528. aes128gcmsiv_aes_ks_enc_x1:
  529. .cfi_startproc
  530. vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
  531. vmovdqa 0*16(%rdi), $BLOCK1
  532. vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
  533. vpxor %xmm1, $BLOCK1, $BLOCK1
  534. vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
  535. vmovdqa mask(%rip), %xmm15 # xmm15 = mask
  536. ${\$round->(1, "%rdx")}
  537. ${\$round->(2, "%rdx")}
  538. ${\$round->(3, "%rdx")}
  539. ${\$round->(4, "%rdx")}
  540. ${\$round->(5, "%rdx")}
  541. ${\$round->(6, "%rdx")}
  542. ${\$round->(7, "%rdx")}
  543. ${\$round->(8, "%rdx")}
  544. vmovdqa con2(%rip), %xmm0
  545. ${\$round->(9, "%rdx")}
  546. ${\$roundlast->(10, "%rdx")}
  547. vmovdqa $BLOCK1, 0*16(%rsi)
  548. ret
  549. .cfi_endproc
  550. .size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
  551. ___
  552. }
  553. aes128gcmsiv_aes_ks_enc_x1();
  554. sub aes128gcmsiv_kdf {
  555. my $BLOCK1 = "%xmm9";
  556. my $BLOCK2 = "%xmm10";
  557. my $BLOCK3 = "%xmm11";
  558. my $BLOCK4 = "%xmm12";
  559. my $BLOCK5 = "%xmm13";
  560. my $BLOCK6 = "%xmm14";
  561. my $ONE = "%xmm13";
  562. my $KSp = "%rdx";
  563. my $STATE_1 = "%xmm1";
  564. my $enc_roundx4 = sub {
  565. my ($i, $j) = @_;
  566. return <<___;
  567. vmovdqa ${\eval($i*16)}(%rdx), $j
  568. vaesenc $j, $BLOCK1, $BLOCK1
  569. vaesenc $j, $BLOCK2, $BLOCK2
  570. vaesenc $j, $BLOCK3, $BLOCK3
  571. vaesenc $j, $BLOCK4, $BLOCK4
  572. ___
  573. };
  574. my $enc_roundlastx4 = sub {
  575. my ($i, $j) = @_;
  576. return <<___;
  577. vmovdqa ${\eval($i*16)}(%rdx), $j
  578. vaesenclast $j, $BLOCK1, $BLOCK1
  579. vaesenclast $j, $BLOCK2, $BLOCK2
  580. vaesenclast $j, $BLOCK3, $BLOCK3
  581. vaesenclast $j, $BLOCK4, $BLOCK4
  582. ___
  583. };
  584. # void aes128gcmsiv_kdf(const uint8_t nonce[16],
  585. # uint8_t *out_key_material,
  586. # const uint8_t *key_schedule);
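# The derivation blocks follow RFC 8452: block i (i = 0..3) is the 32-bit
# little-endian counter i followed by the first 12 bytes of the nonce (the
# vpshufd/vpand pair clears the counter lane, vpaddd one(%rip) increments it),
# and each block is encrypted with the supplied AES-128 key schedule:
#
#   out_key_material[16*i .. 16*i + 15] = AES-128(key_schedule, le32(i) || nonce[0..11])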
  587. $code.=<<___;
  588. .globl aes128gcmsiv_kdf
  589. .type aes128gcmsiv_kdf,\@function,3
  590. .align 16
  591. aes128gcmsiv_kdf:
  592. .cfi_startproc
  593. # parameter 1: %rdi Pointer to NONCE
  594. # parameter 2: %rsi Pointer to the output key material
  595. # parameter 3: %rdx Pointer to the key schedule
  596. vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
  597. vmovdqa 0*16(%rdi), $BLOCK1
  598. vmovdqa and_mask(%rip), $BLOCK4
  599. vmovdqa one(%rip), $ONE
  600. vpshufd \$0x90, $BLOCK1, $BLOCK1
  601. vpand $BLOCK4, $BLOCK1, $BLOCK1
  602. vpaddd $ONE, $BLOCK1, $BLOCK2
  603. vpaddd $ONE, $BLOCK2, $BLOCK3
  604. vpaddd $ONE, $BLOCK3, $BLOCK4
  605. vpxor %xmm1, $BLOCK1, $BLOCK1
  606. vpxor %xmm1, $BLOCK2, $BLOCK2
  607. vpxor %xmm1, $BLOCK3, $BLOCK3
  608. vpxor %xmm1, $BLOCK4, $BLOCK4
  609. ${\$enc_roundx4->(1, "%xmm1")}
  610. ${\$enc_roundx4->(2, "%xmm2")}
  611. ${\$enc_roundx4->(3, "%xmm1")}
  612. ${\$enc_roundx4->(4, "%xmm2")}
  613. ${\$enc_roundx4->(5, "%xmm1")}
  614. ${\$enc_roundx4->(6, "%xmm2")}
  615. ${\$enc_roundx4->(7, "%xmm1")}
  616. ${\$enc_roundx4->(8, "%xmm2")}
  617. ${\$enc_roundx4->(9, "%xmm1")}
  618. ${\$enc_roundlastx4->(10, "%xmm2")}
  619. vmovdqa $BLOCK1, 0*16(%rsi)
  620. vmovdqa $BLOCK2, 1*16(%rsi)
  621. vmovdqa $BLOCK3, 2*16(%rsi)
  622. vmovdqa $BLOCK4, 3*16(%rsi)
  623. ret
  624. .cfi_endproc
  625. .size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
  626. ___
  627. }
  628. aes128gcmsiv_kdf();
  629. sub aes128gcmsiv_enc_msg_x4 {
  630. my $CTR1 = "%xmm0";
  631. my $CTR2 = "%xmm1";
  632. my $CTR3 = "%xmm2";
  633. my $CTR4 = "%xmm3";
  634. my $ADDER = "%xmm4";
  635. my $STATE1 = "%xmm5";
  636. my $STATE2 = "%xmm6";
  637. my $STATE3 = "%xmm7";
  638. my $STATE4 = "%xmm8";
  639. my $TMP = "%xmm12";
  640. my $TMP2 = "%xmm13";
  641. my $TMP3 = "%xmm14";
  642. my $IV = "%xmm15";
  643. my $PT = "%rdi";
  644. my $CT = "%rsi";
  645. my $TAG = "%rdx";
  646. my $KS = "%rcx";
  647. my $LEN = "%r8";
  648. my $aes_round = sub {
  649. my ($i) = @_;
  650. return <<___;
  651. vmovdqu ${\eval($i*16)}($KS), $TMP
  652. vaesenc $TMP, $STATE1, $STATE1
  653. vaesenc $TMP, $STATE2, $STATE2
  654. vaesenc $TMP, $STATE3, $STATE3
  655. vaesenc $TMP, $STATE4, $STATE4
  656. ___
  657. };
  658. my $aes_lastround = sub {
  659. my ($i) = @_;
  660. return <<___;
  661. vmovdqu ${\eval($i*16)}($KS), $TMP
  662. vaesenclast $TMP, $STATE1, $STATE1
  663. vaesenclast $TMP, $STATE2, $STATE2
  664. vaesenclast $TMP, $STATE3, $STATE3
  665. vaesenclast $TMP, $STATE4, $STATE4
  666. ___
  667. };
  668. # void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
  669. # unsigned char* TAG, unsigned char* KS,
  670. # size_t byte_len);
  671. # parameter 1: %rdi #PT
  672. # parameter 2: %rsi #CT
  673. # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
  674. # parameter 4: %rcx #KS
  675. # parameter 5: %r8 #LEN MSG_length in bytes
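# The counter blocks are derived from TAG as specified for AES-GCM-SIV: the
# most significant bit of the last byte is forced to one (OR_MASK) and only
# the low 32-bit lane is incremented per block, so this is plain CTR mode over
# those blocks, four at a time:
#
#   CT[j] = PT[j] xor AES-128(KS, counter block with low lane + j mod 2^32)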
  676. $code.=<<___;
  677. .globl aes128gcmsiv_enc_msg_x4
  678. .type aes128gcmsiv_enc_msg_x4,\@function,5
  679. .align 16
  680. aes128gcmsiv_enc_msg_x4:
  681. .cfi_startproc
  682. test $LEN, $LEN
  683. jnz .L128_enc_msg_x4_start
  684. ret
  685. .L128_enc_msg_x4_start:
  686. pushq %r12
  687. .cfi_push %r12
  688. pushq %r13
  689. .cfi_push %r13
  690. shrq \$4, $LEN # LEN = num of blocks
  691. movq $LEN, %r10
  692. shlq \$62, %r10
  693. shrq \$62, %r10
  694. # make IV from TAG
  695. vmovdqa ($TAG), $IV
  696. vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00]
  697. vmovdqu four(%rip), $ADDER # Register to increment counters
  698. vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
  699. vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
  700. vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
  701. vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
  702. shrq \$2, $LEN
  703. je .L128_enc_msg_x4_check_remainder
  704. subq \$64, $CT
  705. subq \$64, $PT
  706. .L128_enc_msg_x4_loop1:
  707. addq \$64, $CT
  708. addq \$64, $PT
  709. vmovdqa $CTR1, $STATE1
  710. vmovdqa $CTR2, $STATE2
  711. vmovdqa $CTR3, $STATE3
  712. vmovdqa $CTR4, $STATE4
  713. vpxor ($KS), $STATE1, $STATE1
  714. vpxor ($KS), $STATE2, $STATE2
  715. vpxor ($KS), $STATE3, $STATE3
  716. vpxor ($KS), $STATE4, $STATE4
  717. ${\$aes_round->(1)}
  718. vpaddd $ADDER, $CTR1, $CTR1
  719. ${\$aes_round->(2)}
  720. vpaddd $ADDER, $CTR2, $CTR2
  721. ${\$aes_round->(3)}
  722. vpaddd $ADDER, $CTR3, $CTR3
  723. ${\$aes_round->(4)}
  724. vpaddd $ADDER, $CTR4, $CTR4
  725. ${\$aes_round->(5)}
  726. ${\$aes_round->(6)}
  727. ${\$aes_round->(7)}
  728. ${\$aes_round->(8)}
  729. ${\$aes_round->(9)}
  730. ${\$aes_lastround->(10)}
  731. # XOR with Plaintext
  732. vpxor 0*16($PT), $STATE1, $STATE1
  733. vpxor 1*16($PT), $STATE2, $STATE2
  734. vpxor 2*16($PT), $STATE3, $STATE3
  735. vpxor 3*16($PT), $STATE4, $STATE4
  736. subq \$1, $LEN
  737. vmovdqu $STATE1, 0*16($CT)
  738. vmovdqu $STATE2, 1*16($CT)
  739. vmovdqu $STATE3, 2*16($CT)
  740. vmovdqu $STATE4, 3*16($CT)
  741. jne .L128_enc_msg_x4_loop1
  742. addq \$64,$CT
  743. addq \$64,$PT
  744. .L128_enc_msg_x4_check_remainder:
  745. cmpq \$0, %r10
  746. je .L128_enc_msg_x4_out
  747. .L128_enc_msg_x4_loop2:
  748. # encrypt each block separately
  749. # CTR1 is the highest counter (even if no LOOP done)
  750. vmovdqa $CTR1, $STATE1
  751. vpaddd one(%rip), $CTR1, $CTR1 # inc counter
  752. vpxor ($KS), $STATE1, $STATE1
  753. vaesenc 16($KS), $STATE1, $STATE1
  754. vaesenc 32($KS), $STATE1, $STATE1
  755. vaesenc 48($KS), $STATE1, $STATE1
  756. vaesenc 64($KS), $STATE1, $STATE1
  757. vaesenc 80($KS), $STATE1, $STATE1
  758. vaesenc 96($KS), $STATE1, $STATE1
  759. vaesenc 112($KS), $STATE1, $STATE1
  760. vaesenc 128($KS), $STATE1, $STATE1
  761. vaesenc 144($KS), $STATE1, $STATE1
  762. vaesenclast 160($KS), $STATE1, $STATE1
  763. # XOR with plaintext
  764. vpxor ($PT), $STATE1, $STATE1
  765. vmovdqu $STATE1, ($CT)
  766. addq \$16, $PT
  767. addq \$16, $CT
  768. subq \$1, %r10
  769. jne .L128_enc_msg_x4_loop2
  770. .L128_enc_msg_x4_out:
  771. popq %r13
  772. .cfi_pop %r13
  773. popq %r12
  774. .cfi_pop %r12
  775. ret
  776. .cfi_endproc
  777. .size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
  778. ___
  779. }
  780. aes128gcmsiv_enc_msg_x4();
  781. sub aes128gcmsiv_enc_msg_x8 {
  782. my $STATE1 = "%xmm1";
  783. my $STATE2 = "%xmm2";
  784. my $STATE3 = "%xmm3";
  785. my $STATE4 = "%xmm4";
  786. my $STATE5 = "%xmm5";
  787. my $STATE6 = "%xmm6";
  788. my $STATE7 = "%xmm7";
  789. my $STATE8 = "%xmm8";
  790. my $CTR1 = "%xmm0";
  791. my $CTR2 = "%xmm9";
  792. my $CTR3 = "%xmm10";
  793. my $CTR4 = "%xmm11";
  794. my $CTR5 = "%xmm12";
  795. my $CTR6 = "%xmm13";
  796. my $CTR7 = "%xmm14";
  797. my $SCHED = "%xmm15";
  798. my $TMP1 = "%xmm1";
  799. my $TMP2 = "%xmm2";
  800. my $PT = "%rdi";
  801. my $CT = "%rsi";
  802. my $TAG = "%rdx";
  803. my $KS = "%rcx";
  804. my $LEN = "%r8";
  805. my $aes_round8 = sub {
  806. my ($i) = @_;
  807. return <<___;
  808. vmovdqu ${\eval($i*16)}($KS), $SCHED
  809. vaesenc $SCHED, $STATE1, $STATE1
  810. vaesenc $SCHED, $STATE2, $STATE2
  811. vaesenc $SCHED, $STATE3, $STATE3
  812. vaesenc $SCHED, $STATE4, $STATE4
  813. vaesenc $SCHED, $STATE5, $STATE5
  814. vaesenc $SCHED, $STATE6, $STATE6
  815. vaesenc $SCHED, $STATE7, $STATE7
  816. vaesenc $SCHED, $STATE8, $STATE8
  817. ___
  818. };
  819. my $aes_lastround8 = sub {
  820. my ($i) = @_;
  821. return <<___;
  822. vmovdqu ${\eval($i*16)}($KS), $SCHED
  823. vaesenclast $SCHED, $STATE1, $STATE1
  824. vaesenclast $SCHED, $STATE2, $STATE2
  825. vaesenclast $SCHED, $STATE3, $STATE3
  826. vaesenclast $SCHED, $STATE4, $STATE4
  827. vaesenclast $SCHED, $STATE5, $STATE5
  828. vaesenclast $SCHED, $STATE6, $STATE6
  829. vaesenclast $SCHED, $STATE7, $STATE7
  830. vaesenclast $SCHED, $STATE8, $STATE8
  831. ___
  832. };
  833. # void ENC_MSG_x8(unsigned char* PT,
  834. # unsigned char* CT,
  835. # unsigned char* TAG,
  836. # unsigned char* KS,
  837. # size_t byte_len);
  838. # parameter 1: %rdi #PT
  839. # parameter 2: %rsi #CT
  840. # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
  841. # parameter 4: %rcx #KS
  842. # parameter 5: %r8 #LEN MSG_length in bytes
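# Same CTR construction as aes128gcmsiv_enc_msg_x4 above, but eight blocks per
# iteration; the eighth counter block is kept in stack scratch space because
# it does not fit in the available XMM registers.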
  843. $code.=<<___;
  844. .globl aes128gcmsiv_enc_msg_x8
  845. .type aes128gcmsiv_enc_msg_x8,\@function,5
  846. .align 16
  847. aes128gcmsiv_enc_msg_x8:
  848. .cfi_startproc
  849. test $LEN, $LEN
  850. jnz .L128_enc_msg_x8_start
  851. ret
  852. .L128_enc_msg_x8_start:
  853. pushq %r12
  854. .cfi_push %r12
  855. pushq %r13
  856. .cfi_push %r13
  857. pushq %rbp
  858. .cfi_push %rbp
  859. movq %rsp, %rbp
  860. .cfi_def_cfa_register %rbp
  861. # Allocate aligned scratch space on the stack
  862. subq \$128, %rsp
  863. andq \$-64, %rsp
  864. shrq \$4, $LEN # LEN = num of blocks
  865. movq $LEN, %r10
  866. shlq \$61, %r10
  867. shrq \$61, %r10
  868. # make IV from TAG
  869. vmovdqu ($TAG), $TMP1
  870. vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
  871. # store the eighth counter block on the stack
  872. vpaddd seven(%rip), $TMP1, $CTR1
  873. vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
  874. vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
  875. vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
  876. vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
  877. vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
  878. vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
  879. vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
  880. vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
  881. shrq \$3, $LEN
  882. je .L128_enc_msg_x8_check_remainder
  883. subq \$128, $CT
  884. subq \$128, $PT
  885. .L128_enc_msg_x8_loop1:
  886. addq \$128, $CT
  887. addq \$128, $PT
  888. vmovdqa $CTR1, $STATE1
  889. vmovdqa $CTR2, $STATE2
  890. vmovdqa $CTR3, $STATE3
  891. vmovdqa $CTR4, $STATE4
  892. vmovdqa $CTR5, $STATE5
  893. vmovdqa $CTR6, $STATE6
  894. vmovdqa $CTR7, $STATE7
  895. # move from stack
  896. vmovdqu (%rsp), $STATE8
  897. vpxor ($KS), $STATE1, $STATE1
  898. vpxor ($KS), $STATE2, $STATE2
  899. vpxor ($KS), $STATE3, $STATE3
  900. vpxor ($KS), $STATE4, $STATE4
  901. vpxor ($KS), $STATE5, $STATE5
  902. vpxor ($KS), $STATE6, $STATE6
  903. vpxor ($KS), $STATE7, $STATE7
  904. vpxor ($KS), $STATE8, $STATE8
  905. ${\$aes_round8->(1)}
  906. vmovdqu (%rsp), $CTR7 # deal with CTR8
  907. vpaddd eight(%rip), $CTR7, $CTR7
  908. vmovdqu $CTR7, (%rsp)
  909. ${\$aes_round8->(2)}
  910. vpsubd one(%rip), $CTR7, $CTR7
  911. ${\$aes_round8->(3)}
  912. vpaddd eight(%rip), $CTR1, $CTR1
  913. ${\$aes_round8->(4)}
  914. vpaddd eight(%rip), $CTR2, $CTR2
  915. ${\$aes_round8->(5)}
  916. vpaddd eight(%rip), $CTR3, $CTR3
  917. ${\$aes_round8->(6)}
  918. vpaddd eight(%rip), $CTR4, $CTR4
  919. ${\$aes_round8->(7)}
  920. vpaddd eight(%rip), $CTR5, $CTR5
  921. ${\$aes_round8->(8)}
  922. vpaddd eight(%rip), $CTR6, $CTR6
  923. ${\$aes_round8->(9)}
  924. ${\$aes_lastround8->(10)}
  925. # XOR with Plaintext
  926. vpxor 0*16($PT), $STATE1, $STATE1
  927. vpxor 1*16($PT), $STATE2, $STATE2
  928. vpxor 2*16($PT), $STATE3, $STATE3
  929. vpxor 3*16($PT), $STATE4, $STATE4
  930. vpxor 4*16($PT), $STATE5, $STATE5
  931. vpxor 5*16($PT), $STATE6, $STATE6
  932. vpxor 6*16($PT), $STATE7, $STATE7
  933. vpxor 7*16($PT), $STATE8, $STATE8
  934. dec $LEN
  935. vmovdqu $STATE1, 0*16($CT)
  936. vmovdqu $STATE2, 1*16($CT)
  937. vmovdqu $STATE3, 2*16($CT)
  938. vmovdqu $STATE4, 3*16($CT)
  939. vmovdqu $STATE5, 4*16($CT)
  940. vmovdqu $STATE6, 5*16($CT)
  941. vmovdqu $STATE7, 6*16($CT)
  942. vmovdqu $STATE8, 7*16($CT)
  943. jne .L128_enc_msg_x8_loop1
  944. addq \$128, $CT
  945. addq \$128, $PT
  946. .L128_enc_msg_x8_check_remainder:
  947. cmpq \$0, %r10
  948. je .L128_enc_msg_x8_out
  949. .L128_enc_msg_x8_loop2:
  950. # encrypt each block separately
  951. # CTR1 is the highest counter (even if no LOOP done)
  952. vmovdqa $CTR1, $STATE1
  953. vpaddd one(%rip), $CTR1, $CTR1 # inc counter
  954. vpxor ($KS), $STATE1, $STATE1
  955. vaesenc 16($KS), $STATE1, $STATE1
  956. vaesenc 32($KS), $STATE1, $STATE1
  957. vaesenc 48($KS), $STATE1, $STATE1
  958. vaesenc 64($KS), $STATE1, $STATE1
  959. vaesenc 80($KS), $STATE1, $STATE1
  960. vaesenc 96($KS), $STATE1, $STATE1
  961. vaesenc 112($KS), $STATE1, $STATE1
  962. vaesenc 128($KS), $STATE1, $STATE1
  963. vaesenc 144($KS), $STATE1, $STATE1
  964. vaesenclast 160($KS), $STATE1, $STATE1
  965. # XOR with Plaintext
  966. vpxor ($PT), $STATE1, $STATE1
  967. vmovdqu $STATE1, ($CT)
  968. addq \$16, $PT
  969. addq \$16, $CT
  970. decq %r10
  971. jne .L128_enc_msg_x8_loop2
  972. .L128_enc_msg_x8_out:
  973. movq %rbp, %rsp
  974. .cfi_def_cfa_register %rsp
  975. popq %rbp
  976. .cfi_pop %rbp
  977. popq %r13
  978. .cfi_pop %r13
  979. popq %r12
  980. .cfi_pop %r12
  981. ret
  982. .cfi_endproc
  983. .size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
  984. ___
  985. }
  986. aes128gcmsiv_enc_msg_x8();
  987. sub aesgcmsiv_dec {
  988. my ($aes256) = @_;
  989. my $T = "%xmm0";
  990. my $TMP0 = "%xmm1";
  991. my $TMP1 = "%xmm2";
  992. my $TMP2 = "%xmm3";
  993. my $TMP3 = "%xmm4";
  994. my $TMP4 = "%xmm5";
  995. my $TMP5 = "%xmm6";
  996. my $CTR1 = "%xmm7";
  997. my $CTR2 = "%xmm8";
  998. my $CTR3 = "%xmm9";
  999. my $CTR4 = "%xmm10";
  1000. my $CTR5 = "%xmm11";
  1001. my $CTR6 = "%xmm12";
  1002. my $CTR = "%xmm15";
  1003. my $CT = "%rdi";
  1004. my $PT = "%rsi";
  1005. my $POL = "%rdx";
  1006. my $Htbl = "%rcx";
  1007. my $KS = "%r8";
  1008. my $LEN = "%r9";
  1009. my $secureBuffer = "%rax";
  1010. my $HTABLE_ROUNDS = "%xmm13";
  1011. my $labelPrefix = "128";
  1012. if ($aes256) {
  1013. $labelPrefix = "256";
  1014. }
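# Register/parameter layout (inferred from the code below):
# parameter 1: %rdi CT - pointer to the ciphertext
# parameter 2: %rsi PT - pointer to the plaintext output
# parameter 3: %rdx POL - pointer to the running POLYVAL value, updated in
#              place; the bytes following it are used as scratch space
# parameter 4: %rcx Htbl - pointer to the table of powers of H
# parameter 5: %r8 KS - pointer to the expanded key schedule
# parameter 6: %r9 LEN - length in bytes (only whole blocks are processed)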
  1015. my $aes_round_dec = sub {
  1016. my ($i) = @_;
  1017. return <<___;
  1018. vmovdqu ${\eval($i*16)}($KS), $TMP3
  1019. vaesenc $TMP3, $CTR1, $CTR1
  1020. vaesenc $TMP3, $CTR2, $CTR2
  1021. vaesenc $TMP3, $CTR3, $CTR3
  1022. vaesenc $TMP3, $CTR4, $CTR4
  1023. vaesenc $TMP3, $CTR5, $CTR5
  1024. vaesenc $TMP3, $CTR6, $CTR6
  1025. ___
  1026. };
  1027. my $aes_lastround_dec = sub {
  1028. my ($i) = @_;
  1029. return <<___;
  1030. vmovdqu ${\eval($i*16)}($KS), $TMP3
  1031. vaesenclast $TMP3, $CTR1, $CTR1
  1032. vaesenclast $TMP3, $CTR2, $CTR2
  1033. vaesenclast $TMP3, $CTR3, $CTR3
  1034. vaesenclast $TMP3, $CTR4, $CTR4
  1035. vaesenclast $TMP3, $CTR5, $CTR5
  1036. vaesenclast $TMP3, $CTR6, $CTR6
  1037. ___
  1038. };
  1039. my $schoolbook = sub {
  1040. my ($i) = @_;
  1041. return <<___;
  1042. vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
  1043. vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
  1044. vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
  1045. vpxor $TMP3, $TMP0, $TMP0
  1046. vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
  1047. vpxor $TMP3, $TMP1, $TMP1
  1048. vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
  1049. vpxor $TMP3, $TMP2, $TMP2
  1050. vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
  1051. vpxor $TMP3, $TMP0, $TMP0
  1052. ___
  1053. };
  1054. if ($aes256) {
  1055. $code.=<<___;
  1056. .globl aes256gcmsiv_dec
  1057. .type aes256gcmsiv_dec,\@function,6
  1058. .align 16
  1059. aes256gcmsiv_dec:
  1060. ___
  1061. } else {
  1062. $code.=<<___;
  1063. .globl aes128gcmsiv_dec
  1064. .type aes128gcmsiv_dec,\@function,6
  1065. .align 16
  1066. aes128gcmsiv_dec:
  1067. ___
  1068. }
  1069. $code.=<<___;
  1070. .cfi_startproc
  1071. test \$~15, $LEN
  1072. jnz .L${labelPrefix}_dec_start
  1073. ret
  1074. .L${labelPrefix}_dec_start:
  1075. vzeroupper
  1076. vmovdqa ($POL), $T
  1077. movq $POL, $secureBuffer
  1078. leaq 32($secureBuffer), $secureBuffer
  1079. leaq 32($Htbl), $Htbl
  1080. # make CTRBLKs from given tag.
  1081. vmovdqu ($CT,$LEN), $CTR
  1082. vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
  1083. andq \$~15, $LEN
  1084. # If fewer than 6 blocks remain, process them one at a time
  1085. cmp \$96, $LEN
  1086. jb .L${labelPrefix}_dec_loop2
  1087. # Decrypt the first six blocks
  1088. sub \$96, $LEN
  1089. vmovdqa $CTR, $CTR1
  1090. vpaddd one(%rip), $CTR1, $CTR2
  1091. vpaddd two(%rip), $CTR1, $CTR3
  1092. vpaddd one(%rip), $CTR3, $CTR4
  1093. vpaddd two(%rip), $CTR3, $CTR5
  1094. vpaddd one(%rip), $CTR5, $CTR6
  1095. vpaddd two(%rip), $CTR5, $CTR
  1096. vpxor ($KS), $CTR1, $CTR1
  1097. vpxor ($KS), $CTR2, $CTR2
  1098. vpxor ($KS), $CTR3, $CTR3
  1099. vpxor ($KS), $CTR4, $CTR4
  1100. vpxor ($KS), $CTR5, $CTR5
  1101. vpxor ($KS), $CTR6, $CTR6
  1102. ${\$aes_round_dec->(1)}
  1103. ${\$aes_round_dec->(2)}
  1104. ${\$aes_round_dec->(3)}
  1105. ${\$aes_round_dec->(4)}
  1106. ${\$aes_round_dec->(5)}
  1107. ${\$aes_round_dec->(6)}
  1108. ${\$aes_round_dec->(7)}
  1109. ${\$aes_round_dec->(8)}
  1110. ${\$aes_round_dec->(9)}
  1111. ___
  1112. if ($aes256) {
  1113. $code.=<<___;
  1114. ${\$aes_round_dec->(10)}
  1115. ${\$aes_round_dec->(11)}
  1116. ${\$aes_round_dec->(12)}
  1117. ${\$aes_round_dec->(13)}
  1118. ${\$aes_lastround_dec->(14)}
  1119. ___
  1120. } else {
  1121. $code.=<<___;
  1122. ${\$aes_lastround_dec->(10)}
  1123. ___
  1124. }
  1125. $code.=<<___;
  1126. # XOR with CT
  1127. vpxor 0*16($CT), $CTR1, $CTR1
  1128. vpxor 1*16($CT), $CTR2, $CTR2
  1129. vpxor 2*16($CT), $CTR3, $CTR3
  1130. vpxor 3*16($CT), $CTR4, $CTR4
  1131. vpxor 4*16($CT), $CTR5, $CTR5
  1132. vpxor 5*16($CT), $CTR6, $CTR6
  1133. vmovdqu $CTR1, 0*16($PT)
  1134. vmovdqu $CTR2, 1*16($PT)
  1135. vmovdqu $CTR3, 2*16($PT)
  1136. vmovdqu $CTR4, 3*16($PT)
  1137. vmovdqu $CTR5, 4*16($PT)
  1138. vmovdqu $CTR6, 5*16($PT)
  1139. addq \$96, $CT
  1140. addq \$96, $PT
  1141. jmp .L${labelPrefix}_dec_loop1
  1142. # Decrypt 6 blocks each time while hashing previous 6 blocks
  1143. .align 64
  1144. .L${labelPrefix}_dec_loop1:
  1145. cmp \$96, $LEN
  1146. jb .L${labelPrefix}_dec_finish_96
  1147. sub \$96, $LEN
  1148. vmovdqa $CTR6, $TMP5
  1149. vmovdqa $CTR5, 1*16-32($secureBuffer)
  1150. vmovdqa $CTR4, 2*16-32($secureBuffer)
  1151. vmovdqa $CTR3, 3*16-32($secureBuffer)
  1152. vmovdqa $CTR2, 4*16-32($secureBuffer)
  1153. vmovdqa $CTR1, 5*16-32($secureBuffer)
  1154. vmovdqa $CTR, $CTR1
  1155. vpaddd one(%rip), $CTR1, $CTR2
  1156. vpaddd two(%rip), $CTR1, $CTR3
  1157. vpaddd one(%rip), $CTR3, $CTR4
  1158. vpaddd two(%rip), $CTR3, $CTR5
  1159. vpaddd one(%rip), $CTR5, $CTR6
  1160. vpaddd two(%rip), $CTR5, $CTR
  1161. vmovdqa ($KS), $TMP3
  1162. vpxor $TMP3, $CTR1, $CTR1
  1163. vpxor $TMP3, $CTR2, $CTR2
  1164. vpxor $TMP3, $CTR3, $CTR3
  1165. vpxor $TMP3, $CTR4, $CTR4
  1166. vpxor $TMP3, $CTR5, $CTR5
  1167. vpxor $TMP3, $CTR6, $CTR6
  1168. vmovdqu 0*16-32($Htbl), $TMP3
  1169. vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
  1170. vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
  1171. vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
  1172. vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
  1173. vpxor $TMP3, $TMP0, $TMP0
  1174. ${\$aes_round_dec->(1)}
  1175. ${\$schoolbook->(1)}
  1176. ${\$aes_round_dec->(2)}
  1177. ${\$schoolbook->(2)}
  1178. ${\$aes_round_dec->(3)}
  1179. ${\$schoolbook->(3)}
  1180. ${\$aes_round_dec->(4)}
  1181. ${\$schoolbook->(4)}
  1182. ${\$aes_round_dec->(5)}
  1183. ${\$aes_round_dec->(6)}
  1184. ${\$aes_round_dec->(7)}
  1185. vmovdqa 5*16-32($secureBuffer), $TMP5
  1186. vpxor $T, $TMP5, $TMP5
  1187. vmovdqu 5*16-32($Htbl), $TMP4
  1188. vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
  1189. vpxor $TMP3, $TMP0, $TMP0
  1190. vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
  1191. vpxor $TMP3, $TMP1, $TMP1
  1192. vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
  1193. vpxor $TMP3, $TMP2, $TMP2
  1194. vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
  1195. vpxor $TMP3, $TMP0, $TMP0
  1196. ${\$aes_round_dec->(8)}
  1197. vpsrldq \$8, $TMP0, $TMP3
  1198. vpxor $TMP3, $TMP1, $TMP4
  1199. vpslldq \$8, $TMP0, $TMP3
  1200. vpxor $TMP3, $TMP2, $T
  1201. vmovdqa poly(%rip), $TMP2
  1202. ${\$aes_round_dec->(9)}
  1203. ___
  1204. if ($aes256) {
  1205. $code.=<<___;
  1206. ${\$aes_round_dec->(10)}
  1207. ${\$aes_round_dec->(11)}
  1208. ${\$aes_round_dec->(12)}
  1209. ${\$aes_round_dec->(13)}
  1210. vmovdqu 14*16($KS), $TMP5
  1211. ___
  1212. } else {
  1213. $code.=<<___;
  1214. vmovdqu 10*16($KS), $TMP5
  1215. ___
  1216. }
  1217. $code.=<<___;
  1218. vpalignr \$8, $T, $T, $TMP1
  1219. vpclmulqdq \$0x10, $TMP2, $T, $T
  1220. vpxor $T, $TMP1, $T
  1221. vpxor 0*16($CT), $TMP5, $TMP3
  1222. vaesenclast $TMP3, $CTR1, $CTR1
  1223. vpxor 1*16($CT), $TMP5, $TMP3
  1224. vaesenclast $TMP3, $CTR2, $CTR2
  1225. vpxor 2*16($CT), $TMP5, $TMP3
  1226. vaesenclast $TMP3, $CTR3, $CTR3
  1227. vpxor 3*16($CT), $TMP5, $TMP3
  1228. vaesenclast $TMP3, $CTR4, $CTR4
  1229. vpxor 4*16($CT), $TMP5, $TMP3
  1230. vaesenclast $TMP3, $CTR5, $CTR5
  1231. vpxor 5*16($CT), $TMP5, $TMP3
  1232. vaesenclast $TMP3, $CTR6, $CTR6
  1233. vpalignr \$8, $T, $T, $TMP1
  1234. vpclmulqdq \$0x10, $TMP2, $T, $T
  1235. vpxor $T, $TMP1, $T
  1236. vmovdqu $CTR1, 0*16($PT)
  1237. vmovdqu $CTR2, 1*16($PT)
  1238. vmovdqu $CTR3, 2*16($PT)
  1239. vmovdqu $CTR4, 3*16($PT)
  1240. vmovdqu $CTR5, 4*16($PT)
  1241. vmovdqu $CTR6, 5*16($PT)
  1242. vpxor $TMP4, $T, $T
  1243. lea 96($CT), $CT
  1244. lea 96($PT), $PT
  1245. jmp .L${labelPrefix}_dec_loop1
  1246. .L${labelPrefix}_dec_finish_96:
  1247. vmovdqa $CTR6, $TMP5
  1248. vmovdqa $CTR5, 1*16-32($secureBuffer)
  1249. vmovdqa $CTR4, 2*16-32($secureBuffer)
  1250. vmovdqa $CTR3, 3*16-32($secureBuffer)
  1251. vmovdqa $CTR2, 4*16-32($secureBuffer)
  1252. vmovdqa $CTR1, 5*16-32($secureBuffer)
  1253. vmovdqu 0*16-32($Htbl), $TMP3
  1254. vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
  1255. vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
  1256. vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
  1257. vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
  1258. vpxor $TMP3, $TMP0, $TMP0
  1259. ${\$schoolbook->(1)}
  1260. ${\$schoolbook->(2)}
  1261. ${\$schoolbook->(3)}
  1262. ${\$schoolbook->(4)}
  1263. vmovdqu 5*16-32($secureBuffer), $TMP5
  1264. vpxor $T, $TMP5, $TMP5
  1265. vmovdqu 5*16-32($Htbl), $TMP4
  1266. vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
  1267. vpxor $TMP3, $TMP1, $TMP1
  1268. vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
  1269. vpxor $TMP3, $TMP2, $TMP2
  1270. vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
  1271. vpxor $TMP3, $TMP0, $TMP0
  1272. vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
  1273. vpxor $TMP3, $TMP0, $TMP0
  1274. vpsrldq \$8, $TMP0, $TMP3
  1275. vpxor $TMP3, $TMP1, $TMP4
  1276. vpslldq \$8, $TMP0, $TMP3
  1277. vpxor $TMP3, $TMP2, $T
  1278. vmovdqa poly(%rip), $TMP2
  1279. vpalignr \$8, $T, $T, $TMP1
  1280. vpclmulqdq \$0x10, $TMP2, $T, $T
  1281. vpxor $T, $TMP1, $T
  1282. vpalignr \$8, $T, $T, $TMP1
  1283. vpclmulqdq \$0x10, $TMP2, $T, $T
  1284. vpxor $T, $TMP1, $T
  1285. vpxor $TMP4, $T, $T
  1286. .L${labelPrefix}_dec_loop2:
  1287. # Decrypt any remaining whole blocks one at a time: encrypt the counter,
  1288. # XOR with the ciphertext, and fold the plaintext block into the POLYVAL state.
  1289. cmp \$16, $LEN
  1290. jb .L${labelPrefix}_dec_out
  1291. sub \$16, $LEN
  1292. vmovdqa $CTR, $TMP1
  1293. vpaddd one(%rip), $CTR, $CTR
  1294. vpxor 0*16($KS), $TMP1, $TMP1
  1295. vaesenc 1*16($KS), $TMP1, $TMP1
  1296. vaesenc 2*16($KS), $TMP1, $TMP1
  1297. vaesenc 3*16($KS), $TMP1, $TMP1
  1298. vaesenc 4*16($KS), $TMP1, $TMP1
  1299. vaesenc 5*16($KS), $TMP1, $TMP1
  1300. vaesenc 6*16($KS), $TMP1, $TMP1
  1301. vaesenc 7*16($KS), $TMP1, $TMP1
  1302. vaesenc 8*16($KS), $TMP1, $TMP1
  1303. vaesenc 9*16($KS), $TMP1, $TMP1
  1304. ___
  1305. if ($aes256) {
  1306. $code.=<<___;
  1307. vaesenc 10*16($KS), $TMP1, $TMP1
  1308. vaesenc 11*16($KS), $TMP1, $TMP1
  1309. vaesenc 12*16($KS), $TMP1, $TMP1
  1310. vaesenc 13*16($KS), $TMP1, $TMP1
  1311. vaesenclast 14*16($KS), $TMP1, $TMP1
  1312. ___
  1313. } else {
  1314. $code.=<<___;
  1315. vaesenclast 10*16($KS), $TMP1, $TMP1
  1316. ___
  1317. }
  1318. $code.=<<___;
  1319. vpxor ($CT), $TMP1, $TMP1
  1320. vmovdqu $TMP1, ($PT)
  1321. addq \$16, $CT
  1322. addq \$16, $PT
  1323. vpxor $TMP1, $T, $T
  1324. vmovdqa -32($Htbl), $TMP0
  1325. call GFMUL
  1326. jmp .L${labelPrefix}_dec_loop2
  1327. .L${labelPrefix}_dec_out:
  1328. vmovdqu $T, ($POL)
  1329. ret
  1330. .cfi_endproc
  1331. ___
  1332. if ($aes256) {
  1333. $code.=<<___;
  1334. .size aes256gcmsiv_dec, .-aes256gcmsiv_dec
  1335. ___
  1336. } else {
  1337. $code.=<<___;
  1338. .size aes128gcmsiv_dec, .-aes128gcmsiv_dec
  1339. ___
  1340. }
  1341. }
  1342. aesgcmsiv_dec(0); # emit 128-bit version
  1343. sub aes128gcmsiv_ecb_enc_block {
  1344. my $STATE_1 = "%xmm1";
  1345. my $KSp = "%rdx";
  1346. # parameter 1: PT %rdi (pointer to 128 bit)
  1347. # parameter 2: CT %rsi (pointer to 128 bit)
  1348. # parameter 3: ks %rdx (pointer to ks)
  1349. $code.=<<___;
  1350. .globl aes128gcmsiv_ecb_enc_block
  1351. .type aes128gcmsiv_ecb_enc_block,\@function,3
  1352. .align 16
  1353. aes128gcmsiv_ecb_enc_block:
  1354. .cfi_startproc
  1355. vmovdqa (%rdi), $STATE_1
  1356. vpxor ($KSp), $STATE_1, $STATE_1
  1357. vaesenc 1*16($KSp), $STATE_1, $STATE_1
  1358. vaesenc 2*16($KSp), $STATE_1, $STATE_1
  1359. vaesenc 3*16($KSp), $STATE_1, $STATE_1
  1360. vaesenc 4*16($KSp), $STATE_1, $STATE_1
  1361. vaesenc 5*16($KSp), $STATE_1, $STATE_1
  1362. vaesenc 6*16($KSp), $STATE_1, $STATE_1
  1363. vaesenc 7*16($KSp), $STATE_1, $STATE_1
  1364. vaesenc 8*16($KSp), $STATE_1, $STATE_1
  1365. vaesenc 9*16($KSp), $STATE_1, $STATE_1
  1366. vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV
  1367. vmovdqa $STATE_1, (%rsi)
  1368. ret
  1369. .cfi_endproc
  1370. .size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
  1371. ___
  1372. }
  1373. aes128gcmsiv_ecb_enc_block();
  1374. sub aes256gcmsiv_aes_ks_enc_x1 {
  1375. my $KS = "%rdx";
  1376. my $KEYp = "%rcx";
  1377. my $CON_MASK = "%xmm0";
  1378. my $MASK_256 = "%xmm15";
  1379. my $KEY_1 = "%xmm1";
  1380. my $KEY_2 = "%xmm3";
  1381. my $BLOCK1 = "%xmm8";
  1382. my $AUX_REG = "%xmm14";
  1383. my $PT = "%rdi";
  1384. my $CT = "%rsi";
  1385. my $round_double = sub {
  1386. my ($i, $j) = @_;
  1387. return <<___;
  1388. vpshufb %xmm15, %xmm3, %xmm2
  1389. vaesenclast %xmm0, %xmm2, %xmm2
  1390. vpslld \$1, %xmm0, %xmm0
  1391. vpslldq \$4, %xmm1, %xmm4
  1392. vpxor %xmm4, %xmm1, %xmm1
  1393. vpslldq \$4, %xmm4, %xmm4
  1394. vpxor %xmm4, %xmm1, %xmm1
  1395. vpslldq \$4, %xmm4, %xmm4
  1396. vpxor %xmm4, %xmm1, %xmm1
  1397. vpxor %xmm2, %xmm1, %xmm1
  1398. vaesenc %xmm1, $BLOCK1, $BLOCK1
  1399. vmovdqu %xmm1, ${\eval(16*$i)}($KS)
  1400. vpshufd \$0xff, %xmm1, %xmm2
  1401. vaesenclast %xmm14, %xmm2, %xmm2
  1402. vpslldq \$4, %xmm3, %xmm4
  1403. vpxor %xmm4, %xmm3, %xmm3
  1404. vpslldq \$4, %xmm4, %xmm4
  1405. vpxor %xmm4, %xmm3, %xmm3
  1406. vpslldq \$4, %xmm4, %xmm4
  1407. vpxor %xmm4, %xmm3, %xmm3
  1408. vpxor %xmm2, %xmm3, %xmm3
  1409. vaesenc %xmm3, $BLOCK1, $BLOCK1
  1410. vmovdqu %xmm3, ${\eval(16*$j)}($KS)
  1411. ___
  1412. };
  1413. my $round_last = sub {
  1414. my ($i) = @_;
  1415. return <<___;
  1416. vpshufb %xmm15, %xmm3, %xmm2
  1417. vaesenclast %xmm0, %xmm2, %xmm2
  1418. vpslldq \$4, %xmm1, %xmm4
  1419. vpxor %xmm4, %xmm1, %xmm1
  1420. vpslldq \$4, %xmm4, %xmm4
  1421. vpxor %xmm4, %xmm1, %xmm1
  1422. vpslldq \$4, %xmm4, %xmm4
  1423. vpxor %xmm4, %xmm1, %xmm1
  1424. vpxor %xmm2, %xmm1, %xmm1
  1425. vaesenclast %xmm1, $BLOCK1, $BLOCK1
  1426. vmovdqu %xmm1, ${\eval(16*$i)}($KS)
  1427. ___
  1428. };
  1429. # parameter 1: %rdi Pointer to PT1
  1430. # parameter 2: %rsi Pointer to CT1
  1431. # parameter 3: %rdx Pointer to KS
  1432. # parameter 4: %rcx Pointer to initial key
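# As with aes128gcmsiv_aes_ks_enc_x1, this fuses the (here 14-round, AES-256)
# key expansion with a single-block encryption of PT1, writing both the
# schedule and the ciphertext in one pass.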
  1433. $code.=<<___;
  1434. .globl aes256gcmsiv_aes_ks_enc_x1
  1435. .type aes256gcmsiv_aes_ks_enc_x1,\@function,4
  1436. .align 16
  1437. aes256gcmsiv_aes_ks_enc_x1:
  1438. .cfi_startproc
  1439. vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
  1440. vmovdqa mask(%rip), $MASK_256 # MASK_256
  1441. vmovdqa ($PT), $BLOCK1
  1442. vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
  1443. vmovdqa 16($KEYp), $KEY_2
  1444. vpxor $KEY_1, $BLOCK1, $BLOCK1
  1445. vaesenc $KEY_2, $BLOCK1, $BLOCK1
  1446. vmovdqu $KEY_1, ($KS) # First round key
  1447. vmovdqu $KEY_2, 16($KS)
  1448. vpxor $AUX_REG, $AUX_REG, $AUX_REG
  1449. ${\$round_double->(2, 3)}
  1450. ${\$round_double->(4, 5)}
  1451. ${\$round_double->(6, 7)}
  1452. ${\$round_double->(8, 9)}
  1453. ${\$round_double->(10, 11)}
  1454. ${\$round_double->(12, 13)}
  1455. ${\$round_last->(14)}
  1456. vmovdqa $BLOCK1, ($CT)
  1457. ret
  1458. .cfi_endproc
  1459. .size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
  1460. ___
  1461. }
  1462. aes256gcmsiv_aes_ks_enc_x1();
  1463. sub aes256gcmsiv_ecb_enc_block {
  1464. my $STATE_1 = "%xmm1";
  1465. my $PT = "%rdi";
  1466. my $CT = "%rsi";
  1467. my $KSp = "%rdx";
  1468. # parameter 1: PT %rdi (pointer to 128 bit)
  1469. # parameter 2: CT %rsi (pointer to 128 bit)
  1470. # parameter 3: ks %rdx (pointer to ks)
  1471. $code.=<<___;
  1472. .globl aes256gcmsiv_ecb_enc_block
  1473. .type aes256gcmsiv_ecb_enc_block,\@function,3
  1474. .align 16
  1475. aes256gcmsiv_ecb_enc_block:
  1476. .cfi_startproc
  1477. vmovdqa (%rdi), $STATE_1
  1478. vpxor ($KSp), $STATE_1, $STATE_1
  1479. vaesenc 1*16($KSp), $STATE_1, $STATE_1
  1480. vaesenc 2*16($KSp), $STATE_1, $STATE_1
  1481. vaesenc 3*16($KSp), $STATE_1, $STATE_1
  1482. vaesenc 4*16($KSp), $STATE_1, $STATE_1
  1483. vaesenc 5*16($KSp), $STATE_1, $STATE_1
  1484. vaesenc 6*16($KSp), $STATE_1, $STATE_1
  1485. vaesenc 7*16($KSp), $STATE_1, $STATE_1
  1486. vaesenc 8*16($KSp), $STATE_1, $STATE_1
  1487. vaesenc 9*16($KSp), $STATE_1, $STATE_1
  1488. vaesenc 10*16($KSp), $STATE_1, $STATE_1
  1489. vaesenc 11*16($KSp), $STATE_1, $STATE_1
  1490. vaesenc 12*16($KSp), $STATE_1, $STATE_1
  1491. vaesenc 13*16($KSp), $STATE_1, $STATE_1
  1492. vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV
  1493. vmovdqa $STATE_1, (%rsi)
  1494. ret
  1495. .cfi_endproc
  1496. .size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
  1497. ___
  1498. }
  1499. aes256gcmsiv_ecb_enc_block();
  1500. sub aes256gcmsiv_enc_msg_x4 {
  1501. my $CTR1 = "%xmm0";
  1502. my $CTR2 = "%xmm1";
  1503. my $CTR3 = "%xmm2";
  1504. my $CTR4 = "%xmm3";
  1505. my $ADDER = "%xmm4";
  1506. my $STATE1 = "%xmm5";
  1507. my $STATE2 = "%xmm6";
  1508. my $STATE3 = "%xmm7";
  1509. my $STATE4 = "%xmm8";
  1510. my $TMP = "%xmm12";
  1511. my $TMP2 = "%xmm13";
  1512. my $TMP3 = "%xmm14";
  1513. my $IV = "%xmm15";
  1514. my $PT = "%rdi";
  1515. my $CT = "%rsi";
  1516. my $TAG = "%rdx";
  1517. my $KS = "%rcx";
  1518. my $LEN = "%r8";
  1519. my $aes_round = sub {
  1520. my ($i) = @_;
  1521. return <<___;
  1522. vmovdqu ${\eval($i*16)}($KS), $TMP
  1523. vaesenc $TMP, $STATE1, $STATE1
  1524. vaesenc $TMP, $STATE2, $STATE2
  1525. vaesenc $TMP, $STATE3, $STATE3
  1526. vaesenc $TMP, $STATE4, $STATE4
  1527. ___
  1528. };
  1529. my $aes_lastround = sub {
  1530. my ($i) = @_;
  1531. return <<___;
  1532. vmovdqu ${\eval($i*16)}($KS), $TMP
  1533. vaesenclast $TMP, $STATE1, $STATE1
  1534. vaesenclast $TMP, $STATE2, $STATE2
  1535. vaesenclast $TMP, $STATE3, $STATE3
  1536. vaesenclast $TMP, $STATE4, $STATE4
  1537. ___
  1538. };
  1539. # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
  1540. # unsigned char* TAG, unsigned char* KS,
  1541. # size_t byte_len);
  1542. # parameter 1: %rdi #PT
  1543. # parameter 2: %rsi #CT
  1544. # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
  1545. # parameter 4: %rcx #KS
  1546. # parameter 5: %r8 #LEN MSG_length in bytes
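# Same counter-block construction and CTR loop as aes128gcmsiv_enc_msg_x4
# above, using the 14-round AES-256 schedule.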
  1547. $code.=<<___;
  1548. .globl aes256gcmsiv_enc_msg_x4
  1549. .type aes256gcmsiv_enc_msg_x4,\@function,5
  1550. .align 16
  1551. aes256gcmsiv_enc_msg_x4:
  1552. .cfi_startproc
  1553. test $LEN, $LEN
  1554. jnz .L256_enc_msg_x4_start
  1555. ret
  1556. .L256_enc_msg_x4_start:
  1557. movq $LEN, %r10
  1558. shrq \$4, $LEN # LEN = num of blocks
  1559. shlq \$60, %r10
  1560. jz .L256_enc_msg_x4_start2
  1561. addq \$1, $LEN
  1562. .L256_enc_msg_x4_start2:
  1563. movq $LEN, %r10
  1564. shlq \$62, %r10
  1565. shrq \$62, %r10
  1566. # make IV from TAG
  1567. vmovdqa ($TAG), $IV
  1568. vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]
  1569. vmovdqa four(%rip), $ADDER # Register to increment counters
  1570. vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
  1571. vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
  1572. vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
  1573. vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
  1574. shrq \$2, $LEN
  1575. je .L256_enc_msg_x4_check_remainder
  1576. subq \$64, $CT
  1577. subq \$64, $PT
  1578. .L256_enc_msg_x4_loop1:
  1579. addq \$64, $CT
  1580. addq \$64, $PT
  1581. vmovdqa $CTR1, $STATE1
  1582. vmovdqa $CTR2, $STATE2
  1583. vmovdqa $CTR3, $STATE3
  1584. vmovdqa $CTR4, $STATE4
  1585. vpxor ($KS), $STATE1, $STATE1
  1586. vpxor ($KS), $STATE2, $STATE2
  1587. vpxor ($KS), $STATE3, $STATE3
  1588. vpxor ($KS), $STATE4, $STATE4
  1589. ${\$aes_round->(1)}
  1590. vpaddd $ADDER, $CTR1, $CTR1
  1591. ${\$aes_round->(2)}
  1592. vpaddd $ADDER, $CTR2, $CTR2
  1593. ${\$aes_round->(3)}
  1594. vpaddd $ADDER, $CTR3, $CTR3
  1595. ${\$aes_round->(4)}
  1596. vpaddd $ADDER, $CTR4, $CTR4
  1597. ${\$aes_round->(5)}
  1598. ${\$aes_round->(6)}
  1599. ${\$aes_round->(7)}
  1600. ${\$aes_round->(8)}
  1601. ${\$aes_round->(9)}
  1602. ${\$aes_round->(10)}
  1603. ${\$aes_round->(11)}
  1604. ${\$aes_round->(12)}
  1605. ${\$aes_round->(13)}
  1606. ${\$aes_lastround->(14)}
  1607. # XOR with Plaintext
  1608. vpxor 0*16($PT), $STATE1, $STATE1
  1609. vpxor 1*16($PT), $STATE2, $STATE2
  1610. vpxor 2*16($PT), $STATE3, $STATE3
  1611. vpxor 3*16($PT), $STATE4, $STATE4
  1612. subq \$1, $LEN
  1613. vmovdqu $STATE1, 0*16($CT)
  1614. vmovdqu $STATE2, 1*16($CT)
  1615. vmovdqu $STATE3, 2*16($CT)
  1616. vmovdqu $STATE4, 3*16($CT)
  1617. jne .L256_enc_msg_x4_loop1
  1618. addq \$64, $CT
  1619. addq \$64, $PT
  1620. .L256_enc_msg_x4_check_remainder:
  1621. cmpq \$0, %r10
  1622. je .L256_enc_msg_x4_out
  1623. .L256_enc_msg_x4_loop2:
  1624. # encrypt each block separately
  1625. # CTR1 is the highest counter (even if no LOOP done)
  1626. vmovdqa $CTR1, $STATE1
  1627. vpaddd one(%rip), $CTR1, $CTR1 # inc counter
  1628. vpxor ($KS), $STATE1, $STATE1
  1629. vaesenc 16($KS), $STATE1, $STATE1
  1630. vaesenc 32($KS), $STATE1, $STATE1
  1631. vaesenc 48($KS), $STATE1, $STATE1
  1632. vaesenc 64($KS), $STATE1, $STATE1
  1633. vaesenc 80($KS), $STATE1, $STATE1
  1634. vaesenc 96($KS), $STATE1, $STATE1
  1635. vaesenc 112($KS), $STATE1, $STATE1
  1636. vaesenc 128($KS), $STATE1, $STATE1
  1637. vaesenc 144($KS), $STATE1, $STATE1
  1638. vaesenc 160($KS), $STATE1, $STATE1
  1639. vaesenc 176($KS), $STATE1, $STATE1
  1640. vaesenc 192($KS), $STATE1, $STATE1
  1641. vaesenc 208($KS), $STATE1, $STATE1
  1642. vaesenclast 224($KS), $STATE1, $STATE1
  1643. # XOR with Plaintext
  1644. vpxor ($PT), $STATE1, $STATE1
  1645. vmovdqu $STATE1, ($CT)
  1646. addq \$16, $PT
  1647. addq \$16, $CT
  1648. subq \$1, %r10
  1649. jne .L256_enc_msg_x4_loop2
  1650. .L256_enc_msg_x4_out:
  1651. ret
  1652. .cfi_endproc
  1653. .size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
  1654. ___
  1655. }
  1656. aes256gcmsiv_enc_msg_x4();
  1657. sub aes256gcmsiv_enc_msg_x8 {
  1658. my $STATE1 = "%xmm1";
  1659. my $STATE2 = "%xmm2";
  1660. my $STATE3 = "%xmm3";
  1661. my $STATE4 = "%xmm4";
  1662. my $STATE5 = "%xmm5";
  1663. my $STATE6 = "%xmm6";
  1664. my $STATE7 = "%xmm7";
  1665. my $STATE8 = "%xmm8";
  1666. my $CTR1 = "%xmm0";
  1667. my $CTR2 = "%xmm9";
  1668. my $CTR3 = "%xmm10";
  1669. my $CTR4 = "%xmm11";
  1670. my $CTR5 = "%xmm12";
  1671. my $CTR6 = "%xmm13";
  1672. my $CTR7 = "%xmm14";
  1673. my $TMP1 = "%xmm1";
  1674. my $TMP2 = "%xmm2";
  1675. my $KS = "%rcx";
  1676. my $LEN = "%r8";
  1677. my $PT = "%rdi";
  1678. my $CT = "%rsi";
  1679. my $TAG = "%rdx";
  1680. my $SCHED = "%xmm15";
  1681. my $aes_round8 = sub {
  1682. my ($i) = @_;
  1683. return <<___;
  1684. vmovdqu ${\eval($i*16)}($KS), $SCHED
  1685. vaesenc $SCHED, $STATE1, $STATE1
  1686. vaesenc $SCHED, $STATE2, $STATE2
  1687. vaesenc $SCHED, $STATE3, $STATE3
  1688. vaesenc $SCHED, $STATE4, $STATE4
  1689. vaesenc $SCHED, $STATE5, $STATE5
  1690. vaesenc $SCHED, $STATE6, $STATE6
  1691. vaesenc $SCHED, $STATE7, $STATE7
  1692. vaesenc $SCHED, $STATE8, $STATE8
  1693. ___
  1694. };
  1695. my $aes_lastround8 = sub {
  1696. my ($i) = @_;
  1697. return <<___;
  1698. vmovdqu ${\eval($i*16)}($KS), $SCHED
  1699. vaesenclast $SCHED, $STATE1, $STATE1
  1700. vaesenclast $SCHED, $STATE2, $STATE2
  1701. vaesenclast $SCHED, $STATE3, $STATE3
  1702. vaesenclast $SCHED, $STATE4, $STATE4
  1703. vaesenclast $SCHED, $STATE5, $STATE5
  1704. vaesenclast $SCHED, $STATE6, $STATE6
  1705. vaesenclast $SCHED, $STATE7, $STATE7
  1706. vaesenclast $SCHED, $STATE8, $STATE8
  1707. ___
  1708. };
  1709. # void ENC_MSG_x8(unsigned char* PT,
  1710. # unsigned char* CT,
  1711. # unsigned char* TAG,
  1712. # unsigned char* KS,
  1713. # size_t byte_len);
  1714. # parameter 1: %rdi #PT
  1715. # parameter 2: %rsi #CT
  1716. # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
  1717. # parameter 4: %rcx #KS
  1718. # parameter 5: %r8 #LEN MSG_length in bytes
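# Editor's note (added, hedged): aes256gcmsiv_enc_msg_x8 below is plain
# AES-256 CTR encryption, eight blocks per iteration, with the tail handled
# one block at a time. Functionally it behaves like the following
# illustrative pseudocode (not part of the original source):
#
#   ctr = TAG with its most significant bit forced to 1;   // see OR_MASK
#   for (i = 0; i < ceil(byte_len / 16); i++) {
#     CT[i] = PT[i] ^ AES256(KS, ctr);   // 14 rounds, key schedule at KS
#     ctr.low_dword += 1;                // 32-bit lane add, no carry out
#   }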
$code.=<<___;
.globl aes256gcmsiv_enc_msg_x8
.type aes256gcmsiv_enc_msg_x8,\@function,5
.align 16
aes256gcmsiv_enc_msg_x8:
.cfi_startproc
test $LEN, $LEN
jnz .L256_enc_msg_x8_start
ret
.L256_enc_msg_x8_start:
# set up a 64-byte-aligned stack slot (holds the eighth counter block)
movq %rsp, %r11
subq \$16, %r11
andq \$-64, %r11
movq $LEN, %r10
shrq \$4, $LEN # LEN = num of blocks
shlq \$60, %r10
jz .L256_enc_msg_x8_start2
addq \$1, $LEN
.L256_enc_msg_x8_start2:
movq $LEN, %r10
shlq \$61, %r10
shrq \$61, %r10
# Make IV from TAG
vmovdqa ($TAG), $TMP1
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1 = IV = TAG with its most significant bit set
# store counter8 on the stack
vpaddd seven(%rip), $TMP1, $CTR1
vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
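# (added note) OR_MASK only sets the top bit of the tag block, so the eight
# counters above are that block plus 0..7 in the low 32-bit lane; vpaddd is a
# 32-bit add, so the counter wraps without carrying into the nonce lanes.
# The eighth counter is kept in the aligned stack slot addressed by %r11
# because all sixteen xmm registers are already in use in the loop below.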
shrq \$3, $LEN
jz .L256_enc_msg_x8_check_remainder
subq \$128, $CT
subq \$128, $PT
.L256_enc_msg_x8_loop1:
addq \$128, $CT
addq \$128, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vmovdqa $CTR5, $STATE5
vmovdqa $CTR6, $STATE6
vmovdqa $CTR7, $STATE7
# move from stack
vmovdqa (%r11), $STATE8
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
vpxor ($KS), $STATE5, $STATE5
vpxor ($KS), $STATE6, $STATE6
vpxor ($KS), $STATE7, $STATE7
vpxor ($KS), $STATE8, $STATE8
${\$aes_round8->(1)}
vmovdqa (%r11), $CTR7 # deal with CTR8
vpaddd eight(%rip), $CTR7, $CTR7
vmovdqa $CTR7, (%r11)
${\$aes_round8->(2)}
vpsubd one(%rip), $CTR7, $CTR7
${\$aes_round8->(3)}
vpaddd eight(%rip), $CTR1, $CTR1
${\$aes_round8->(4)}
vpaddd eight(%rip), $CTR2, $CTR2
${\$aes_round8->(5)}
vpaddd eight(%rip), $CTR3, $CTR3
${\$aes_round8->(6)}
vpaddd eight(%rip), $CTR4, $CTR4
${\$aes_round8->(7)}
vpaddd eight(%rip), $CTR5, $CTR5
${\$aes_round8->(8)}
vpaddd eight(%rip), $CTR6, $CTR6
${\$aes_round8->(9)}
${\$aes_round8->(10)}
${\$aes_round8->(11)}
${\$aes_round8->(12)}
${\$aes_round8->(13)}
${\$aes_lastround8->(14)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
vpxor 4*16($PT), $STATE5, $STATE5
vpxor 5*16($PT), $STATE6, $STATE6
vpxor 6*16($PT), $STATE7, $STATE7
vpxor 7*16($PT), $STATE8, $STATE8
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
vmovdqu $STATE5, 4*16($CT)
vmovdqu $STATE6, 5*16($CT)
vmovdqu $STATE7, 6*16($CT)
vmovdqu $STATE8, 7*16($CT)
jne .L256_enc_msg_x8_loop1
addq \$128, $CT
addq \$128, $PT
.L256_enc_msg_x8_check_remainder:
cmpq \$0, %r10
je .L256_enc_msg_x8_out
.L256_enc_msg_x8_loop2:
# encrypt each block separately
# CTR1 holds the next counter value to use (even if the x8 loop above was skipped)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenc 160($KS), $STATE1, $STATE1
vaesenc 176($KS), $STATE1, $STATE1
vaesenc 192($KS), $STATE1, $STATE1
vaesenc 208($KS), $STATE1, $STATE1
vaesenclast 224($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jnz .L256_enc_msg_x8_loop2
.L256_enc_msg_x8_out:
ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
___
}
aes256gcmsiv_enc_msg_x8();
aesgcmsiv_dec(1);
sub aes256gcmsiv_kdf {
my $ONE = "%xmm8";
my $BLOCK1 = "%xmm4";
my $BLOCK2 = "%xmm6";
my $BLOCK3 = "%xmm7";
my $BLOCK4 = "%xmm11";
my $BLOCK5 = "%xmm12";
my $BLOCK6 = "%xmm13";
my $enc_roundx6 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenc $j, $BLOCK1, $BLOCK1
vaesenc $j, $BLOCK2, $BLOCK2
vaesenc $j, $BLOCK3, $BLOCK3
vaesenc $j, $BLOCK4, $BLOCK4
vaesenc $j, $BLOCK5, $BLOCK5
vaesenc $j, $BLOCK6, $BLOCK6
___
};
my $enc_roundlastx6 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenclast $j, $BLOCK1, $BLOCK1
vaesenclast $j, $BLOCK2, $BLOCK2
vaesenclast $j, $BLOCK3, $BLOCK3
vaesenclast $j, $BLOCK4, $BLOCK4
vaesenclast $j, $BLOCK5, $BLOCK5
vaesenclast $j, $BLOCK6, $BLOCK6
___
};
# void aes256gcmsiv_kdf(const uint8_t nonce[16],
# uint8_t *out_key_material,
# const uint8_t *key_schedule);
$code.=<<___;
.globl aes256gcmsiv_kdf
.type aes256gcmsiv_kdf,\@function,3
.align 16
aes256gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to output key material
# parameter 3: %rdx Pointer to the AES-256 key schedule
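# (added note) The blocks built below are the AES-GCM-SIV key-derivation
# inputs: a little-endian 32-bit counter (0 through 5) in the low lane and
# the 96-bit nonce in the upper lanes. They are encrypted with the 14-round
# AES-256 schedule at %rdx and all six 16-byte results are written to %rsi;
# per RFC 8452 the caller is expected to keep only the first 8 bytes of each
# block (blocks 0-1 form the message-authentication key, blocks 2-5 the
# 256-bit message-encryption key); that truncation is not done here.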
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa and_mask(%rip), $BLOCK4
vmovdqa one(%rip), $ONE
vpshufd \$0x90, $BLOCK1, $BLOCK1
vpand $BLOCK4, $BLOCK1, $BLOCK1
vpaddd $ONE, $BLOCK1, $BLOCK2
vpaddd $ONE, $BLOCK2, $BLOCK3
vpaddd $ONE, $BLOCK3, $BLOCK4
vpaddd $ONE, $BLOCK4, $BLOCK5
vpaddd $ONE, $BLOCK5, $BLOCK6
vpxor %xmm1, $BLOCK1, $BLOCK1
vpxor %xmm1, $BLOCK2, $BLOCK2
vpxor %xmm1, $BLOCK3, $BLOCK3
vpxor %xmm1, $BLOCK4, $BLOCK4
vpxor %xmm1, $BLOCK5, $BLOCK5
vpxor %xmm1, $BLOCK6, $BLOCK6
${\$enc_roundx6->(1, "%xmm1")}
${\$enc_roundx6->(2, "%xmm2")}
${\$enc_roundx6->(3, "%xmm1")}
${\$enc_roundx6->(4, "%xmm2")}
${\$enc_roundx6->(5, "%xmm1")}
${\$enc_roundx6->(6, "%xmm2")}
${\$enc_roundx6->(7, "%xmm1")}
${\$enc_roundx6->(8, "%xmm2")}
${\$enc_roundx6->(9, "%xmm1")}
${\$enc_roundx6->(10, "%xmm2")}
${\$enc_roundx6->(11, "%xmm1")}
${\$enc_roundx6->(12, "%xmm2")}
${\$enc_roundx6->(13, "%xmm1")}
${\$enc_roundlastx6->(14, "%xmm2")}
vmovdqa $BLOCK1, 0*16(%rsi)
vmovdqa $BLOCK2, 1*16(%rsi)
vmovdqa $BLOCK3, 2*16(%rsi)
vmovdqa $BLOCK4, 3*16(%rsi)
vmovdqa $BLOCK5, 4*16(%rsi)
vmovdqa $BLOCK6, 5*16(%rsi)
ret
.cfi_endproc
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
___
}
aes256gcmsiv_kdf();
print $code;
close STDOUT or die "error closing STDOUT: $!";