You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

181 lines
3.4 KiB

  1. #include "params.h"
  /*
   * schoolbook off,sign
   *
   * Forms the product (a + b*X) * (c + d*X) lane by lane on 16 signed words,
   * keeping the partial products as full 32-bit values:
   *
   *     x = a*c + r*bd      when sign is 0
   *     x = a*c - r*bd      when sign is nonzero
   *     y = b*c + a*d
   *
   * where bd is b*d after the mullo / mulhi / subtract reduction step that
   * uses ymm1 and ymm0, and r is the word vector held in ymm2.
   *
   * Arguments
   *   off     byte displacement added to both operand pointers
   *   sign    selects addition (0) or subtraction (nonzero) of r*bd
   *
   * Register inputs
   *   rsi     first operand:  a at off, b at off+32 (vmovdqa, so 32-byte aligned)
   *   rdx     second operand: c at off, d at off+32 (vmovdqa, so 32-byte aligned)
   *   ymm0    multiplier for the mulhi step (callers load it from the 16xq
   *           symbol, presumably the modulus)
   *   ymm1    multiplier for the mullo step (callers load it from the 16xqinv
   *           symbol)
   *   ymm2    factor r
   *
   * Register outputs (dword lanes; the 0 half comes from vpunpcklwd, the
   * 1 half from vpunpckhwd, so each covers word positions 0-3 resp. 4-7
   * of every 128-bit lane)
   *   ymm14, ymm9     x0, x1
   *   ymm12, ymm7     y0, y1
   *
   * Clobbers ymm7-ymm15. ymm0-ymm6 are only read or not touched at all.
   */
  .macro schoolbook off,sign
        /* fetch the four operand vectors */
        vmovdqa         \off(%rsi),%ymm9                # a
        vmovdqa         \off+32(%rsi),%ymm7             # b
        vmovdqa         \off(%rdx),%ymm10               # c
        vmovdqa         \off+32(%rdx),%ymm8             # d

        /* 16x16 products, split into low and high words */
        vpmullw         %ymm7,%ymm8,%ymm11              # b*d low
        vpmulhw         %ymm7,%ymm8,%ymm12              # b*d high
        vpmullw         %ymm7,%ymm10,%ymm13             # b*c low
        vpmulhw         %ymm7,%ymm10,%ymm7              # b*c high (b no longer needed)
        vpmullw         %ymm9,%ymm8,%ymm14              # a*d low
        vpmulhw         %ymm9,%ymm8,%ymm8               # a*d high (d no longer needed)
        vpmullw         %ymm9,%ymm10,%ymm15             # a*c low
        vpmulhw         %ymm9,%ymm10,%ymm9              # a*c high (a no longer needed)

        /* bring b*d back to one word per lane: high - mulhi(low*ymm1, ymm0) */
        vpmullw         %ymm1,%ymm11,%ymm11
        vpmulhw         %ymm0,%ymm11,%ymm11
        vpsubw          %ymm11,%ymm12,%ymm11            # bd

        /* scale the reduced bd by r */
        vpmullw         %ymm2,%ymm11,%ymm10             # r*bd low (c no longer needed)
        vpmulhw         %ymm2,%ymm11,%ymm11             # r*bd high

        /* interleave low and high words into 32-bit products */
        vpunpcklwd      %ymm7,%ymm13,%ymm12             # bc0
        vpunpckhwd      %ymm7,%ymm13,%ymm7              # bc1
        vpunpcklwd      %ymm8,%ymm14,%ymm13             # ad0
        vpunpckhwd      %ymm8,%ymm14,%ymm8              # ad1
        vpunpcklwd      %ymm9,%ymm15,%ymm14             # ac0
        vpunpckhwd      %ymm9,%ymm15,%ymm9              # ac1
        vpunpcklwd      %ymm11,%ymm10,%ymm15            # rbd0
        vpunpckhwd      %ymm11,%ymm10,%ymm10            # rbd1

        /* y = bc + ad */
        vpaddd          %ymm12,%ymm13,%ymm12            # y0
        vpaddd          %ymm7,%ymm8,%ymm7               # y1

        /* x = ac -/+ rbd, direction chosen at assembly time */
  .if \sign
        vpsubd          %ymm15,%ymm14,%ymm14            # x0 = ac0 - rbd0
        vpsubd          %ymm10,%ymm9,%ymm9              # x1 = ac1 - rbd1
  .else
        vpaddd          %ymm14,%ymm15,%ymm14            # x0 = ac0 + rbd0
        vpaddd          %ymm9,%ymm10,%ymm9              # x1 = ac1 + rbd1
  .endif
  .endm
  /*
   * red a0,a1,b0,b1 x,y,z
   *
   * All seven arguments are ymm register numbers.
   *
   * Takes two values stored as 32-bit lanes, each spread over a register
   * pair (a0:a1 and b0:b1), and brings each down to one word per lane:
   *
   *   1. every dword is split into its low word (blend against zero) and
   *      its high word (logical shift right by 16);
   *   2. the halves of a pair are packed together, giving one register of
   *      low words and one of high words;
   *   3. result = high - mulhi(low * ymm1, ymm0).
   *
   * After the blend or shift every dword is below 65536, so the unsigned
   * saturation of vpackusdw never triggers and the pack is a plain
   * truncation.
   *
   * Register inputs
   *   a0,a1   first 32-bit value  (two halves)
   *   b0,b1   second 32-bit value (two halves)
   *   ymm0    multiplier for the mulhi step
   *   ymm1    multiplier for the mullo step
   *
   * Register outputs
   *   a0      reduced first value
   *   b0      reduced second value
   *
   * Clobbers a1, b1 and the scratch registers x, y, z.
   */
  .macro red a0,a1,b0,b1 x,y,z
        /* first value: separate low and high words, then repack */
        vpxor           %ymm\x,%ymm\x,%ymm\x            # all-zero blend source
        vpblendw        $0xAA,%ymm\x,%ymm\a0,%ymm\y     # odd words cleared: low half of a0
        vpblendw        $0xAA,%ymm\x,%ymm\a1,%ymm\z     # low half of a1
        vpsrld          $16,%ymm\a0,%ymm\a0             # high half of a0
        vpsrld          $16,%ymm\a1,%ymm\a1             # high half of a1
        vpackusdw       %ymm\z,%ymm\y,%ymm\z            # low words of the first value
        vpackusdw       %ymm\a1,%ymm\a0,%ymm\a0         # high words of the first value

        /* second value: same split, the zero register is consumed last */
        vpblendw        $0xAA,%ymm\x,%ymm\b0,%ymm\y     # low half of b0
        vpblendw        $0xAA,%ymm\x,%ymm\b1,%ymm\x     # low half of b1 (zero is gone now)
        vpsrld          $16,%ymm\b0,%ymm\b0             # high half of b0
        vpsrld          $16,%ymm\b1,%ymm\b1             # high half of b1
        vpackusdw       %ymm\x,%ymm\y,%ymm\y            # low words of the second value
        vpackusdw       %ymm\b1,%ymm\b0,%ymm\b0         # high words of the second value

        /* high - mulhi(low * ymm1, ymm0), both values interleaved */
        vpmullw         %ymm1,%ymm\z,%ymm\z
        vpmullw         %ymm1,%ymm\y,%ymm\y
        vpmulhw         %ymm0,%ymm\z,%ymm\z
        vpmulhw         %ymm0,%ymm\y,%ymm\y
        vpsubw          %ymm\z,%ymm\a0,%ymm\a0          # first result
        vpsubw          %ymm\y,%ymm\b0,%ymm\b0          # second result
  .endm
  /*
   * PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx
   *
   * Sums three schoolbook products and writes the reduced result.
   * The C prototype is not visible in this file; the register roles below
   * are read off the code (they match the System V AMD64 argument order).
   *
   *   rdi   destination, four 32-byte vmovdqa stores at offsets 0, 32, 64, 96
   *         (so it must be 32-byte aligned)
   *   rsi   first operand,  read with vmovdqa at k*512 + {0, 32, 64, 96}
   *         for k = 0, 1, 2
   *   rdx   second operand, read at the same offsets
   *   rcx   32 bytes loaded unaligned into ymm2, the factor r used by
   *         schoolbook
   *
   * Two passes are made. The first covers offsets 0/512/1024 with sign 0
   * and fills destination bytes 0-63; the second covers 64/576/1088 with
   * sign 1 and fills bytes 64-127. In each pass the 32-bit outputs of the
   * three schoolbook expansions are accumulated in ymm3-ymm6 and reduced
   * once at the end.
   *
   * Clobbers ymm0-ymm15. No general-purpose register is written and the
   * stack is not used. The function returns without vzeroupper.
   */
  .global PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx
  PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx:
        /* constants read by every schoolbook and red expansion */
        vmovdqa         PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0
        vmovdqa         PQCLEAN_KYBER76890S_AVX2_16xqinv(%rip),%ymm1
        vmovdqu         (%rcx),%ymm2

        /* ---- pass 1: offsets 0, 512, 1024, sign 0 ---- */
  poly0.0:
        schoolbook      0,0
        /* start the accumulators from the first product */
        vmovdqa         %ymm14,%ymm3                    # x0
        vmovdqa         %ymm9,%ymm4                     # x1
        vmovdqa         %ymm12,%ymm5                    # y0
        vmovdqa         %ymm7,%ymm6                     # y1

  poly1.0:
        schoolbook      512,0
        /* add the second product */
        vpaddd          %ymm14,%ymm3,%ymm3
        vpaddd          %ymm9,%ymm4,%ymm4
        vpaddd          %ymm12,%ymm5,%ymm5
        vpaddd          %ymm7,%ymm6,%ymm6

  poly2.0:
        schoolbook      1024,0
        /* add the third product */
        vpaddd          %ymm14,%ymm3,%ymm3
        vpaddd          %ymm9,%ymm4,%ymm4
        vpaddd          %ymm12,%ymm5,%ymm5
        vpaddd          %ymm7,%ymm6,%ymm6

        /* one reduction for the whole sum; ymm7-ymm9 are free scratch here */
        red             3,4,5,6 7,8,9

        vmovdqa         %ymm3,(%rdi)                    # reduced x
        vmovdqa         %ymm5,32(%rdi)                  # reduced y

        /* ---- pass 2: offsets 64, 576, 1088, sign 1 ---- */
  poly0.1:
        schoolbook      64,1
        /* restart the accumulators */
        vmovdqa         %ymm14,%ymm3                    # x0
        vmovdqa         %ymm9,%ymm4                     # x1
        vmovdqa         %ymm12,%ymm5                    # y0
        vmovdqa         %ymm7,%ymm6                     # y1

  poly1.1:
        schoolbook      576,1
        /* add the second product */
        vpaddd          %ymm14,%ymm3,%ymm3
        vpaddd          %ymm9,%ymm4,%ymm4
        vpaddd          %ymm12,%ymm5,%ymm5
        vpaddd          %ymm7,%ymm6,%ymm6

  poly2.1:
        schoolbook      1088,1
        /* add the third product */
        vpaddd          %ymm14,%ymm3,%ymm3
        vpaddd          %ymm9,%ymm4,%ymm4
        vpaddd          %ymm12,%ymm5,%ymm5
        vpaddd          %ymm7,%ymm6,%ymm6

        red             3,4,5,6 7,8,9

        vmovdqa         %ymm3,64(%rdi)                  # reduced x
        vmovdqa         %ymm5,96(%rdi)                  # reduced y
        ret
  /*
   * PQCLEAN_KYBER76890S_AVX2_basemul_avx
   *
   * Single-product variant: one schoolbook expansion per half, reduced and
   * stored directly, with no accumulation. The C prototype is not visible in
   * this file; register roles are read off the code (System V AMD64
   * argument order).
   *
   *   rdi   destination, four 32-byte vmovdqa stores at offsets 0, 32, 64, 96
   *         (so it must be 32-byte aligned)
   *   rsi   first operand,  read with vmovdqa at offsets 0, 32, 64, 96
   *   rdx   second operand, read at the same offsets
   *   rcx   32 bytes loaded unaligned into ymm2, the factor r used by
   *         schoolbook
   *
   * The half at offset 0 is computed with sign 0, the half at offset 64
   * with sign 1, both with the same ymm2.
   *
   * Clobbers ymm0-ymm2 and ymm7-ymm15. No general-purpose register is
   * written and the stack is not used. The function returns without
   * vzeroupper.
   */
  .global PQCLEAN_KYBER76890S_AVX2_basemul_avx
  PQCLEAN_KYBER76890S_AVX2_basemul_avx:
        /* constants read by every schoolbook and red expansion */
        vmovdqa         PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0
        vmovdqa         PQCLEAN_KYBER76890S_AVX2_16xqinv(%rip),%ymm1
        vmovdqu         (%rcx),%ymm2

        /* first half: x = ac + r*bd */
        schoolbook      0,0
        /* reduce straight out of the schoolbook output registers; */
        /* ymm8, ymm10 and ymm11 are no longer live and serve as scratch */
        red             14,9,12,7 8,10,11
        vmovdqa         %ymm14,(%rdi)                   # reduced x
        vmovdqa         %ymm12,32(%rdi)                 # reduced y

        /* second half: x = ac - r*bd */
        schoolbook      64,1
        red             14,9,12,7 8,10,11
        vmovdqa         %ymm14,64(%rdi)                 # reduced x
        vmovdqa         %ymm12,96(%rdi)                 # reduced y
        ret