You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

389 lines
15 KiB

  1. #include "consts.h"
  2. #include "params.h"
  3. #include "rejsample.h"
  4. #include <immintrin.h>
  5. #include <stdint.h>
  6. static const uint8_t idx[256][8] = {
  7. { 0, 0, 0, 0, 0, 0, 0, 0},
  8. { 0, 0, 0, 0, 0, 0, 0, 0},
  9. { 2, 0, 0, 0, 0, 0, 0, 0},
  10. { 0, 2, 0, 0, 0, 0, 0, 0},
  11. { 4, 0, 0, 0, 0, 0, 0, 0},
  12. { 0, 4, 0, 0, 0, 0, 0, 0},
  13. { 2, 4, 0, 0, 0, 0, 0, 0},
  14. { 0, 2, 4, 0, 0, 0, 0, 0},
  15. { 6, 0, 0, 0, 0, 0, 0, 0},
  16. { 0, 6, 0, 0, 0, 0, 0, 0},
  17. { 2, 6, 0, 0, 0, 0, 0, 0},
  18. { 0, 2, 6, 0, 0, 0, 0, 0},
  19. { 4, 6, 0, 0, 0, 0, 0, 0},
  20. { 0, 4, 6, 0, 0, 0, 0, 0},
  21. { 2, 4, 6, 0, 0, 0, 0, 0},
  22. { 0, 2, 4, 6, 0, 0, 0, 0},
  23. { 8, 0, 0, 0, 0, 0, 0, 0},
  24. { 0, 8, 0, 0, 0, 0, 0, 0},
  25. { 2, 8, 0, 0, 0, 0, 0, 0},
  26. { 0, 2, 8, 0, 0, 0, 0, 0},
  27. { 4, 8, 0, 0, 0, 0, 0, 0},
  28. { 0, 4, 8, 0, 0, 0, 0, 0},
  29. { 2, 4, 8, 0, 0, 0, 0, 0},
  30. { 0, 2, 4, 8, 0, 0, 0, 0},
  31. { 6, 8, 0, 0, 0, 0, 0, 0},
  32. { 0, 6, 8, 0, 0, 0, 0, 0},
  33. { 2, 6, 8, 0, 0, 0, 0, 0},
  34. { 0, 2, 6, 8, 0, 0, 0, 0},
  35. { 4, 6, 8, 0, 0, 0, 0, 0},
  36. { 0, 4, 6, 8, 0, 0, 0, 0},
  37. { 2, 4, 6, 8, 0, 0, 0, 0},
  38. { 0, 2, 4, 6, 8, 0, 0, 0},
  39. {10, 0, 0, 0, 0, 0, 0, 0},
  40. { 0, 10, 0, 0, 0, 0, 0, 0},
  41. { 2, 10, 0, 0, 0, 0, 0, 0},
  42. { 0, 2, 10, 0, 0, 0, 0, 0},
  43. { 4, 10, 0, 0, 0, 0, 0, 0},
  44. { 0, 4, 10, 0, 0, 0, 0, 0},
  45. { 2, 4, 10, 0, 0, 0, 0, 0},
  46. { 0, 2, 4, 10, 0, 0, 0, 0},
  47. { 6, 10, 0, 0, 0, 0, 0, 0},
  48. { 0, 6, 10, 0, 0, 0, 0, 0},
  49. { 2, 6, 10, 0, 0, 0, 0, 0},
  50. { 0, 2, 6, 10, 0, 0, 0, 0},
  51. { 4, 6, 10, 0, 0, 0, 0, 0},
  52. { 0, 4, 6, 10, 0, 0, 0, 0},
  53. { 2, 4, 6, 10, 0, 0, 0, 0},
  54. { 0, 2, 4, 6, 10, 0, 0, 0},
  55. { 8, 10, 0, 0, 0, 0, 0, 0},
  56. { 0, 8, 10, 0, 0, 0, 0, 0},
  57. { 2, 8, 10, 0, 0, 0, 0, 0},
  58. { 0, 2, 8, 10, 0, 0, 0, 0},
  59. { 4, 8, 10, 0, 0, 0, 0, 0},
  60. { 0, 4, 8, 10, 0, 0, 0, 0},
  61. { 2, 4, 8, 10, 0, 0, 0, 0},
  62. { 0, 2, 4, 8, 10, 0, 0, 0},
  63. { 6, 8, 10, 0, 0, 0, 0, 0},
  64. { 0, 6, 8, 10, 0, 0, 0, 0},
  65. { 2, 6, 8, 10, 0, 0, 0, 0},
  66. { 0, 2, 6, 8, 10, 0, 0, 0},
  67. { 4, 6, 8, 10, 0, 0, 0, 0},
  68. { 0, 4, 6, 8, 10, 0, 0, 0},
  69. { 2, 4, 6, 8, 10, 0, 0, 0},
  70. { 0, 2, 4, 6, 8, 10, 0, 0},
  71. {12, 0, 0, 0, 0, 0, 0, 0},
  72. { 0, 12, 0, 0, 0, 0, 0, 0},
  73. { 2, 12, 0, 0, 0, 0, 0, 0},
  74. { 0, 2, 12, 0, 0, 0, 0, 0},
  75. { 4, 12, 0, 0, 0, 0, 0, 0},
  76. { 0, 4, 12, 0, 0, 0, 0, 0},
  77. { 2, 4, 12, 0, 0, 0, 0, 0},
  78. { 0, 2, 4, 12, 0, 0, 0, 0},
  79. { 6, 12, 0, 0, 0, 0, 0, 0},
  80. { 0, 6, 12, 0, 0, 0, 0, 0},
  81. { 2, 6, 12, 0, 0, 0, 0, 0},
  82. { 0, 2, 6, 12, 0, 0, 0, 0},
  83. { 4, 6, 12, 0, 0, 0, 0, 0},
  84. { 0, 4, 6, 12, 0, 0, 0, 0},
  85. { 2, 4, 6, 12, 0, 0, 0, 0},
  86. { 0, 2, 4, 6, 12, 0, 0, 0},
  87. { 8, 12, 0, 0, 0, 0, 0, 0},
  88. { 0, 8, 12, 0, 0, 0, 0, 0},
  89. { 2, 8, 12, 0, 0, 0, 0, 0},
  90. { 0, 2, 8, 12, 0, 0, 0, 0},
  91. { 4, 8, 12, 0, 0, 0, 0, 0},
  92. { 0, 4, 8, 12, 0, 0, 0, 0},
  93. { 2, 4, 8, 12, 0, 0, 0, 0},
  94. { 0, 2, 4, 8, 12, 0, 0, 0},
  95. { 6, 8, 12, 0, 0, 0, 0, 0},
  96. { 0, 6, 8, 12, 0, 0, 0, 0},
  97. { 2, 6, 8, 12, 0, 0, 0, 0},
  98. { 0, 2, 6, 8, 12, 0, 0, 0},
  99. { 4, 6, 8, 12, 0, 0, 0, 0},
  100. { 0, 4, 6, 8, 12, 0, 0, 0},
  101. { 2, 4, 6, 8, 12, 0, 0, 0},
  102. { 0, 2, 4, 6, 8, 12, 0, 0},
  103. {10, 12, 0, 0, 0, 0, 0, 0},
  104. { 0, 10, 12, 0, 0, 0, 0, 0},
  105. { 2, 10, 12, 0, 0, 0, 0, 0},
  106. { 0, 2, 10, 12, 0, 0, 0, 0},
  107. { 4, 10, 12, 0, 0, 0, 0, 0},
  108. { 0, 4, 10, 12, 0, 0, 0, 0},
  109. { 2, 4, 10, 12, 0, 0, 0, 0},
  110. { 0, 2, 4, 10, 12, 0, 0, 0},
  111. { 6, 10, 12, 0, 0, 0, 0, 0},
  112. { 0, 6, 10, 12, 0, 0, 0, 0},
  113. { 2, 6, 10, 12, 0, 0, 0, 0},
  114. { 0, 2, 6, 10, 12, 0, 0, 0},
  115. { 4, 6, 10, 12, 0, 0, 0, 0},
  116. { 0, 4, 6, 10, 12, 0, 0, 0},
  117. { 2, 4, 6, 10, 12, 0, 0, 0},
  118. { 0, 2, 4, 6, 10, 12, 0, 0},
  119. { 8, 10, 12, 0, 0, 0, 0, 0},
  120. { 0, 8, 10, 12, 0, 0, 0, 0},
  121. { 2, 8, 10, 12, 0, 0, 0, 0},
  122. { 0, 2, 8, 10, 12, 0, 0, 0},
  123. { 4, 8, 10, 12, 0, 0, 0, 0},
  124. { 0, 4, 8, 10, 12, 0, 0, 0},
  125. { 2, 4, 8, 10, 12, 0, 0, 0},
  126. { 0, 2, 4, 8, 10, 12, 0, 0},
  127. { 6, 8, 10, 12, 0, 0, 0, 0},
  128. { 0, 6, 8, 10, 12, 0, 0, 0},
  129. { 2, 6, 8, 10, 12, 0, 0, 0},
  130. { 0, 2, 6, 8, 10, 12, 0, 0},
  131. { 4, 6, 8, 10, 12, 0, 0, 0},
  132. { 0, 4, 6, 8, 10, 12, 0, 0},
  133. { 2, 4, 6, 8, 10, 12, 0, 0},
  134. { 0, 2, 4, 6, 8, 10, 12, 0},
  135. {14, 0, 0, 0, 0, 0, 0, 0},
  136. { 0, 14, 0, 0, 0, 0, 0, 0},
  137. { 2, 14, 0, 0, 0, 0, 0, 0},
  138. { 0, 2, 14, 0, 0, 0, 0, 0},
  139. { 4, 14, 0, 0, 0, 0, 0, 0},
  140. { 0, 4, 14, 0, 0, 0, 0, 0},
  141. { 2, 4, 14, 0, 0, 0, 0, 0},
  142. { 0, 2, 4, 14, 0, 0, 0, 0},
  143. { 6, 14, 0, 0, 0, 0, 0, 0},
  144. { 0, 6, 14, 0, 0, 0, 0, 0},
  145. { 2, 6, 14, 0, 0, 0, 0, 0},
  146. { 0, 2, 6, 14, 0, 0, 0, 0},
  147. { 4, 6, 14, 0, 0, 0, 0, 0},
  148. { 0, 4, 6, 14, 0, 0, 0, 0},
  149. { 2, 4, 6, 14, 0, 0, 0, 0},
  150. { 0, 2, 4, 6, 14, 0, 0, 0},
  151. { 8, 14, 0, 0, 0, 0, 0, 0},
  152. { 0, 8, 14, 0, 0, 0, 0, 0},
  153. { 2, 8, 14, 0, 0, 0, 0, 0},
  154. { 0, 2, 8, 14, 0, 0, 0, 0},
  155. { 4, 8, 14, 0, 0, 0, 0, 0},
  156. { 0, 4, 8, 14, 0, 0, 0, 0},
  157. { 2, 4, 8, 14, 0, 0, 0, 0},
  158. { 0, 2, 4, 8, 14, 0, 0, 0},
  159. { 6, 8, 14, 0, 0, 0, 0, 0},
  160. { 0, 6, 8, 14, 0, 0, 0, 0},
  161. { 2, 6, 8, 14, 0, 0, 0, 0},
  162. { 0, 2, 6, 8, 14, 0, 0, 0},
  163. { 4, 6, 8, 14, 0, 0, 0, 0},
  164. { 0, 4, 6, 8, 14, 0, 0, 0},
  165. { 2, 4, 6, 8, 14, 0, 0, 0},
  166. { 0, 2, 4, 6, 8, 14, 0, 0},
  167. {10, 14, 0, 0, 0, 0, 0, 0},
  168. { 0, 10, 14, 0, 0, 0, 0, 0},
  169. { 2, 10, 14, 0, 0, 0, 0, 0},
  170. { 0, 2, 10, 14, 0, 0, 0, 0},
  171. { 4, 10, 14, 0, 0, 0, 0, 0},
  172. { 0, 4, 10, 14, 0, 0, 0, 0},
  173. { 2, 4, 10, 14, 0, 0, 0, 0},
  174. { 0, 2, 4, 10, 14, 0, 0, 0},
  175. { 6, 10, 14, 0, 0, 0, 0, 0},
  176. { 0, 6, 10, 14, 0, 0, 0, 0},
  177. { 2, 6, 10, 14, 0, 0, 0, 0},
  178. { 0, 2, 6, 10, 14, 0, 0, 0},
  179. { 4, 6, 10, 14, 0, 0, 0, 0},
  180. { 0, 4, 6, 10, 14, 0, 0, 0},
  181. { 2, 4, 6, 10, 14, 0, 0, 0},
  182. { 0, 2, 4, 6, 10, 14, 0, 0},
  183. { 8, 10, 14, 0, 0, 0, 0, 0},
  184. { 0, 8, 10, 14, 0, 0, 0, 0},
  185. { 2, 8, 10, 14, 0, 0, 0, 0},
  186. { 0, 2, 8, 10, 14, 0, 0, 0},
  187. { 4, 8, 10, 14, 0, 0, 0, 0},
  188. { 0, 4, 8, 10, 14, 0, 0, 0},
  189. { 2, 4, 8, 10, 14, 0, 0, 0},
  190. { 0, 2, 4, 8, 10, 14, 0, 0},
  191. { 6, 8, 10, 14, 0, 0, 0, 0},
  192. { 0, 6, 8, 10, 14, 0, 0, 0},
  193. { 2, 6, 8, 10, 14, 0, 0, 0},
  194. { 0, 2, 6, 8, 10, 14, 0, 0},
  195. { 4, 6, 8, 10, 14, 0, 0, 0},
  196. { 0, 4, 6, 8, 10, 14, 0, 0},
  197. { 2, 4, 6, 8, 10, 14, 0, 0},
  198. { 0, 2, 4, 6, 8, 10, 14, 0},
  199. {12, 14, 0, 0, 0, 0, 0, 0},
  200. { 0, 12, 14, 0, 0, 0, 0, 0},
  201. { 2, 12, 14, 0, 0, 0, 0, 0},
  202. { 0, 2, 12, 14, 0, 0, 0, 0},
  203. { 4, 12, 14, 0, 0, 0, 0, 0},
  204. { 0, 4, 12, 14, 0, 0, 0, 0},
  205. { 2, 4, 12, 14, 0, 0, 0, 0},
  206. { 0, 2, 4, 12, 14, 0, 0, 0},
  207. { 6, 12, 14, 0, 0, 0, 0, 0},
  208. { 0, 6, 12, 14, 0, 0, 0, 0},
  209. { 2, 6, 12, 14, 0, 0, 0, 0},
  210. { 0, 2, 6, 12, 14, 0, 0, 0},
  211. { 4, 6, 12, 14, 0, 0, 0, 0},
  212. { 0, 4, 6, 12, 14, 0, 0, 0},
  213. { 2, 4, 6, 12, 14, 0, 0, 0},
  214. { 0, 2, 4, 6, 12, 14, 0, 0},
  215. { 8, 12, 14, 0, 0, 0, 0, 0},
  216. { 0, 8, 12, 14, 0, 0, 0, 0},
  217. { 2, 8, 12, 14, 0, 0, 0, 0},
  218. { 0, 2, 8, 12, 14, 0, 0, 0},
  219. { 4, 8, 12, 14, 0, 0, 0, 0},
  220. { 0, 4, 8, 12, 14, 0, 0, 0},
  221. { 2, 4, 8, 12, 14, 0, 0, 0},
  222. { 0, 2, 4, 8, 12, 14, 0, 0},
  223. { 6, 8, 12, 14, 0, 0, 0, 0},
  224. { 0, 6, 8, 12, 14, 0, 0, 0},
  225. { 2, 6, 8, 12, 14, 0, 0, 0},
  226. { 0, 2, 6, 8, 12, 14, 0, 0},
  227. { 4, 6, 8, 12, 14, 0, 0, 0},
  228. { 0, 4, 6, 8, 12, 14, 0, 0},
  229. { 2, 4, 6, 8, 12, 14, 0, 0},
  230. { 0, 2, 4, 6, 8, 12, 14, 0},
  231. {10, 12, 14, 0, 0, 0, 0, 0},
  232. { 0, 10, 12, 14, 0, 0, 0, 0},
  233. { 2, 10, 12, 14, 0, 0, 0, 0},
  234. { 0, 2, 10, 12, 14, 0, 0, 0},
  235. { 4, 10, 12, 14, 0, 0, 0, 0},
  236. { 0, 4, 10, 12, 14, 0, 0, 0},
  237. { 2, 4, 10, 12, 14, 0, 0, 0},
  238. { 0, 2, 4, 10, 12, 14, 0, 0},
  239. { 6, 10, 12, 14, 0, 0, 0, 0},
  240. { 0, 6, 10, 12, 14, 0, 0, 0},
  241. { 2, 6, 10, 12, 14, 0, 0, 0},
  242. { 0, 2, 6, 10, 12, 14, 0, 0},
  243. { 4, 6, 10, 12, 14, 0, 0, 0},
  244. { 0, 4, 6, 10, 12, 14, 0, 0},
  245. { 2, 4, 6, 10, 12, 14, 0, 0},
  246. { 0, 2, 4, 6, 10, 12, 14, 0},
  247. { 8, 10, 12, 14, 0, 0, 0, 0},
  248. { 0, 8, 10, 12, 14, 0, 0, 0},
  249. { 2, 8, 10, 12, 14, 0, 0, 0},
  250. { 0, 2, 8, 10, 12, 14, 0, 0},
  251. { 4, 8, 10, 12, 14, 0, 0, 0},
  252. { 0, 4, 8, 10, 12, 14, 0, 0},
  253. { 2, 4, 8, 10, 12, 14, 0, 0},
  254. { 0, 2, 4, 8, 10, 12, 14, 0},
  255. { 6, 8, 10, 12, 14, 0, 0, 0},
  256. { 0, 6, 8, 10, 12, 14, 0, 0},
  257. { 2, 6, 8, 10, 12, 14, 0, 0},
  258. { 0, 2, 6, 8, 10, 12, 14, 0},
  259. { 4, 6, 8, 10, 12, 14, 0, 0},
  260. { 0, 4, 6, 8, 10, 12, 14, 0},
  261. { 2, 4, 6, 8, 10, 12, 14, 0},
  262. { 0, 2, 4, 6, 8, 10, 12, 14}
  263. };
  264. #define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
  265. #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
  266. size_t PQCLEAN_KYBER76890S_AVX2_rej_uniform(int16_t *r,
  267. size_t len,
  268. const uint8_t *buf,
  269. size_t buflen) {
  270. size_t ctr, pos;
  271. uint16_t val;
  272. uint32_t good0, good1, good2;
  273. const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison
  274. const __m256i ones = _mm256_set1_epi8(1);
  275. const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_16xq.as_vec);
  276. const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_16xv.as_vec);
  277. __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2;
  278. __m128i d, tmp, pilo, pihi;
  279. ctr = pos = 0;
  280. while (ctr + 48 <= len && pos + 96 <= buflen) {
  281. d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]);
  282. d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]);
  283. d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]);
  284. tmp0 = _mm256_cmpge_epu16(bound, d0);
  285. tmp1 = _mm256_cmpge_epu16(bound, d1);
  286. tmp2 = _mm256_cmpge_epu16(bound, d2);
  287. good0 = (uint32_t)_mm256_movemask_epi8(tmp0);
  288. good1 = (uint32_t)_mm256_movemask_epi8(tmp1);
  289. good2 = (uint32_t)_mm256_movemask_epi8(tmp2);
  290. good0 = _pext_u32(good0, 0x55555555);
  291. good1 = _pext_u32(good1, 0x55555555);
  292. good2 = _pext_u32(good2, 0x55555555);
  293. pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]);
  294. pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]);
  295. pi0 = _mm256_castsi128_si256(pilo);
  296. pi0 = _mm256_inserti128_si256(pi0, pihi, 1);
  297. pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]);
  298. pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]);
  299. pi1 = _mm256_castsi128_si256(pilo);
  300. pi1 = _mm256_inserti128_si256(pi1, pihi, 1);
  301. pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]);
  302. pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]);
  303. pi2 = _mm256_castsi128_si256(pilo);
  304. pi2 = _mm256_inserti128_si256(pi2, pihi, 1);
  305. tmp0 = _mm256_add_epi8(pi0, ones);
  306. tmp1 = _mm256_add_epi8(pi1, ones);
  307. tmp2 = _mm256_add_epi8(pi2, ones);
  308. pi0 = _mm256_unpacklo_epi8(pi0, tmp0);
  309. pi1 = _mm256_unpacklo_epi8(pi1, tmp1);
  310. pi2 = _mm256_unpacklo_epi8(pi2, tmp2);
  311. d0 = _mm256_shuffle_epi8(d0, pi0);
  312. d1 = _mm256_shuffle_epi8(d1, pi1);
  313. d2 = _mm256_shuffle_epi8(d2, pi2);
  314. /* Barrett reduction of (still unsigned) d values */
  315. tmp0 = _mm256_mulhi_epu16(d0, v);
  316. tmp1 = _mm256_mulhi_epu16(d1, v);
  317. tmp2 = _mm256_mulhi_epu16(d2, v);
  318. tmp0 = _mm256_srli_epi16(tmp0, 10);
  319. tmp1 = _mm256_srli_epi16(tmp1, 10);
  320. tmp2 = _mm256_srli_epi16(tmp2, 10);
  321. tmp0 = _mm256_mullo_epi16(tmp0, kyberq);
  322. tmp1 = _mm256_mullo_epi16(tmp1, kyberq);
  323. tmp2 = _mm256_mullo_epi16(tmp2, kyberq);
  324. d0 = _mm256_sub_epi16(d0, tmp0);
  325. d1 = _mm256_sub_epi16(d1, tmp1);
  326. d2 = _mm256_sub_epi16(d2, tmp2);
  327. _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0));
  328. ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF);
  329. _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1));
  330. ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF);
  331. _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1));
  332. ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF);
  333. _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1));
  334. ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF);
  335. _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2));
  336. ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF);
  337. _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1));
  338. ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF);
  339. pos += 96;
  340. }
  341. while (ctr + 8 <= len && pos + 16 <= buflen) {
  342. d = _mm_loadu_si128((__m128i *)&buf[pos]);
  343. tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d);
  344. good0 = (uint32_t)_mm_movemask_epi8(tmp);
  345. good0 = _pext_u32(good0, 0x55555555);
  346. pilo = _mm_loadl_epi64((__m128i *)&idx[good0]);
  347. pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
  348. pilo = _mm_unpacklo_epi8(pilo, pihi);
  349. d = _mm_shuffle_epi8(d, pilo);
  350. /* Barrett reduction */
  351. tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v));
  352. tmp = _mm_srli_epi16(tmp, 10);
  353. tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq));
  354. d = _mm_sub_epi16(d, tmp);
  355. _mm_storeu_si128((__m128i *)&r[ctr], d);
  356. ctr += (unsigned int)_mm_popcnt_u32(good0);
  357. pos += 16;
  358. }
  359. while (ctr < len && pos + 2 <= buflen) {
  360. val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
  361. pos += 2;
  362. if (val < 19 * KYBER_Q) {
  363. val -= ((int32_t)val * 20159 >> 26) * KYBER_Q;
  364. r[ctr++] = (int16_t)val;
  365. }
  366. }
  367. return ctr;
  368. }