Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.
 
 
 

263 rindas
12 KiB

  1. /*
  2. This file is for the Gao-Mateer FFT
  3. see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
  4. */
  5. #include "fft.h"
  6. #include "transpose.h"
  7. #include "vec256.h"
  8. #include <stdint.h>
  9. /* input: in, polynomial in bitsliced form */
  10. /* output: in, result of applying the radix conversions on in */
  11. static void radix_conversions(vec128 *in) {
  12. int i, j, k;
  13. vec128 t;
  14. uint64_t v0, v1;
  15. const vec128 mask[5][2] = {
  16. {
  17. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888),
  18. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444)
  19. },
  20. {
  21. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0),
  22. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030)
  23. },
  24. {
  25. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000),
  26. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00)
  27. },
  28. {
  29. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000),
  30. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000)
  31. },
  32. {
  33. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000),
  34. PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000)
  35. }
  36. };
  37. const vec128 s[5][GFBITS] = {
  38. #include "scalars_2x.inc"
  39. };
  40. //
  41. for (j = 0; j <= 5; j++) {
  42. for (i = 0; i < GFBITS; i++) {
  43. v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[i], 1);
  44. v1 ^= v1 >> 32;
  45. v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[i], 0);
  46. v0 ^= v1 << 32;
  47. in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v0, v1);
  48. }
  49. for (i = 0; i < GFBITS; i++) {
  50. for (k = 4; k >= j; k--) {
  51. t = PQCLEAN_MCELIECE460896F_AVX_vec128_and(in[i], mask[k][0]);
  52. t = PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(t, 1 << k);
  53. in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(in[i], t);
  54. t = PQCLEAN_MCELIECE460896F_AVX_vec128_and(in[i], mask[k][1]);
  55. t = PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(t, 1 << k);
  56. in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(in[i], t);
  57. }
  58. }
  59. if (j < 5) {
  60. PQCLEAN_MCELIECE460896F_AVX_vec128_mul(in, in, s[j]); // scaling
  61. }
  62. }
  63. }
  64. /* input: in, result of applying the radix conversions to the input polynomial */
  65. /* output: out, evaluation results (by applying the FFT butterflies) */
  66. static void butterflies(vec256 out[][ GFBITS ], vec128 *in) {
  67. int i, j, k, s, b;
  68. vec128 tmp[ GFBITS ];
  69. vec256 tmp0[ GFBITS ];
  70. vec256 tmp1[ GFBITS ];
  71. vec128 t[ GFBITS ];
  72. union {
  73. vec128 v[8][ GFBITS + 1 ];
  74. vec256 V[8][ (GFBITS + 1) / 2 ];
  75. } pre;
  76. union {
  77. vec128 v[64][ 2 ];
  78. vec256 V[64];
  79. } buf;
  80. uint64_t v0, v1;
  81. const vec256 consts[ 33 ][ GFBITS ] = {
  82. #include "consts.inc"
  83. };
  84. uint64_t consts_ptr = 2;
  85. const unsigned char reversal[64] = {
  86. 0, 32, 16, 48, 8, 40, 24, 56,
  87. 4, 36, 20, 52, 12, 44, 28, 60,
  88. 2, 34, 18, 50, 10, 42, 26, 58,
  89. 6, 38, 22, 54, 14, 46, 30, 62,
  90. 1, 33, 17, 49, 9, 41, 25, 57,
  91. 5, 37, 21, 53, 13, 45, 29, 61,
  92. 3, 35, 19, 51, 11, 43, 27, 59,
  93. 7, 39, 23, 55, 15, 47, 31, 63
  94. };
  95. const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0};
  96. // boradcast
  97. for (j = 0; j < GFBITS; j++) {
  98. t[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(in[j], in[j]);
  99. }
  100. for (i = 0; i < 8; i += 2) {
  101. for (j = 0; j < GFBITS; j++) {
  102. v0 = (beta[i + 0] >> j) & 1;
  103. v0 = -v0;
  104. v1 = (beta[i + 1] >> j) & 1;
  105. v1 = -v1;
  106. tmp[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v0, v1);
  107. }
  108. PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, t, tmp);
  109. for (j = 0; j < GFBITS; j++) {
  110. pre.v[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(tmp[j], tmp[j]);
  111. pre.v[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(tmp[j], tmp[j]);
  112. }
  113. }
  114. for (i = 0; i < GFBITS; i += 2) {
  115. if (i != GFBITS - 1) {
  116. buf.v[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]);
  117. }
  118. buf.v[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]);
  119. buf.V[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]);
  120. buf.V[16] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]);
  121. buf.V[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]);
  122. buf.V[48] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]);
  123. buf.V[49] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]);
  124. buf.V[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]);
  125. buf.V[51] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]);
  126. buf.V[6] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]);
  127. buf.V[50] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]);
  128. buf.V[7] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]);
  129. buf.V[54] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]);
  130. buf.V[5] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]);
  131. buf.V[55] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]);
  132. buf.V[53] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]);
  133. buf.V[4] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]);
  134. buf.V[52] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]);
  135. buf.V[12] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]);
  136. buf.V[60] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]);
  137. buf.V[13] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]);
  138. buf.V[61] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]);
  139. buf.V[15] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]);
  140. buf.V[63] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]);
  141. buf.V[14] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]);
  142. buf.V[62] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]);
  143. buf.V[10] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]);
  144. buf.V[58] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]);
  145. buf.V[11] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]);
  146. buf.V[59] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]);
  147. buf.V[9] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]);
  148. buf.V[57] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]);
  149. buf.V[56] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]);
  150. buf.V[8] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]);
  151. buf.V[40] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]);
  152. buf.V[24] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]);
  153. buf.V[41] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]);
  154. buf.V[25] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]);
  155. buf.V[43] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]);
  156. buf.V[27] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]);
  157. buf.V[42] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]);
  158. buf.V[26] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]);
  159. buf.V[46] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]);
  160. buf.V[30] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]);
  161. buf.V[47] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]);
  162. buf.V[31] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]);
  163. buf.V[45] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]);
  164. buf.V[29] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]);
  165. buf.V[44] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]);
  166. buf.V[28] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]);
  167. buf.V[36] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]);
  168. buf.V[20] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]);
  169. buf.V[37] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]);
  170. buf.V[21] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]);
  171. buf.V[39] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]);
  172. buf.V[23] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]);
  173. buf.V[38] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]);
  174. buf.V[22] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]);
  175. buf.V[34] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]);
  176. buf.V[18] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]);
  177. buf.V[35] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]);
  178. buf.V[19] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]);
  179. buf.V[33] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]);
  180. buf.V[17] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]);
  181. buf.V[32] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]);
  182. // transpose
  183. PQCLEAN_MCELIECE460896F_AVX_transpose_64x256_sp(buf.V);
  184. for (j = 0; j < 32; j++) {
  185. if (i != GFBITS - 1) {
  186. out[j][i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]);
  187. }
  188. out[j][i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]);
  189. }
  190. }
  191. // butterflies
  192. for (k = 0; k < 32; k += 2) {
  193. for (b = 0; b < GFBITS; b++) {
  194. tmp0[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]);
  195. }
  196. for (b = 0; b < GFBITS; b++) {
  197. tmp1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]);
  198. }
  199. PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]);
  200. for (b = 0; b < GFBITS; b++) {
  201. out[k][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]);
  202. }
  203. for (b = 0; b < GFBITS; b++) {
  204. out[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]);
  205. }
  206. }
  207. for (i = 0; i <= 4; i++) {
  208. s = 1 << i;
  209. for (j = 0; j < 32; j += 2 * s) {
  210. for (k = j; k < j + s; k++) {
  211. PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]);
  212. }
  213. }
  214. consts_ptr += (1 << i);
  215. }
  216. }
  217. /* input: in, polynomial in bitsliced form */
  218. /* output: out, bitsliced results of evaluating in all the field elements */
  219. void PQCLEAN_MCELIECE460896F_AVX_fft(vec256 out[][GFBITS], vec128 *in) {
  220. radix_conversions(in);
  221. butterflies(out, in);
  222. }