Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 

802 linhas
27 KiB

/*
AES-NI implementation (x86 intrinsics from <immintrin.h>) of the Haraka256 and
Haraka512 permutations and of the Haraka-S absorb/squeeze routines built on them.
*/
  4. #include <immintrin.h>
  5. #include <stdint.h>
  6. #include <stdio.h>
  7. #include <stdlib.h>
  8. #include <string.h>
  9. #include "haraka.h"
  10. #define HARAKAS_RATE 32
  11. #define u64 uint64_t
  12. #define u128 __m128i
  13. #define LOAD(src) _mm_load_si128((u128 *)(src))
  14. #define STORE(dest,src) _mm_storeu_si128((u128 *)(dest),src)
  15. #define XOR128(a, b) _mm_xor_si128(a, b)
  16. #define AES2(s0, s1, rci) \
  17. (s0) = _mm_aesenc_si128(s0, *(rci)); \
  18. (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \
  19. (s0) = _mm_aesenc_si128(s0, *((rci) + 2)); \
  20. (s1) = _mm_aesenc_si128(s1, *((rci) + 3));
  21. #define AES2_4x(s0, s1, s2, s3, rci) \
  22. AES2((s0)[0], (s0)[1], rci); \
  23. AES2((s1)[0], (s1)[1], rci); \
  24. AES2((s2)[0], (s2)[1], rci); \
  25. AES2((s3)[0], (s3)[1], rci);
  26. #define AES4(s0, s1, s2, s3, rci) \
  27. (s0) = _mm_aesenc_si128(s0, *(rci)); \
  28. (s1) = _mm_aesenc_si128(s1, *((rci) + 1)); \
  29. (s2) = _mm_aesenc_si128(s2, *((rci) + 2)); \
  30. (s3) = _mm_aesenc_si128(s3, *((rci) + 3)); \
  31. (s0) = _mm_aesenc_si128(s0, *((rci) + 4)); \
  32. (s1) = _mm_aesenc_si128(s1, *((rci) + 5)); \
  33. (s2) = _mm_aesenc_si128(s2, *((rci) + 6)); \
  34. (s3) = _mm_aesenc_si128(s3, *((rci) + 7));
  35. #define AES4_4x(s0, s1, s2, s3, rci) \
  36. AES4((s0)[0], (s0)[1], (s0)[2], (s0)[3], rci); \
  37. AES4((s1)[0], (s1)[1], (s1)[2], (s1)[3], rci); \
  38. AES4((s2)[0], (s2)[1], (s2)[2], (s2)[3], rci); \
  39. AES4((s3)[0], (s3)[1], (s3)[2], (s3)[3], rci);
  40. #define MIX2(s0, s1) \
  41. tmp = _mm_unpacklo_epi32(s0, s1); \
  42. (s1) = _mm_unpackhi_epi32(s0, s1); \
  43. (s0) = tmp;
  44. #define MIX4(s0, s1, s2, s3) \
  45. tmp = _mm_unpacklo_epi32(s0, s1); \
  46. (s0) = _mm_unpackhi_epi32(s0, s1); \
  47. (s1) = _mm_unpacklo_epi32(s2, s3); \
  48. (s2) = _mm_unpackhi_epi32(s2, s3); \
  49. (s3) = _mm_unpacklo_epi32(s0, s2); \
  50. (s0) = _mm_unpackhi_epi32(s0, s2); \
  51. (s2) = _mm_unpackhi_epi32(s1, tmp); \
  52. (s1) = _mm_unpacklo_epi32(s1, tmp);
  53. #define TRUNCSTORE(out, s0, s1, s2, s3) \
  54. _mm_storeu_si128((u128 *)(out), \
  55. _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s0), _mm_castsi128_pd(s1), 3))); \
  56. _mm_storeu_si128((u128 *)((out) + 16), \
  57. _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(s2), _mm_castsi128_pd(s3), 0)));
  58. static void load_haraka_constants(u128 rc[40]) {
  59. rc[ 0] = _mm_set_epi32((int)0x0684704c, (int)0xe620c00a, (int)0xb2c5fef0, (int)0x75817b9d);
  60. rc[ 1] = _mm_set_epi32((int)0x8b66b4e1, (int)0x88f3a06b, (int)0x640f6ba4, (int)0x2f08f717);
  61. rc[ 2] = _mm_set_epi32((int)0x3402de2d, (int)0x53f28498, (int)0xcf029d60, (int)0x9f029114);
  62. rc[ 3] = _mm_set_epi32((int)0x0ed6eae6, (int)0x2e7b4f08, (int)0xbbf3bcaf, (int)0xfd5b4f79);
  63. rc[ 4] = _mm_set_epi32((int)0xcbcfb0cb, (int)0x4872448b, (int)0x79eecd1c, (int)0xbe397044);
  64. rc[ 5] = _mm_set_epi32((int)0x7eeacdee, (int)0x6e9032b7, (int)0x8d5335ed, (int)0x2b8a057b);
  65. rc[ 6] = _mm_set_epi32((int)0x67c28f43, (int)0x5e2e7cd0, (int)0xe2412761, (int)0xda4fef1b);
  66. rc[ 7] = _mm_set_epi32((int)0x2924d9b0, (int)0xafcacc07, (int)0x675ffde2, (int)0x1fc70b3b);
  67. rc[ 8] = _mm_set_epi32((int)0xab4d63f1, (int)0xe6867fe9, (int)0xecdb8fca, (int)0xb9d465ee);
  68. rc[ 9] = _mm_set_epi32((int)0x1c30bf84, (int)0xd4b7cd64, (int)0x5b2a404f, (int)0xad037e33);
  69. rc[10] = _mm_set_epi32((int)0xb2cc0bb9, (int)0x941723bf, (int)0x69028b2e, (int)0x8df69800);
  70. rc[11] = _mm_set_epi32((int)0xfa0478a6, (int)0xde6f5572, (int)0x4aaa9ec8, (int)0x5c9d2d8a);
  71. rc[12] = _mm_set_epi32((int)0xdfb49f2b, (int)0x6b772a12, (int)0x0efa4f2e, (int)0x29129fd4);
  72. rc[13] = _mm_set_epi32((int)0x1ea10344, (int)0xf449a236, (int)0x32d611ae, (int)0xbb6a12ee);
  73. rc[14] = _mm_set_epi32((int)0xaf044988, (int)0x4b050084, (int)0x5f9600c9, (int)0x9ca8eca6);
  74. rc[15] = _mm_set_epi32((int)0x21025ed8, (int)0x9d199c4f, (int)0x78a2c7e3, (int)0x27e593ec);
  75. rc[16] = _mm_set_epi32((int)0xbf3aaaf8, (int)0xa759c9b7, (int)0xb9282ecd, (int)0x82d40173);
  76. rc[17] = _mm_set_epi32((int)0x6260700d, (int)0x6186b017, (int)0x37f2efd9, (int)0x10307d6b);
  77. rc[18] = _mm_set_epi32((int)0x5aca45c2, (int)0x21300443, (int)0x81c29153, (int)0xf6fc9ac6);
  78. rc[19] = _mm_set_epi32((int)0x9223973c, (int)0x226b68bb, (int)0x2caf92e8, (int)0x36d1943a);
  79. rc[20] = _mm_set_epi32((int)0xd3bf9238, (int)0x225886eb, (int)0x6cbab958, (int)0xe51071b4);
  80. rc[21] = _mm_set_epi32((int)0xdb863ce5, (int)0xaef0c677, (int)0x933dfddd, (int)0x24e1128d);
  81. rc[22] = _mm_set_epi32((int)0xbb606268, (int)0xffeba09c, (int)0x83e48de3, (int)0xcb2212b1);
  82. rc[23] = _mm_set_epi32((int)0x734bd3dc, (int)0xe2e4d19c, (int)0x2db91a4e, (int)0xc72bf77d);
  83. rc[24] = _mm_set_epi32((int)0x43bb47c3, (int)0x61301b43, (int)0x4b1415c4, (int)0x2cb3924e);
  84. rc[25] = _mm_set_epi32((int)0xdba775a8, (int)0xe707eff6, (int)0x03b231dd, (int)0x16eb6899);
  85. rc[26] = _mm_set_epi32((int)0x6df3614b, (int)0x3c755977, (int)0x8e5e2302, (int)0x7eca472c);
  86. rc[27] = _mm_set_epi32((int)0xcda75a17, (int)0xd6de7d77, (int)0x6d1be5b9, (int)0xb88617f9);
  87. rc[28] = _mm_set_epi32((int)0xec6b43f0, (int)0x6ba8e9aa, (int)0x9d6c069d, (int)0xa946ee5d);
  88. rc[29] = _mm_set_epi32((int)0xcb1e6950, (int)0xf957332b, (int)0xa2531159, (int)0x3bf327c1);
  89. rc[30] = _mm_set_epi32((int)0x2cee0c75, (int)0x00da619c, (int)0xe4ed0353, (int)0x600ed0d9);
  90. rc[31] = _mm_set_epi32((int)0xf0b1a5a1, (int)0x96e90cab, (int)0x80bbbabc, (int)0x63a4a350);
  91. rc[32] = _mm_set_epi32((int)0xae3db102, (int)0x5e962988, (int)0xab0dde30, (int)0x938dca39);
  92. rc[33] = _mm_set_epi32((int)0x17bb8f38, (int)0xd554a40b, (int)0x8814f3a8, (int)0x2e75b442);
  93. rc[34] = _mm_set_epi32((int)0x34bb8a5b, (int)0x5f427fd7, (int)0xaeb6b779, (int)0x360a16f6);
  94. rc[35] = _mm_set_epi32((int)0x26f65241, (int)0xcbe55438, (int)0x43ce5918, (int)0xffbaafde);
  95. rc[36] = _mm_set_epi32((int)0x4ce99a54, (int)0xb9f3026a, (int)0xa2ca9cf7, (int)0x839ec978);
  96. rc[37] = _mm_set_epi32((int)0xae51a51a, (int)0x1bdff7be, (int)0x40c06e28, (int)0x22901235);
  97. rc[38] = _mm_set_epi32((int)0xa0c1613c, (int)0xba7ed22b, (int)0xc173bc0f, (int)0x48a659cf);
  98. rc[39] = _mm_set_epi32((int)0x756acc03, (int)0x02288288, (int)0x4ad6bdfd, (int)0xe9c59da1);
  99. }
  100. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_tweak_constants(
  101. harakactx *state,
  102. const unsigned char *pk_seed, const unsigned char *sk_seed,
  103. unsigned long long seed_length) {
  104. int i;
  105. unsigned char buf[40 * 16];
  106. /* Use the standard constants to generate tweaked ones. */
  107. load_haraka_constants(state->rc);
  108. /* Constants for sk.seed */
  109. if (sk_seed != NULL) {
  110. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S(buf, 40 * 16, sk_seed, seed_length, state);
  111. /* Tweak constants with the pub_seed */
  112. for (i = 0; i < 40; i++) {
  113. state->rc_sseed[i] = LOAD(buf + i * 16);
  114. }
  115. }
  116. /* Constants for pk.seed */
  117. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S(buf, 40 * 16, pk_seed, seed_length, state);
  118. /* Tweak constants with the pub_seed */
  119. for (i = 0; i < 40; i++) {
  120. state->rc[i] = LOAD(buf + i * 16);
  121. }
  122. }
  123. static void haraka_S_absorb(unsigned char *s,
  124. const unsigned char *m, unsigned long long mlen,
  125. unsigned char p,
  126. const harakactx *state) {
  127. unsigned long long i;
  128. unsigned char t[HARAKAS_RATE];
  129. while (mlen >= HARAKAS_RATE) {
  130. // XOR block to state
  131. STORE(s, XOR128(LOAD(s), LOAD(m)));
  132. STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m + 16)));
  133. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm(s, s, state);
  134. mlen -= HARAKAS_RATE;
  135. m += HARAKAS_RATE;
  136. }
  137. for (i = 0; i < HARAKAS_RATE; ++i) {
  138. t[i] = 0;
  139. }
  140. for (i = 0; i < mlen; ++i) {
  141. t[i] = m[i];
  142. }
  143. t[i] = p;
  144. t[HARAKAS_RATE - 1] |= 128;
  145. STORE(s, XOR128(LOAD(s), LOAD(t)));
  146. STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t + 16)));
  147. }
  148. static void haraka_S_absorb4x(unsigned char *s,
  149. const unsigned char *m0,
  150. const unsigned char *m1,
  151. const unsigned char *m2,
  152. const unsigned char *m3,
  153. unsigned long long int mlen,
  154. unsigned char p,
  155. const harakactx *state) {
  156. unsigned long long i;
  157. unsigned char t0[HARAKAS_RATE];
  158. unsigned char t1[HARAKAS_RATE];
  159. unsigned char t2[HARAKAS_RATE];
  160. unsigned char t3[HARAKAS_RATE];
  161. while (mlen >= HARAKAS_RATE) {
  162. // XOR block to state
  163. STORE(s, XOR128(LOAD(s), LOAD(m0)));
  164. STORE(s + 16, XOR128(LOAD(s + 16), LOAD(m0 + 16)));
  165. STORE(s + 64, XOR128(LOAD(s + 64), LOAD(m1)));
  166. STORE(s + 80, XOR128(LOAD(s + 80), LOAD(m1 + 16)));
  167. STORE(s + 128, XOR128(LOAD(s + 128), LOAD(m2)));
  168. STORE(s + 144, XOR128(LOAD(s + 144), LOAD(m2 + 16)));
  169. STORE(s + 192, XOR128(LOAD(s + 192), LOAD(m3)));
  170. STORE(s + 208, XOR128(LOAD(s + 208), LOAD(m3 + 16)));
  171. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm_x4(s, s, state);
  172. mlen -= HARAKAS_RATE;
  173. m0 += HARAKAS_RATE;
  174. m1 += HARAKAS_RATE;
  175. m2 += HARAKAS_RATE;
  176. m3 += HARAKAS_RATE;
  177. }
  178. for (i = 0; i < HARAKAS_RATE; ++i) {
  179. t0[i] = 0;
  180. t1[i] = 0;
  181. t2[i] = 0;
  182. t3[i] = 0;
  183. }
  184. for (i = 0; i < mlen; ++i) {
  185. t0[i] = m0[i];
  186. t1[i] = m1[i];
  187. t2[i] = m2[i];
  188. t3[i] = m3[i];
  189. }
  190. t0[i] = p;
  191. t1[i] = p;
  192. t2[i] = p;
  193. t3[i] = p;
  194. t0[HARAKAS_RATE - 1] |= 128;
  195. t1[HARAKAS_RATE - 1] |= 128;
  196. t2[HARAKAS_RATE - 1] |= 128;
  197. t3[HARAKAS_RATE - 1] |= 128;
  198. STORE(s, XOR128(LOAD(s), LOAD(t0)));
  199. STORE(s + 16, XOR128(LOAD(s + 16), LOAD(t0 + 16)));
  200. STORE(s + 64, XOR128(LOAD(s + 64), LOAD(t1)));
  201. STORE(s + 80, XOR128(LOAD(s + 80), LOAD(t1 + 16)));
  202. STORE(s + 128, XOR128(LOAD(s + 128), LOAD(t2)));
  203. STORE(s + 144, XOR128(LOAD(s + 144), LOAD(t2 + 16)));
  204. STORE(s + 192, XOR128(LOAD(s + 192), LOAD(t3)));
  205. STORE(s + 208, XOR128(LOAD(s + 208), LOAD(t3 + 16)));
  206. }
  207. static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks,
  208. unsigned char *s, unsigned int r, const harakactx *state) {
  209. while (nblocks > 0) {
  210. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm(s, s, state);
  211. STORE(h, LOAD(s));
  212. STORE(h + 16, LOAD(s + 16));
  213. h += r;
  214. nblocks--;
  215. }
  216. }
  217. static void haraka_S_squeezeblocks4x(unsigned char *h0,
  218. unsigned char *h1,
  219. unsigned char *h2,
  220. unsigned char *h3,
  221. unsigned long long nblocks,
  222. unsigned char *s,
  223. unsigned int r,
  224. const harakactx *state) {
  225. while (nblocks > 0) {
  226. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm_x4(s, s, state);
  227. STORE(h0, LOAD(s));
  228. STORE(h0 + 16, LOAD(s + 16));
  229. STORE(h1, LOAD(s + 64));
  230. STORE(h1 + 16, LOAD(s + 80));
  231. STORE(h2, LOAD(s + 128));
  232. STORE(h2 + 16, LOAD(s + 144));
  233. STORE(h3, LOAD(s + 192));
  234. STORE(h3 + 16, LOAD(s + 208));
  235. h0 += r;
  236. h1 += r;
  237. h2 += r;
  238. h3 += r;
  239. nblocks--;
  240. }
  241. }
  242. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S_inc_init(uint8_t *s_inc) {
  243. size_t i;
  244. for (i = 0; i < 64; i++) {
  245. s_inc[i] = 0;
  246. }
  247. s_inc[64] = 0;
  248. }
  249. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S_inc_absorb(uint8_t *s_inc, const uint8_t *m, size_t mlen, const harakactx *state) {
  250. size_t i;
  251. /* Recall that s_inc[64] is the non-absorbed bytes xored into the state */
  252. while (mlen + s_inc[64] >= HARAKAS_RATE) {
  253. for (i = 0; i < (size_t)(HARAKAS_RATE - s_inc[64]); i++) {
  254. /* Take the i'th byte from message
  255. xor with the s_inc[64] + i'th byte of the state */
  256. s_inc[s_inc[64] + i] ^= m[i];
  257. }
  258. mlen -= (size_t)(HARAKAS_RATE - s_inc[64]);
  259. m += HARAKAS_RATE - s_inc[64];
  260. s_inc[64] = 0;
  261. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm(s_inc, s_inc, state);
  262. }
  263. for (i = 0; i < mlen; i++) {
  264. s_inc[s_inc[64] + i] ^= m[i];
  265. }
  266. s_inc[64] = (uint8_t)(s_inc[64] + mlen);
  267. }
  268. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S_inc_finalize(uint8_t *s_inc) {
  269. /* After haraka_S_inc_absorb, we are guaranteed that s_inc[64] < HARAKAS_RATE,
  270. so we can always use one more byte for p in the current state. */
  271. s_inc[s_inc[64]] ^= 0x1F;
  272. s_inc[HARAKAS_RATE - 1] ^= 128;
  273. s_inc[64] = 0;
  274. }
  275. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S_inc_squeeze(uint8_t *out, size_t outlen, uint8_t *s_inc, const harakactx *state) {
  276. size_t i;
  277. /* First consume any bytes we still have sitting around */
  278. for (i = 0; i < outlen && i < s_inc[64]; i++) {
  279. /* There are s_inc[64] bytes left, so r - s_inc[64] is the first
  280. available byte. We consume from there, i.e., up to r. */
  281. out[i] = (uint8_t)s_inc[(HARAKAS_RATE - s_inc[64] + (uint8_t)i)];
  282. }
  283. out += i;
  284. outlen -= i;
  285. s_inc[64] = (uint8_t)(s_inc[64] - i);
  286. /* Then squeeze the remaining necessary blocks */
  287. while (outlen > 0) {
  288. PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm(s_inc, s_inc, state);
  289. for (i = 0; i < outlen && i < HARAKAS_RATE; i++) {
  290. out[i] = s_inc[i];
  291. }
  292. out += i;
  293. outlen -= i;
  294. s_inc[64] = (uint8_t)(HARAKAS_RATE - i);
  295. }
  296. }
  297. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_S(unsigned char *out, unsigned long long outlen,
  298. const unsigned char *in, unsigned long long inlen, const harakactx *state) {
  299. unsigned long long i;
  300. unsigned char s[64];
  301. unsigned char d[32];
  302. for (i = 0; i < 64; i++) {
  303. s[i] = 0;
  304. }
  305. haraka_S_absorb(s, in, inlen, 0x1F, state);
  306. haraka_S_squeezeblocks(out, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state);
  307. out += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
  308. if (outlen % HARAKAS_RATE) {
  309. haraka_S_squeezeblocks(d, 1, s, HARAKAS_RATE, state);
  310. for (i = 0; i < outlen % HARAKAS_RATE; i++) {
  311. out[i] = d[i];
  312. }
  313. }
  314. }
  315. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka_Sx4(unsigned char *out0,
  316. unsigned char *out1,
  317. unsigned char *out2,
  318. unsigned char *out3,
  319. unsigned long long outlen,
  320. const unsigned char *in0,
  321. const unsigned char *in1,
  322. const unsigned char *in2,
  323. const unsigned char *in3,
  324. unsigned long long inlen,
  325. const harakactx *state) {
  326. unsigned long long i;
  327. unsigned char s[64 * 4];
  328. unsigned char d0[32];
  329. unsigned char d1[32];
  330. unsigned char d2[32];
  331. unsigned char d3[32];
  332. for (i = 0; i < 64 * 4; i++) {
  333. s[i] = 0;
  334. }
  335. haraka_S_absorb4x(s, in0, in1, in2, in3, inlen, 0x1F, state);
  336. haraka_S_squeezeblocks4x(out0, out1, out2, out3, outlen / HARAKAS_RATE, s, HARAKAS_RATE, state);
  337. out0 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
  338. out1 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
  339. out2 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
  340. out3 += (outlen / HARAKAS_RATE) * HARAKAS_RATE;
  341. if (outlen % HARAKAS_RATE) {
  342. haraka_S_squeezeblocks4x(d0, d1, d2, d3, 1, s, HARAKAS_RATE, state);
  343. for (i = 0; i < outlen % HARAKAS_RATE; i++) {
  344. out0[i] = d0[i];
  345. out1[i] = d1[i];
  346. out2[i] = d2[i];
  347. out3[i] = d3[i];
  348. }
  349. }
  350. }
  351. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm(unsigned char *out, const unsigned char *in, const harakactx *state) {
  352. u128 s[4], tmp;
  353. s[0] = LOAD(in);
  354. s[1] = LOAD(in + 16);
  355. s[2] = LOAD(in + 32);
  356. s[3] = LOAD(in + 48);
  357. AES4(s[0], s[1], s[2], s[3], state->rc);
  358. MIX4(s[0], s[1], s[2], s[3]);
  359. AES4(s[0], s[1], s[2], s[3], state->rc + 8);
  360. MIX4(s[0], s[1], s[2], s[3]);
  361. AES4(s[0], s[1], s[2], s[3], state->rc + 16);
  362. MIX4(s[0], s[1], s[2], s[3]);
  363. AES4(s[0], s[1], s[2], s[3], state->rc + 24);
  364. MIX4(s[0], s[1], s[2], s[3]);
  365. AES4(s[0], s[1], s[2], s[3], state->rc + 32);
  366. MIX4(s[0], s[1], s[2], s[3]);
  367. STORE(out, s[0]);
  368. STORE(out + 16, s[1]);
  369. STORE(out + 32, s[2]);
  370. STORE(out + 48, s[3]);
  371. }
  372. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512_perm_x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
  373. u128 s[4][4], tmp;
  374. s[0][0] = LOAD(in);
  375. s[0][1] = LOAD(in + 16);
  376. s[0][2] = LOAD(in + 32);
  377. s[0][3] = LOAD(in + 48);
  378. s[1][0] = LOAD(in + 64);
  379. s[1][1] = LOAD(in + 80);
  380. s[1][2] = LOAD(in + 96);
  381. s[1][3] = LOAD(in + 112);
  382. s[2][0] = LOAD(in + 128);
  383. s[2][1] = LOAD(in + 144);
  384. s[2][2] = LOAD(in + 160);
  385. s[2][3] = LOAD(in + 176);
  386. s[3][0] = LOAD(in + 192);
  387. s[3][1] = LOAD(in + 208);
  388. s[3][2] = LOAD(in + 224);
  389. s[3][3] = LOAD(in + 240);
  390. AES4_4x(s[0], s[1], s[2], s[3], state->rc);
  391. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  392. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  393. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  394. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  395. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8);
  396. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  397. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  398. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  399. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  400. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16);
  401. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  402. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  403. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  404. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  405. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24);
  406. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  407. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  408. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  409. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  410. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32);
  411. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  412. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  413. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  414. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  415. STORE(out, s[0][0]);
  416. STORE(out + 16, s[0][1]);
  417. STORE(out + 32, s[0][2]);
  418. STORE(out + 48, s[0][3]);
  419. STORE(out + 64, s[1][0]);
  420. STORE(out + 80, s[1][1]);
  421. STORE(out + 96, s[1][2]);
  422. STORE(out + 112, s[1][3]);
  423. STORE(out + 128, s[2][0]);
  424. STORE(out + 144, s[2][1]);
  425. STORE(out + 160, s[2][2]);
  426. STORE(out + 176, s[2][3]);
  427. STORE(out + 192, s[3][0]);
  428. STORE(out + 208, s[3][1]);
  429. STORE(out + 224, s[3][2]);
  430. STORE(out + 240, s[3][3]);
  431. }
  432. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512(unsigned char *out, const unsigned char *in, const harakactx *state) {
  433. u128 s[4], tmp;
  434. s[0] = LOAD(in);
  435. s[1] = LOAD(in + 16);
  436. s[2] = LOAD(in + 32);
  437. s[3] = LOAD(in + 48);
  438. AES4(s[0], s[1], s[2], s[3], state->rc);
  439. MIX4(s[0], s[1], s[2], s[3]);
  440. AES4(s[0], s[1], s[2], s[3], state->rc + 8);
  441. MIX4(s[0], s[1], s[2], s[3]);
  442. AES4(s[0], s[1], s[2], s[3], state->rc + 16);
  443. MIX4(s[0], s[1], s[2], s[3]);
  444. AES4(s[0], s[1], s[2], s[3], state->rc + 24);
  445. MIX4(s[0], s[1], s[2], s[3]);
  446. AES4(s[0], s[1], s[2], s[3], state->rc + 32);
  447. MIX4(s[0], s[1], s[2], s[3]);
  448. s[0] = XOR128(s[0], LOAD(in));
  449. s[1] = XOR128(s[1], LOAD(in + 16));
  450. s[2] = XOR128(s[2], LOAD(in + 32));
  451. s[3] = XOR128(s[3], LOAD(in + 48));
  452. // truncate and store result
  453. TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
  454. }
  455. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka512x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
  456. u128 s[4][4], tmp;
  457. s[0][0] = LOAD(in);
  458. s[0][1] = LOAD(in + 16);
  459. s[0][2] = LOAD(in + 32);
  460. s[0][3] = LOAD(in + 48);
  461. s[1][0] = LOAD(in + 64);
  462. s[1][1] = LOAD(in + 80);
  463. s[1][2] = LOAD(in + 96);
  464. s[1][3] = LOAD(in + 112);
  465. s[2][0] = LOAD(in + 128);
  466. s[2][1] = LOAD(in + 144);
  467. s[2][2] = LOAD(in + 160);
  468. s[2][3] = LOAD(in + 176);
  469. s[3][0] = LOAD(in + 192);
  470. s[3][1] = LOAD(in + 208);
  471. s[3][2] = LOAD(in + 224);
  472. s[3][3] = LOAD(in + 240);
  473. AES4_4x(s[0], s[1], s[2], s[3], state->rc);
  474. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  475. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  476. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  477. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  478. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 8);
  479. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  480. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  481. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  482. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  483. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 16);
  484. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  485. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  486. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  487. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  488. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 24);
  489. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  490. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  491. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  492. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  493. AES4_4x(s[0], s[1], s[2], s[3], state->rc + 32);
  494. MIX4(s[0][0], s[0][1], s[0][2], s[0][3]);
  495. MIX4(s[1][0], s[1][1], s[1][2], s[1][3]);
  496. MIX4(s[2][0], s[2][1], s[2][2], s[2][3]);
  497. MIX4(s[3][0], s[3][1], s[3][2], s[3][3]);
  498. s[0][0] = XOR128(s[0][0], LOAD(in));
  499. s[0][1] = XOR128(s[0][1], LOAD(in + 16));
  500. s[0][2] = XOR128(s[0][2], LOAD(in + 32));
  501. s[0][3] = XOR128(s[0][3], LOAD(in + 48));
  502. s[1][0] = XOR128(s[1][0], LOAD(in + 64));
  503. s[1][1] = XOR128(s[1][1], LOAD(in + 80));
  504. s[1][2] = XOR128(s[1][2], LOAD(in + 96));
  505. s[1][3] = XOR128(s[1][3], LOAD(in + 112));
  506. s[2][0] = XOR128(s[2][0], LOAD(in + 128));
  507. s[2][1] = XOR128(s[2][1], LOAD(in + 144));
  508. s[2][2] = XOR128(s[2][2], LOAD(in + 160));
  509. s[2][3] = XOR128(s[2][3], LOAD(in + 176));
  510. s[3][0] = XOR128(s[3][0], LOAD(in + 192));
  511. s[3][1] = XOR128(s[3][1], LOAD(in + 208));
  512. s[3][2] = XOR128(s[3][2], LOAD(in + 224));
  513. s[3][3] = XOR128(s[3][3], LOAD(in + 240));
  514. TRUNCSTORE(out, s[0][0], s[0][1], s[0][2], s[0][3]);
  515. TRUNCSTORE((out + 32), s[1][0], s[1][1], s[1][2], s[1][3]);
  516. TRUNCSTORE((out + 64), s[2][0], s[2][1], s[2][2], s[2][3]);
  517. TRUNCSTORE((out + 96), s[3][0], s[3][1], s[3][2], s[3][3]);
  518. }
  519. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka256(unsigned char *out, const unsigned char *in, const harakactx *state) {
  520. u128 s[2], tmp;
  521. s[0] = LOAD(in);
  522. s[1] = LOAD(in + 16);
  523. AES2(s[0], s[1], state->rc);
  524. MIX2(s[0], s[1]);
  525. AES2(s[0], s[1], state->rc + 4);
  526. MIX2(s[0], s[1]);
  527. AES2(s[0], s[1], state->rc + 8);
  528. MIX2(s[0], s[1]);
  529. AES2(s[0], s[1], state->rc + 12);
  530. MIX2(s[0], s[1]);
  531. AES2(s[0], s[1], state->rc + 16);
  532. MIX2(s[0], s[1]);
  533. s[0] = XOR128(s[0], LOAD(in));
  534. s[1] = XOR128(s[1], LOAD(in + 16));
  535. STORE(out, s[0]);
  536. STORE(out + 16, s[1]);
  537. }
  538. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka256x4(unsigned char *out, const unsigned char *in, const harakactx *state) {
  539. u128 s[4][2], tmp;
  540. s[0][0] = LOAD(in);
  541. s[0][1] = LOAD(in + 16);
  542. s[1][0] = LOAD(in + 32);
  543. s[1][1] = LOAD(in + 48);
  544. s[2][0] = LOAD(in + 64);
  545. s[2][1] = LOAD(in + 80);
  546. s[3][0] = LOAD(in + 96);
  547. s[3][1] = LOAD(in + 112);
  548. // Round 1
  549. AES2_4x(s[0], s[1], s[2], s[3], state->rc);
  550. MIX2(s[0][0], s[0][1]);
  551. MIX2(s[1][0], s[1][1]);
  552. MIX2(s[2][0], s[2][1]);
  553. MIX2(s[3][0], s[3][1]);
  554. // Round 2
  555. AES2_4x(s[0], s[1], s[2], s[3], state->rc + 4);
  556. MIX2(s[0][0], s[0][1]);
  557. MIX2(s[1][0], s[1][1]);
  558. MIX2(s[2][0], s[2][1]);
  559. MIX2(s[3][0], s[3][1]);
  560. // Round 3
  561. AES2_4x(s[0], s[1], s[2], s[3], state->rc + 8);
  562. MIX2(s[0][0], s[0][1]);
  563. MIX2(s[1][0], s[1][1]);
  564. MIX2(s[2][0], s[2][1]);
  565. MIX2(s[3][0], s[3][1]);
  566. // Round 4
  567. AES2_4x(s[0], s[1], s[2], s[3], state->rc + 12);
  568. MIX2(s[0][0], s[0][1]);
  569. MIX2(s[1][0], s[1][1]);
  570. MIX2(s[2][0], s[2][1]);
  571. MIX2(s[3][0], s[3][1]);
  572. // Round 5
  573. AES2_4x(s[0], s[1], s[2], s[3], state->rc + 16);
  574. MIX2(s[0][0], s[0][1]);
  575. MIX2(s[1][0], s[1][1]);
  576. MIX2(s[2][0], s[2][1]);
  577. MIX2(s[3][0], s[3][1]);
  578. // Feed Forward
  579. s[0][0] = _mm_xor_si128(s[0][0], LOAD(in));
  580. s[0][1] = _mm_xor_si128(s[0][1], LOAD(in + 16));
  581. s[1][0] = _mm_xor_si128(s[1][0], LOAD(in + 32));
  582. s[1][1] = _mm_xor_si128(s[1][1], LOAD(in + 48));
  583. s[2][0] = _mm_xor_si128(s[2][0], LOAD(in + 64));
  584. s[2][1] = _mm_xor_si128(s[2][1], LOAD(in + 80));
  585. s[3][0] = _mm_xor_si128(s[3][0], LOAD(in + 96));
  586. s[3][1] = _mm_xor_si128(s[3][1], LOAD(in + 112));
  587. STORE(out, s[0][0]);
  588. STORE(out + 16, s[0][1]);
  589. STORE(out + 32, s[1][0]);
  590. STORE(out + 48, s[1][1]);
  591. STORE(out + 64, s[2][0]);
  592. STORE(out + 80, s[2][1]);
  593. STORE(out + 96, s[3][0]);
  594. STORE(out + 112, s[3][1]);
  595. }
  596. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka256_sk(unsigned char *out, const unsigned char *in, const harakactx *state) {
  597. u128 s[2], tmp;
  598. s[0] = LOAD(in);
  599. s[1] = LOAD(in + 16);
  600. AES2(s[0], s[1], state->rc_sseed);
  601. MIX2(s[0], s[1]);
  602. AES2(s[0], s[1], state->rc_sseed + 4);
  603. MIX2(s[0], s[1]);
  604. AES2(s[0], s[1], state->rc_sseed + 8);
  605. MIX2(s[0], s[1]);
  606. AES2(s[0], s[1], state->rc_sseed + 12);
  607. MIX2(s[0], s[1]);
  608. AES2(s[0], s[1], state->rc_sseed + 16);
  609. MIX2(s[0], s[1]);
  610. s[0] = XOR128(s[0], LOAD(in));
  611. s[1] = XOR128(s[1], LOAD(in + 16));
  612. STORE(out, s[0]);
  613. STORE(out + 16, s[1]);
  614. }
  615. void PQCLEAN_SPHINCSHARAKA192FROBUST_AESNI_haraka256_skx4(unsigned char *out, const unsigned char *in, const harakactx *state) {
  616. u128 s[4][2], tmp;
  617. s[0][0] = LOAD(in);
  618. s[0][1] = LOAD(in + 16);
  619. s[1][0] = LOAD(in + 32);
  620. s[1][1] = LOAD(in + 48);
  621. s[2][0] = LOAD(in + 64);
  622. s[2][1] = LOAD(in + 80);
  623. s[3][0] = LOAD(in + 96);
  624. s[3][1] = LOAD(in + 112);
  625. // Round 1
  626. AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed);
  627. MIX2(s[0][0], s[0][1]);
  628. MIX2(s[1][0], s[1][1]);
  629. MIX2(s[2][0], s[2][1]);
  630. MIX2(s[3][0], s[3][1]);
  631. // Round 2
  632. AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 4);
  633. MIX2(s[0][0], s[0][1]);
  634. MIX2(s[1][0], s[1][1]);
  635. MIX2(s[2][0], s[2][1]);
  636. MIX2(s[3][0], s[3][1]);
  637. // Round 3
  638. AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 8);
  639. MIX2(s[0][0], s[0][1]);
  640. MIX2(s[1][0], s[1][1]);
  641. MIX2(s[2][0], s[2][1]);
  642. MIX2(s[3][0], s[3][1]);
  643. // Round 4
  644. AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 12);
  645. MIX2(s[0][0], s[0][1]);
  646. MIX2(s[1][0], s[1][1]);
  647. MIX2(s[2][0], s[2][1]);
  648. MIX2(s[3][0], s[3][1]);
  649. // Round 5
  650. AES2_4x(s[0], s[1], s[2], s[3], state->rc_sseed + 16);
  651. MIX2(s[0][0], s[0][1]);
  652. MIX2(s[1][0], s[1][1]);
  653. MIX2(s[2][0], s[2][1]);
  654. MIX2(s[3][0], s[3][1]);
  655. // Feed Forward
  656. s[0][0] = XOR128(s[0][0], LOAD(in));
  657. s[0][1] = XOR128(s[0][1], LOAD(in + 16));
  658. s[1][0] = XOR128(s[1][0], LOAD(in + 32));
  659. s[1][1] = XOR128(s[1][1], LOAD(in + 48));
  660. s[2][0] = XOR128(s[2][0], LOAD(in + 64));
  661. s[2][1] = XOR128(s[2][1], LOAD(in + 80));
  662. s[3][0] = XOR128(s[3][0], LOAD(in + 96));
  663. s[3][1] = XOR128(s[3][1], LOAD(in + 112));
  664. STORE(out, s[0][0]);
  665. STORE(out + 16, s[0][1]);
  666. STORE(out + 32, s[1][0]);
  667. STORE(out + 48, s[1][1]);
  668. STORE(out + 64, s[2][0]);
  669. STORE(out + 80, s[2][1]);
  670. STORE(out + 96, s[3][0]);
  671. STORE(out + 112, s[3][1]);
  672. }