#include "fips202.h" #include "fips202x4.h" #include #include #include #include /* Use implementation from the Keccak Code Package */ #define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds extern void KeccakF1600_StatePermute4x(__m256i *s); static inline void store64(uint8_t x[8], uint64_t u) { unsigned int i = 0; for (i = 0; i < 8; i++) { x[i] = u >> 8 * i; } } static void keccakx4_absorb(__m256i s[25], unsigned int r, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen, uint8_t p) { size_t i = 0, pos = 0; __m256i t, idx; for (i = 0; i < 25; ++i) { s[i] = _mm256_setzero_si256(); } idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); while (inlen >= r) { for (i = 0; i < r / 8; ++i) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); s[i] = _mm256_xor_si256(s[i], t); pos += 8; } KeccakF1600_StatePermute4x(s); inlen -= r; } i = 0; while (inlen >= 8) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); s[i] = _mm256_xor_si256(s[i], t); i++; pos += 8; inlen -= 8; } if (inlen) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); t = _mm256_and_si256(t, idx); s[i] = _mm256_xor_si256(s[i], t); } t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); s[i] = _mm256_xor_si256(s[i], t); t = _mm256_set1_epi64x((long long)(1ULL << 63)); s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); } static void keccakx4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, unsigned int r, __m256i s[25]) { unsigned int i = 0; uint64_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); for (i = 0; i < r / 8; ++i) { f0 = _mm256_extract_epi64(s[i], 0); f1 = _mm256_extract_epi64(s[i], 1); f2 = _mm256_extract_epi64(s[i], 2); f3 = _mm256_extract_epi64(s[i], 3); store64(out0, f0); store64(out1, f1); store64(out2, f2); store64(out3, f3); out0 += 8; out1 += 8; out2 += 8; out3 += 8; } --nblocks; } } void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state) { keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); } void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state) { keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); } void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { unsigned int i = 0; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[4][SHAKE128_RATE]; keccakx4_state state; PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE128_RATE; out1 += nblocks * SHAKE128_RATE; out2 += nblocks * SHAKE128_RATE; out3 += nblocks * SHAKE128_RATE; outlen -= nblocks * SHAKE128_RATE; if (outlen) { PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); for (i = 0; i < outlen; ++i) { out0[i] = t[0][i]; out1[i] = t[1][i]; out2[i] = t[2][i]; out3[i] = t[3][i]; } } } void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { unsigned int i = 0; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[4][SHAKE256_RATE]; keccakx4_state state; PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE256_RATE; out1 += nblocks * SHAKE256_RATE; out2 += nblocks * SHAKE256_RATE; out3 += nblocks * SHAKE256_RATE; outlen -= nblocks * SHAKE256_RATE; if (outlen) { PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); for (i = 0; i < outlen; ++i) { out0[i] = t[0][i]; out1[i] = t[1][i]; out2[i] = t[2][i]; out3[i] = t[3][i]; } } }