#include "inner.h" #include /* * PRNG and interface to the system RNG. * * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2017-2019 Falcon Project * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * ===========================(LICENSE END)============================= * * @author Thomas Pornin */ /* * Include relevant system header files. For Win32, this will also need * linking with advapi32.dll, which we trigger with an appropriate #pragma. */ /* see inner.h */ int PQCLEAN_FALCON1024_AVX2_get_seed(void *seed, size_t len) { (void)seed; if (len == 0) { return 1; } return 0; } /* see inner.h */ void PQCLEAN_FALCON1024_AVX2_prng_init(prng *p, inner_shake256_context *src) { inner_shake256_extract(src, p->state.d, 56); PQCLEAN_FALCON1024_AVX2_prng_refill(p); } /* * PRNG based on ChaCha20. * * State consists in key (32 bytes) then IV (16 bytes) and block counter * (8 bytes). Normally, we should not care about local endianness (this * is for a PRNG), but for the NIST competition we need reproducible KAT * vectors that work across architectures, so we enforce little-endian * interpretation where applicable. Moreover, output words are "spread * out" over the output buffer with the interleaving pattern that is * naturally obtained from the AVX2 implementation that runs eight * ChaCha20 instances in parallel. * * The block counter is XORed into the first 8 bytes of the IV. */ void PQCLEAN_FALCON1024_AVX2_prng_refill(prng *p) { static const uint32_t CW[] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 }; uint64_t cc; size_t u; int i; uint32_t *sw; union { uint32_t w[16]; __m256i y[2]; /* for alignment */ } t; __m256i state[16], init[16]; sw = (uint32_t *)p->state.d; /* * XOR next counter values into state. */ cc = *(uint64_t *)(p->state.d + 48); for (u = 0; u < 8; u ++) { t.w[u] = (uint32_t)(cc + u); t.w[u + 8] = (uint32_t)((cc + u) >> 32); } *(uint64_t *)(p->state.d + 48) = cc + 8; /* * Load state. */ for (u = 0; u < 4; u ++) { state[u] = init[u] = _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)CW[u])); } for (u = 0; u < 10; u ++) { state[u + 4] = init[u + 4] = _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[u])); } state[14] = init[14] = _mm256_xor_si256( _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[10])), _mm256_loadu_si256((__m256i *)&t.w[0])); state[15] = init[15] = _mm256_xor_si256( _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int32_t)sw[11])), _mm256_loadu_si256((__m256i *)&t.w[8])); /* * Do all rounds. */ for (i = 0; i < 10; i ++) { #define QROUND(a, b, c, d) do { \ state[a] = _mm256_add_epi32(state[a], state[b]); \ state[d] = _mm256_xor_si256(state[d], state[a]); \ state[d] = _mm256_or_si256( \ _mm256_slli_epi32(state[d], 16), \ _mm256_srli_epi32(state[d], 16)); \ state[c] = _mm256_add_epi32(state[c], state[d]); \ state[b] = _mm256_xor_si256(state[b], state[c]); \ state[b] = _mm256_or_si256( \ _mm256_slli_epi32(state[b], 12), \ _mm256_srli_epi32(state[b], 20)); \ state[a] = _mm256_add_epi32(state[a], state[b]); \ state[d] = _mm256_xor_si256(state[d], state[a]); \ state[d] = _mm256_or_si256( \ _mm256_slli_epi32(state[d], 8), \ _mm256_srli_epi32(state[d], 24)); \ state[c] = _mm256_add_epi32(state[c], state[d]); \ state[b] = _mm256_xor_si256(state[b], state[c]); \ state[b] = _mm256_or_si256( \ _mm256_slli_epi32(state[b], 7), \ _mm256_srli_epi32(state[b], 25)); \ } while (0) QROUND( 0, 4, 8, 12); QROUND( 1, 5, 9, 13); QROUND( 2, 6, 10, 14); QROUND( 3, 7, 11, 15); QROUND( 0, 5, 10, 15); QROUND( 1, 6, 11, 12); QROUND( 2, 7, 8, 13); QROUND( 3, 4, 9, 14); #undef QROUND } /* * Add initial state back and encode the result in the destination * buffer. We can dump the AVX2 values "as is" because the non-AVX2 * code uses a compatible order of values. */ for (u = 0; u < 16; u ++) { _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5], _mm256_add_epi32(state[u], init[u])); } p->ptr = 0; } /* see inner.h */ void PQCLEAN_FALCON1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) { uint8_t *buf; buf = dst; while (len > 0) { size_t clen; clen = (sizeof p->buf.d) - p->ptr; if (clen > len) { clen = len; } memcpy(buf, p->buf.d, clen); buf += clen; len -= clen; p->ptr += clen; if (p->ptr == sizeof p->buf.d) { PQCLEAN_FALCON1024_AVX2_prng_refill(p); } } }