32c613e8ec
* Update Dilithium * Alternative montgomery reduce to avoid i386 functest errors * Explicit casts for msvc * More casts; bump upstream version; fix metadata * another cast
409 lines
14 KiB
C
409 lines
14 KiB
C
#include "params.h"
|
|
#include "rejsample.h"
|
|
#include "symmetric.h"
|
|
#include <immintrin.h>
|
|
#include <stdint.h>
|
|
|
|
const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8] = {
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 0, 0, 0, 0, 0, 0},
|
|
{ 2, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 2, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 0, 0, 0, 0, 0},
|
|
{ 3, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 3, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 0, 0, 0, 0, 0},
|
|
{ 2, 3, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 0, 0, 0, 0, 0},
|
|
{ 1, 2, 3, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 0, 0, 0, 0},
|
|
{ 4, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 4, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 0, 0, 0, 0, 0},
|
|
{ 2, 4, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 0, 0, 0, 0, 0},
|
|
{ 1, 2, 4, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 0, 0, 0, 0},
|
|
{ 3, 4, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 0, 0, 0, 0, 0},
|
|
{ 1, 3, 4, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 0, 0, 0, 0},
|
|
{ 2, 3, 4, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 0, 0, 0, 0},
|
|
{ 1, 2, 3, 4, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 0, 0, 0},
|
|
{ 5, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 5, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 5, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 5, 0, 0, 0, 0, 0},
|
|
{ 2, 5, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 5, 0, 0, 0, 0, 0},
|
|
{ 1, 2, 5, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 5, 0, 0, 0, 0},
|
|
{ 3, 5, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 5, 0, 0, 0, 0, 0},
|
|
{ 1, 3, 5, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 5, 0, 0, 0, 0},
|
|
{ 2, 3, 5, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 5, 0, 0, 0, 0},
|
|
{ 1, 2, 3, 5, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 5, 0, 0, 0},
|
|
{ 4, 5, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 5, 0, 0, 0, 0, 0},
|
|
{ 1, 4, 5, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 5, 0, 0, 0, 0},
|
|
{ 2, 4, 5, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 5, 0, 0, 0, 0},
|
|
{ 1, 2, 4, 5, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 5, 0, 0, 0},
|
|
{ 3, 4, 5, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 5, 0, 0, 0, 0},
|
|
{ 1, 3, 4, 5, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 5, 0, 0, 0},
|
|
{ 2, 3, 4, 5, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 5, 0, 0, 0},
|
|
{ 1, 2, 3, 4, 5, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 5, 0, 0},
|
|
{ 6, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 6, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 6, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 6, 0, 0, 0, 0, 0},
|
|
{ 2, 6, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 6, 0, 0, 0, 0, 0},
|
|
{ 1, 2, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 6, 0, 0, 0, 0},
|
|
{ 3, 6, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 6, 0, 0, 0, 0, 0},
|
|
{ 1, 3, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 6, 0, 0, 0, 0},
|
|
{ 2, 3, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 6, 0, 0, 0, 0},
|
|
{ 1, 2, 3, 6, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 6, 0, 0, 0},
|
|
{ 4, 6, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 6, 0, 0, 0, 0, 0},
|
|
{ 1, 4, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 6, 0, 0, 0, 0},
|
|
{ 2, 4, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 6, 0, 0, 0, 0},
|
|
{ 1, 2, 4, 6, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 6, 0, 0, 0},
|
|
{ 3, 4, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 6, 0, 0, 0, 0},
|
|
{ 1, 3, 4, 6, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 6, 0, 0, 0},
|
|
{ 2, 3, 4, 6, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 6, 0, 0, 0},
|
|
{ 1, 2, 3, 4, 6, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 6, 0, 0},
|
|
{ 5, 6, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 5, 6, 0, 0, 0, 0, 0},
|
|
{ 1, 5, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 5, 6, 0, 0, 0, 0},
|
|
{ 2, 5, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 5, 6, 0, 0, 0, 0},
|
|
{ 1, 2, 5, 6, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 5, 6, 0, 0, 0},
|
|
{ 3, 5, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 5, 6, 0, 0, 0, 0},
|
|
{ 1, 3, 5, 6, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 5, 6, 0, 0, 0},
|
|
{ 2, 3, 5, 6, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 5, 6, 0, 0, 0},
|
|
{ 1, 2, 3, 5, 6, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 5, 6, 0, 0},
|
|
{ 4, 5, 6, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 5, 6, 0, 0, 0, 0},
|
|
{ 1, 4, 5, 6, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 5, 6, 0, 0, 0},
|
|
{ 2, 4, 5, 6, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 5, 6, 0, 0, 0},
|
|
{ 1, 2, 4, 5, 6, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 5, 6, 0, 0},
|
|
{ 3, 4, 5, 6, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 5, 6, 0, 0, 0},
|
|
{ 1, 3, 4, 5, 6, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 5, 6, 0, 0},
|
|
{ 2, 3, 4, 5, 6, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 5, 6, 0, 0},
|
|
{ 1, 2, 3, 4, 5, 6, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 0},
|
|
{ 7, 0, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 1, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 7, 0, 0, 0, 0, 0},
|
|
{ 2, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 7, 0, 0, 0, 0, 0},
|
|
{ 1, 2, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 7, 0, 0, 0, 0},
|
|
{ 3, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 7, 0, 0, 0, 0, 0},
|
|
{ 1, 3, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 7, 0, 0, 0, 0},
|
|
{ 2, 3, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 7, 0, 0, 0, 0},
|
|
{ 1, 2, 3, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 7, 0, 0, 0},
|
|
{ 4, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 7, 0, 0, 0, 0, 0},
|
|
{ 1, 4, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 7, 0, 0, 0, 0},
|
|
{ 2, 4, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 7, 0, 0, 0, 0},
|
|
{ 1, 2, 4, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 7, 0, 0, 0},
|
|
{ 3, 4, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 7, 0, 0, 0, 0},
|
|
{ 1, 3, 4, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 7, 0, 0, 0},
|
|
{ 2, 3, 4, 7, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 7, 0, 0, 0},
|
|
{ 1, 2, 3, 4, 7, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 7, 0, 0},
|
|
{ 5, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 5, 7, 0, 0, 0, 0, 0},
|
|
{ 1, 5, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 5, 7, 0, 0, 0, 0},
|
|
{ 2, 5, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 5, 7, 0, 0, 0, 0},
|
|
{ 1, 2, 5, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 5, 7, 0, 0, 0},
|
|
{ 3, 5, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 5, 7, 0, 0, 0, 0},
|
|
{ 1, 3, 5, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 5, 7, 0, 0, 0},
|
|
{ 2, 3, 5, 7, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 5, 7, 0, 0, 0},
|
|
{ 1, 2, 3, 5, 7, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 5, 7, 0, 0},
|
|
{ 4, 5, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 5, 7, 0, 0, 0, 0},
|
|
{ 1, 4, 5, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 5, 7, 0, 0, 0},
|
|
{ 2, 4, 5, 7, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 5, 7, 0, 0, 0},
|
|
{ 1, 2, 4, 5, 7, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 5, 7, 0, 0},
|
|
{ 3, 4, 5, 7, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 5, 7, 0, 0, 0},
|
|
{ 1, 3, 4, 5, 7, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 5, 7, 0, 0},
|
|
{ 2, 3, 4, 5, 7, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 5, 7, 0, 0},
|
|
{ 1, 2, 3, 4, 5, 7, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 5, 7, 0},
|
|
{ 6, 7, 0, 0, 0, 0, 0, 0},
|
|
{ 0, 6, 7, 0, 0, 0, 0, 0},
|
|
{ 1, 6, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 1, 6, 7, 0, 0, 0, 0},
|
|
{ 2, 6, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 2, 6, 7, 0, 0, 0, 0},
|
|
{ 1, 2, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 2, 6, 7, 0, 0, 0},
|
|
{ 3, 6, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 3, 6, 7, 0, 0, 0, 0},
|
|
{ 1, 3, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 3, 6, 7, 0, 0, 0},
|
|
{ 2, 3, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 2, 3, 6, 7, 0, 0, 0},
|
|
{ 1, 2, 3, 6, 7, 0, 0, 0},
|
|
{ 0, 1, 2, 3, 6, 7, 0, 0},
|
|
{ 4, 6, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 4, 6, 7, 0, 0, 0, 0},
|
|
{ 1, 4, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 4, 6, 7, 0, 0, 0},
|
|
{ 2, 4, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 2, 4, 6, 7, 0, 0, 0},
|
|
{ 1, 2, 4, 6, 7, 0, 0, 0},
|
|
{ 0, 1, 2, 4, 6, 7, 0, 0},
|
|
{ 3, 4, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 3, 4, 6, 7, 0, 0, 0},
|
|
{ 1, 3, 4, 6, 7, 0, 0, 0},
|
|
{ 0, 1, 3, 4, 6, 7, 0, 0},
|
|
{ 2, 3, 4, 6, 7, 0, 0, 0},
|
|
{ 0, 2, 3, 4, 6, 7, 0, 0},
|
|
{ 1, 2, 3, 4, 6, 7, 0, 0},
|
|
{ 0, 1, 2, 3, 4, 6, 7, 0},
|
|
{ 5, 6, 7, 0, 0, 0, 0, 0},
|
|
{ 0, 5, 6, 7, 0, 0, 0, 0},
|
|
{ 1, 5, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 1, 5, 6, 7, 0, 0, 0},
|
|
{ 2, 5, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 2, 5, 6, 7, 0, 0, 0},
|
|
{ 1, 2, 5, 6, 7, 0, 0, 0},
|
|
{ 0, 1, 2, 5, 6, 7, 0, 0},
|
|
{ 3, 5, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 3, 5, 6, 7, 0, 0, 0},
|
|
{ 1, 3, 5, 6, 7, 0, 0, 0},
|
|
{ 0, 1, 3, 5, 6, 7, 0, 0},
|
|
{ 2, 3, 5, 6, 7, 0, 0, 0},
|
|
{ 0, 2, 3, 5, 6, 7, 0, 0},
|
|
{ 1, 2, 3, 5, 6, 7, 0, 0},
|
|
{ 0, 1, 2, 3, 5, 6, 7, 0},
|
|
{ 4, 5, 6, 7, 0, 0, 0, 0},
|
|
{ 0, 4, 5, 6, 7, 0, 0, 0},
|
|
{ 1, 4, 5, 6, 7, 0, 0, 0},
|
|
{ 0, 1, 4, 5, 6, 7, 0, 0},
|
|
{ 2, 4, 5, 6, 7, 0, 0, 0},
|
|
{ 0, 2, 4, 5, 6, 7, 0, 0},
|
|
{ 1, 2, 4, 5, 6, 7, 0, 0},
|
|
{ 0, 1, 2, 4, 5, 6, 7, 0},
|
|
{ 3, 4, 5, 6, 7, 0, 0, 0},
|
|
{ 0, 3, 4, 5, 6, 7, 0, 0},
|
|
{ 1, 3, 4, 5, 6, 7, 0, 0},
|
|
{ 0, 1, 3, 4, 5, 6, 7, 0},
|
|
{ 2, 3, 4, 5, 6, 7, 0, 0},
|
|
{ 0, 2, 3, 4, 5, 6, 7, 0},
|
|
{ 1, 2, 3, 4, 5, 6, 7, 0},
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7}
|
|
};
|
|
|
|
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
|
|
unsigned int ctr, pos;
|
|
uint32_t good;
|
|
__m256i d, tmp;
|
|
const __m256i bound = _mm256_set1_epi32(Q);
|
|
const __m256i mask = _mm256_set1_epi32(0x7FFFFF);
|
|
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
|
|
-1, 9, 8, 7, -1, 6, 5, 4,
|
|
-1, 11, 10, 9, -1, 8, 7, 6,
|
|
-1, 5, 4, 3, -1, 2, 1, 0);
|
|
|
|
ctr = pos = 0;
|
|
while (pos <= REJ_UNIFORM_BUFLEN - 24) {
|
|
d = _mm256_loadu_si256((__m256i *)&buf[pos]);
|
|
d = _mm256_permute4x64_epi64(d, 0x94);
|
|
d = _mm256_shuffle_epi8(d, idx8);
|
|
d = _mm256_and_si256(d, mask);
|
|
pos += 24;
|
|
|
|
tmp = _mm256_sub_epi32(d, bound);
|
|
good = _mm256_movemask_ps((__m256)tmp);
|
|
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]));
|
|
d = _mm256_permutevar8x32_epi32(d, tmp);
|
|
|
|
_mm256_storeu_si256((__m256i *)&r[ctr], d);
|
|
ctr += _mm_popcnt_u32(good);
|
|
|
|
if (ctr > N - 8) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
uint32_t t;
|
|
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) {
|
|
t = buf[pos++];
|
|
t |= (uint32_t)buf[pos++] << 8;
|
|
t |= (uint32_t)buf[pos++] << 16;
|
|
t &= 0x7FFFFF;
|
|
|
|
if (t < Q) {
|
|
r[ctr++] = t;
|
|
}
|
|
}
|
|
|
|
return ctr;
|
|
}
|
|
|
|
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
|
|
unsigned int ctr, pos;
|
|
uint32_t good;
|
|
__m256i f0, f1, f2;
|
|
__m128i g0, g1;
|
|
const __m256i mask = _mm256_set1_epi8(15);
|
|
const __m256i eta = _mm256_set1_epi8(ETA);
|
|
const __m256i bound = mask;
|
|
const __m256i v = _mm256_set1_epi32(-6560);
|
|
const __m256i p = _mm256_set1_epi32(5);
|
|
|
|
ctr = pos = 0;
|
|
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
|
|
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
|
|
f1 = _mm256_slli_epi16(f0, 4);
|
|
f0 = _mm256_or_si256(f0, f1);
|
|
f0 = _mm256_and_si256(f0, mask);
|
|
|
|
f1 = _mm256_sub_epi8(f0, bound);
|
|
f0 = _mm256_sub_epi8(eta, f0);
|
|
good = _mm256_movemask_epi8(f1);
|
|
|
|
g0 = _mm256_castsi256_si128(f0);
|
|
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
|
|
g1 = _mm_shuffle_epi8(g0, g1);
|
|
f1 = _mm256_cvtepi8_epi32(g1);
|
|
f2 = _mm256_mulhrs_epi16(f1, v);
|
|
f2 = _mm256_mullo_epi16(f2, p);
|
|
f1 = _mm256_add_epi32(f1, f2);
|
|
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
|
|
ctr += _mm_popcnt_u32(good & 0xFF);
|
|
good >>= 8;
|
|
pos += 4;
|
|
|
|
if (ctr > N - 8) {
|
|
break;
|
|
}
|
|
g0 = _mm_bsrli_si128(g0, 8);
|
|
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
|
|
g1 = _mm_shuffle_epi8(g0, g1);
|
|
f1 = _mm256_cvtepi8_epi32(g1);
|
|
f2 = _mm256_mulhrs_epi16(f1, v);
|
|
f2 = _mm256_mullo_epi16(f2, p);
|
|
f1 = _mm256_add_epi32(f1, f2);
|
|
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
|
|
ctr += _mm_popcnt_u32(good & 0xFF);
|
|
good >>= 8;
|
|
pos += 4;
|
|
|
|
if (ctr > N - 8) {
|
|
break;
|
|
}
|
|
g0 = _mm256_extracti128_si256(f0, 1);
|
|
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]);
|
|
g1 = _mm_shuffle_epi8(g0, g1);
|
|
f1 = _mm256_cvtepi8_epi32(g1);
|
|
f2 = _mm256_mulhrs_epi16(f1, v);
|
|
f2 = _mm256_mullo_epi16(f2, p);
|
|
f1 = _mm256_add_epi32(f1, f2);
|
|
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
|
|
ctr += _mm_popcnt_u32(good & 0xFF);
|
|
good >>= 8;
|
|
pos += 4;
|
|
|
|
if (ctr > N - 8) {
|
|
break;
|
|
}
|
|
g0 = _mm_bsrli_si128(g0, 8);
|
|
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]);
|
|
g1 = _mm_shuffle_epi8(g0, g1);
|
|
f1 = _mm256_cvtepi8_epi32(g1);
|
|
f2 = _mm256_mulhrs_epi16(f1, v);
|
|
f2 = _mm256_mullo_epi16(f2, p);
|
|
f1 = _mm256_add_epi32(f1, f2);
|
|
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
|
|
ctr += _mm_popcnt_u32(good);
|
|
pos += 4;
|
|
}
|
|
|
|
uint32_t t0, t1;
|
|
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
|
|
t0 = buf[pos] & 0x0F;
|
|
t1 = buf[pos++] >> 4;
|
|
|
|
if (t0 < 15) {
|
|
t0 = t0 - (205 * t0 >> 10) * 5;
|
|
r[ctr++] = 2 - t0;
|
|
}
|
|
if (t1 < 15 && ctr < N) {
|
|
t1 = t1 - (205 * t1 >> 10) * 5;
|
|
r[ctr++] = 2 - t1;
|
|
}
|
|
}
|
|
|
|
return ctr;
|
|
}
|