1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-26 17:31:38 +00:00
pqcrypto/crypto_sign/dilithium3/avx2/rejsample.c
John Schanck 32c613e8ec Round 3 update for Dilithium (from github source) (#369)
* Update Dilithium

* Alternative montgomery reduce to avoid i386 functest errors

* Explicit casts for msvc

* More casts; bump upstream version; fix metadata

* another cast
2021-03-24 21:02:50 +00:00

393 lines
14 KiB
C

#include "params.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>
const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
unsigned int ctr, pos;
uint32_t good;
__m256i d, tmp;
const __m256i bound = _mm256_set1_epi32(Q);
const __m256i mask = _mm256_set1_epi32(0x7FFFFF);
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
-1, 9, 8, 7, -1, 6, 5, 4,
-1, 11, 10, 9, -1, 8, 7, 6,
-1, 5, 4, 3, -1, 2, 1, 0);
ctr = pos = 0;
while (pos <= REJ_UNIFORM_BUFLEN - 24) {
d = _mm256_loadu_si256((__m256i *)&buf[pos]);
d = _mm256_permute4x64_epi64(d, 0x94);
d = _mm256_shuffle_epi8(d, idx8);
d = _mm256_and_si256(d, mask);
pos += 24;
tmp = _mm256_sub_epi32(d, bound);
good = _mm256_movemask_ps((__m256)tmp);
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good]));
d = _mm256_permutevar8x32_epi32(d, tmp);
_mm256_storeu_si256((__m256i *)&r[ctr], d);
ctr += _mm_popcnt_u32(good);
if (ctr > N - 8) {
break;
}
}
uint32_t t;
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) {
t = buf[pos++];
t |= (uint32_t)buf[pos++] << 8;
t |= (uint32_t)buf[pos++] << 16;
t &= 0x7FFFFF;
if (t < Q) {
r[ctr++] = t;
}
}
return ctr;
}
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
unsigned int ctr, pos;
uint32_t good;
__m256i f0, f1;
__m128i g0, g1;
const __m256i mask = _mm256_set1_epi8(15);
const __m256i eta = _mm256_set1_epi8(4);
const __m256i bound = _mm256_set1_epi8(9);
ctr = pos = 0;
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
f1 = _mm256_slli_epi16(f0, 4);
f0 = _mm256_or_si256(f0, f1);
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_sub_epi8(f0, bound);
f0 = _mm256_sub_epi8(eta, f0);
good = _mm256_movemask_epi8(f1);
g0 = _mm256_castsi256_si128(f0);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;
if (ctr > N - 8) {
break;
}
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;
if (ctr > N - 8) {
break;
}
g0 = _mm256_extracti128_si256(f0, 1);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;
if (ctr > N - 8) {
break;
}
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good);
pos += 4;
}
uint32_t t0, t1;
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
t0 = buf[pos] & 0x0F;
t1 = buf[pos++] >> 4;
if (t0 < 9) {
r[ctr++] = 4 - t0;
}
if (t1 < 9 && ctr < N) {
r[ctr++] = 4 - t1;
}
}
return ctr;
}