pqc/crypto_sign/dilithium5/avx2/rejsample.c
John Schanck 4f86c39515
Round 3 update for Dilithium (from github source) (#369)
* Update Dilithium

* Alternative montgomery reduce to avoid i386 functest errors

* Explicit casts for msvc

* More casts; bump upstream version; fix metadata

* another cast
2021-02-01 13:32:40 +08:00

409 lines
14 KiB
C

#include "params.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>
const uint8_t PQCLEAN_DILITHIUM5_AVX2_idxlut[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};
unsigned int PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) {
unsigned int ctr, pos;
uint32_t good;
__m256i d, tmp;
const __m256i bound = _mm256_set1_epi32(Q);
const __m256i mask = _mm256_set1_epi32(0x7FFFFF);
const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10,
-1, 9, 8, 7, -1, 6, 5, 4,
-1, 11, 10, 9, -1, 8, 7, 6,
-1, 5, 4, 3, -1, 2, 1, 0);
ctr = pos = 0;
while (pos <= REJ_UNIFORM_BUFLEN - 24) {
d = _mm256_loadu_si256((__m256i *)&buf[pos]);
d = _mm256_permute4x64_epi64(d, 0x94);
d = _mm256_shuffle_epi8(d, idx8);
d = _mm256_and_si256(d, mask);
pos += 24;
tmp = _mm256_sub_epi32(d, bound);
good = _mm256_movemask_ps((__m256)tmp);
tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good]));
d = _mm256_permutevar8x32_epi32(d, tmp);
_mm256_storeu_si256((__m256i *)&r[ctr], d);
ctr += _mm_popcnt_u32(good);
if (ctr > N - 8) {
break;
}
}
uint32_t t;
while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) {
t = buf[pos++];
t |= (uint32_t)buf[pos++] << 8;
t |= (uint32_t)buf[pos++] << 16;
t &= 0x7FFFFF;
if (t < Q) {
r[ctr++] = t;
}
}
return ctr;
}
unsigned int PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) {
unsigned int ctr, pos;
uint32_t good;
__m256i f0, f1, f2;
__m128i g0, g1;
const __m256i mask = _mm256_set1_epi8(15);
const __m256i eta = _mm256_set1_epi8(ETA);
const __m256i bound = mask;
const __m256i v = _mm256_set1_epi32(-6560);
const __m256i p = _mm256_set1_epi32(5);
ctr = pos = 0;
while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos]));
f1 = _mm256_slli_epi16(f0, 4);
f0 = _mm256_or_si256(f0, f1);
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_sub_epi8(f0, bound);
f0 = _mm256_sub_epi8(eta, f0);
good = _mm256_movemask_epi8(f1);
g0 = _mm256_castsi256_si128(f0);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;
if (ctr > N - 8) {
break;
}
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;
if (ctr > N - 8) {
break;
}
g0 = _mm256_extracti128_si256(f0, 1);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good & 0xFF]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good & 0xFF);
good >>= 8;
pos += 4;
if (ctr > N - 8) {
break;
}
g0 = _mm_bsrli_si128(g0, 8);
g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good]);
g1 = _mm_shuffle_epi8(g0, g1);
f1 = _mm256_cvtepi8_epi32(g1);
f2 = _mm256_mulhrs_epi16(f1, v);
f2 = _mm256_mullo_epi16(f2, p);
f1 = _mm256_add_epi32(f1, f2);
_mm256_storeu_si256((__m256i *)&r[ctr], f1);
ctr += _mm_popcnt_u32(good);
pos += 4;
}
uint32_t t0, t1;
while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) {
t0 = buf[pos] & 0x0F;
t1 = buf[pos++] >> 4;
if (t0 < 15) {
t0 = t0 - (205 * t0 >> 10) * 5;
r[ctr++] = 2 - t0;
}
if (t1 < 15 && ctr < N) {
t1 = t1 - (205 * t1 >> 10) * 5;
r[ctr++] = 2 - t1;
}
}
return ctr;
}