1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-22 23:48:58 +00:00
pqcrypto/crypto_kem/sntrup857/avx2/crypto_encode_857x5167.c
2021-03-24 21:02:47 +00:00

332 lines
11 KiB
C

#include "crypto_encode_857x5167.h"
#include <immintrin.h>
/* auto-generated; do not edit */
#define int16 int16_t
#define uint16 uint16_t
#define uint32 uint32_t
void PQCLEAN_SNTRUP857_AVX2_crypto_encode_857x5167(unsigned char *out, const void *v) {
const int16 *R0 = v;
/* XXX: caller could overlap R with input */
uint16 R[429];
long i;
const uint16 *reading;
uint16 *writing;
uint16 r0, r1;
uint32 r2;
uint32 s0;
reading = (uint16 *) R0;
writing = R;
i = 27;
while (i > 0) {
__m256i x, x2, y, y2;
--i;
if (!i) {
reading -= 8;
writing -= 4;
out -= 8;
}
x = _mm256_loadu_si256((__m256i *) (reading + 0));
x2 = _mm256_loadu_si256((__m256i *) (reading + 16));
x = _mm256_add_epi16(x, _mm256_set1_epi16(2583));
x2 = _mm256_add_epi16(x2, _mm256_set1_epi16(2583));
x &= _mm256_set1_epi16(16383);
x2 &= _mm256_set1_epi16(16383);
y = x & _mm256_set1_epi32(65535);
y2 = x2 & _mm256_set1_epi32(65535);
x = _mm256_srli_epi32(x, 16);
x2 = _mm256_srli_epi32(x2, 16);
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(5167));
x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(5167));
x = _mm256_add_epi32(y, x);
x2 = _mm256_add_epi32(y2, x2);
x = _mm256_shuffle_epi8(x, _mm256_set_epi8(
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
));
x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8(
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
));
x = _mm256_permute4x64_epi64(x, 0xd8);
x2 = _mm256_permute4x64_epi64(x2, 0xd8);
_mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31));
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20));
reading += 32;
writing += 16;
out += 32;
}
R[428] = (uint16) (((R0[856] + 2583) & 16383));
reading = (uint16 *) R;
writing = R;
i = 27;
while (i > 0) {
__m256i x, y;
--i;
if (!i) {
reading -= 4;
writing -= 2;
out -= 2;
}
x = _mm256_loadu_si256((__m256i *) reading);
y = x & _mm256_set1_epi32(65535);
x = _mm256_srli_epi32(x, 16);
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(408));
x = _mm256_add_epi32(y, x);
x = _mm256_shuffle_epi8(x, _mm256_set_epi8(
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1,
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1
));
x = _mm256_permute4x64_epi64(x, 0xd8);
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0));
s0 = (uint32) _mm256_extract_epi32(x, 4);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 = (uint32) _mm256_extract_epi32(x, 6);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
reading += 16;
writing += 8;
}
R[214] = R[428];
reading = (uint16 *) R;
writing = R;
i = 14;
while (i > 0) {
__m256i x, y;
--i;
if (!i) {
reading -= 10;
writing -= 5;
out -= 5;
}
x = _mm256_loadu_si256((__m256i *) reading);
y = x & _mm256_set1_epi32(65535);
x = _mm256_srli_epi32(x, 16);
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(651));
x = _mm256_add_epi32(y, x);
x = _mm256_shuffle_epi8(x, _mm256_set_epi8(
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1,
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1
));
x = _mm256_permute4x64_epi64(x, 0xd8);
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0));
s0 = (uint32) _mm256_extract_epi32(x, 4);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 = (uint32) _mm256_extract_epi32(x, 6);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
reading += 16;
writing += 8;
}
R[107] = R[214];
reading = (uint16 *) R;
writing = R;
i = 7;
while (i > 0) {
__m256i x, y;
--i;
if (!i) {
reading -= 6;
writing -= 3;
out -= 3;
}
x = _mm256_loadu_si256((__m256i *) reading);
y = x & _mm256_set1_epi32(65535);
x = _mm256_srli_epi32(x, 16);
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1656));
x = _mm256_add_epi32(y, x);
x = _mm256_shuffle_epi8(x, _mm256_set_epi8(
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1,
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1
));
x = _mm256_permute4x64_epi64(x, 0xd8);
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0));
s0 = (uint32) _mm256_extract_epi32(x, 4);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 = (uint32) _mm256_extract_epi32(x, 6);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
reading += 16;
writing += 8;
}
r0 = R[106];
r1 = R[107];
r2 = r0 + r1 * (uint32)1656;
*out++ = (unsigned char) r2;
r2 >>= 8;
*out++ = (unsigned char) r2;
r2 >>= 8;
R[53] = (uint16) r2;
reading = (uint16 *) R;
writing = R;
i = 2;
while (i > 0) {
__m256i x, x2, y, y2;
--i;
if (!i) {
reading -= 12;
writing -= 6;
out -= 12;
}
x = _mm256_loadu_si256((__m256i *) (reading + 0));
x2 = _mm256_loadu_si256((__m256i *) (reading + 16));
y = x & _mm256_set1_epi32(65535);
y2 = x2 & _mm256_set1_epi32(65535);
x = _mm256_srli_epi32(x, 16);
x2 = _mm256_srli_epi32(x2, 16);
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(10713));
x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(10713));
x = _mm256_add_epi32(y, x);
x2 = _mm256_add_epi32(y2, x2);
x = _mm256_shuffle_epi8(x, _mm256_set_epi8(
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
));
x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8(
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
));
x = _mm256_permute4x64_epi64(x, 0xd8);
x2 = _mm256_permute4x64_epi64(x2, 0xd8);
_mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31));
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20));
reading += 32;
writing += 16;
out += 32;
}
r0 = R[52];
r1 = R[53];
r2 = r0 + r1 * (uint32)10713;
*out++ = (unsigned char) r2;
r2 >>= 8;
R[26] = (uint16) r2;
reading = (uint16 *) R;
writing = R;
i = 2;
while (i > 0) {
__m256i x, y;
--i;
if (!i) {
reading -= 6;
writing -= 3;
out -= 3;
}
x = _mm256_loadu_si256((__m256i *) reading);
y = x & _mm256_set1_epi32(65535);
x = _mm256_srli_epi32(x, 16);
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1752));
x = _mm256_add_epi32(y, x);
x = _mm256_shuffle_epi8(x, _mm256_set_epi8(
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1,
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1
));
x = _mm256_permute4x64_epi64(x, 0xd8);
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0));
s0 = (uint32) _mm256_extract_epi32(x, 4);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 = (uint32) _mm256_extract_epi32(x, 6);
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
s0 >>= 8;
*out++ = (unsigned char) s0;
reading += 16;
writing += 8;
}
R[13] = R[26];
for (i = 0; i < 7; ++i) {
r0 = R[2 * i];
r1 = R[2 * i + 1];
r2 = r0 + r1 * (uint32)11991;
*out++ = (unsigned char) r2;
r2 >>= 8;
*out++ = (unsigned char) r2;
r2 >>= 8;
R[i] = (uint16) r2;
}
for (i = 0; i < 3; ++i) {
r0 = R[2 * i];
r1 = R[2 * i + 1];
r2 = r0 + r1 * (uint32)2194;
*out++ = (unsigned char) r2;
r2 >>= 8;
*out++ = (unsigned char) r2;
r2 >>= 8;
R[i] = (uint16) r2;
}
R[3] = R[6];
r0 = R[0];
r1 = R[1];
r2 = r0 + r1 * (uint32)74;
R[0] = (uint16) r2;
r0 = R[2];
r1 = R[3];
r2 = r0 + r1 * (uint32)74;
*out++ = (unsigned char) r2;
r2 >>= 8;
R[1] = (uint16) r2;
r0 = R[0];
r1 = R[1];
r2 = r0 + r1 * (uint32)5476;
*out++ = (unsigned char) r2;
r2 >>= 8;
R[0] = (uint16) r2;
r0 = R[0];
*out++ = (unsigned char) r0;
r0 >>= 8;
*out++ = (unsigned char) r0;
}