#include "params.h"
#include "cbd.h"

#include <immintrin.h>
#include <stdint.h>

/*************************************************
* Name:        PQCLEAN_KYBER768_AVX2_cbd
*
* Description: Given an array of uniformly random bytes, compute
*              polynomial with coefficients distributed according to
*              a centered binomial distribution with parameter KYBER_ETA
*
* Arguments:   - poly *r:                  pointer to output polynomial
*              - const unsigned char *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER768_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) {
    __m256i vec0, vec1, vec2, vec3, tmp;
    /* SWAR bit masks used to popcount 2-bit groups within each byte. */
    const __m256i mask55 = _mm256_set1_epi32(0x55555555);
    const __m256i mask33 = _mm256_set1_epi32(0x33333333);
    const __m256i mask03 = _mm256_set1_epi32(0x03030303);

    /* Each 32-byte input vector yields 64 coefficients: every byte
     * supplies 8 random bits = 4 bits per coefficient (eta = 2),
     * so the loop runs KYBER_N / 64 times and consumes 32 bytes per
     * iteration. */
    for (unsigned int i = 0; i < KYBER_N / 64; i++) {
        /* NOTE(review): _mm256_load_si256 requires buf to be 32-byte
         * aligned — the caller must guarantee this. */
        vec0 = _mm256_load_si256((const __m256i *)&buf[32 * i]);

        /* Step 1: in-place popcount of adjacent bit pairs.
         * After this, every 2-bit field holds the number of set bits
         * (0..2) of the corresponding bit pair of the input. */
        vec1 = _mm256_srli_epi32(vec0, 1);
        vec0 = _mm256_and_si256(mask55, vec0);
        vec1 = _mm256_and_si256(mask55, vec1);
        vec0 = _mm256_add_epi32(vec0, vec1);

        /* Step 2: split each nibble's two 2-bit counts.
         * vec0 keeps the counts from the low bit pairs (a),
         * vec1 the counts from the high bit pairs (b). */
        vec1 = _mm256_srli_epi32(vec0, 2);
        vec0 = _mm256_and_si256(mask33, vec0);
        vec1 = _mm256_and_si256(mask33, vec1);

        /* Step 3: separate low-nibble and high-nibble counts so each
         * byte lane holds exactly one 0..2 count. */
        vec2 = _mm256_srli_epi32(vec0, 4);
        vec3 = _mm256_srli_epi32(vec1, 4);
        vec0 = _mm256_and_si256(mask03, vec0);
        vec1 = _mm256_and_si256(mask03, vec1);
        vec2 = _mm256_and_si256(mask03, vec2);
        vec3 = _mm256_and_si256(mask03, vec3);

        /* Step 4: coefficient = a - b, giving values in [-2, 2]. */
        vec1 = _mm256_sub_epi8(vec0, vec1);
        vec3 = _mm256_sub_epi8(vec2, vec3);

        /* Step 5: sign-extend the int8 coefficients to int16. */
        vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1));
        vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1));
        vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3));
        vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1));

        /* Step 6: interleave 16-bit words so coefficients derived from
         * the low and high nibbles of the same byte become adjacent. */
        tmp  = _mm256_unpacklo_epi16(vec0, vec2);
        vec2 = _mm256_unpackhi_epi16(vec0, vec2);
        vec0 = tmp;
        tmp  = _mm256_unpacklo_epi16(vec1, vec3);
        vec3 = _mm256_unpackhi_epi16(vec1, vec3);
        vec1 = tmp;

        /* Step 7: fix up the 128-bit-lane order left by unpack
         * (unpacklo/hi operate per 128-bit lane, not across lanes). */
        tmp  = _mm256_permute2x128_si256(vec0, vec2, 0x20);
        vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31);
        vec0 = tmp;
        tmp  = _mm256_permute2x128_si256(vec1, vec3, 0x20);
        vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31);
        vec1 = tmp;

        /* Step 8: store the 64 int16 coefficients in natural order
         * (r->coeffs must be 32-byte aligned for the aligned stores). */
        _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0);
        _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2);
        _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1);
        _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3);
    }
}
|