/* pqc/crypto_kem/kyber768-90s/avx2/cbd.c */

#include "cbd.h"
#include "params.h"
#include <immintrin.h>
#include <stdint.h>
/*************************************************
2020-10-27 13:48:42 +00:00
* Name: cbd2
2019-09-17 13:02:01 +01:00
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
2020-10-27 13:48:42 +00:00
* a centered binomial distribution with parameter eta=2
2019-09-17 13:02:01 +01:00
*
2020-10-27 13:48:42 +00:00
* Arguments: - poly *r: pointer to output polynomial
* - const __m256i *buf: pointer to aligned input byte array
2019-09-17 13:02:01 +01:00
**************************************************/
2020-10-27 13:48:42 +00:00
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) {
unsigned int i;
2020-10-27 13:48:42 +00:00
__m256i f0, f1, f2, f3;
2019-09-17 13:02:01 +01:00
const __m256i mask55 = _mm256_set1_epi32(0x55555555);
const __m256i mask33 = _mm256_set1_epi32(0x33333333);
const __m256i mask03 = _mm256_set1_epi32(0x03030303);
2020-10-27 13:48:42 +00:00
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);
2019-09-17 13:02:01 +01:00
for (i = 0; i < KYBER_N / 64; i++) {
2020-10-27 13:48:42 +00:00
f0 = _mm256_load_si256(&buf[i]);
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
f1 = _mm256_srli_epi16(f0, 1);
f0 = _mm256_and_si256(mask55, f0);
f1 = _mm256_and_si256(mask55, f1);
f0 = _mm256_add_epi8(f0, f1);
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
f1 = _mm256_srli_epi16(f0, 2);
f0 = _mm256_and_si256(mask33, f0);
f1 = _mm256_and_si256(mask33, f1);
f0 = _mm256_add_epi8(f0, mask33);
f0 = _mm256_sub_epi8(f0, f1);
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
f1 = _mm256_srli_epi16(f0, 4);
f0 = _mm256_and_si256(mask0F, f0);
f1 = _mm256_and_si256(mask0F, f1);
f0 = _mm256_sub_epi8(f0, mask03);
f1 = _mm256_sub_epi8(f1, mask03);
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
f2 = _mm256_unpacklo_epi8(f0, f1);
f3 = _mm256_unpackhi_epi8(f0, f1);
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1));
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1));
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
_mm256_store_si256(&r->vec[4 * i + 0], f0);
_mm256_store_si256(&r->vec[4 * i + 1], f2);
_mm256_store_si256(&r->vec[4 * i + 2], f1);
_mm256_store_si256(&r->vec[4 * i + 3], f3);
}
}
2019-09-17 13:02:01 +01:00
2020-10-27 13:48:42 +00:00
/*
 * Sample a polynomial from the centered binomial distribution used for
 * eta1 by forwarding to cbd2.
 *
 * The declared buffer length carries one extra __m256i (32 bytes); per the
 * original note this slack exists for a cbd3 variant, which is not part of
 * this file. cbd2 itself only reads the first KYBER_N / 64 vectors.
 *
 * NOTE(review): calling cbd2 here presumes KYBER_ETA1 == 2 in params.h --
 * confirm against the parameter set.
 *
 * Arguments: - poly *r: pointer to output polynomial
 *            - const __m256i *buf: pointer to aligned input byte array
 */
void PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) {
cbd2(r, buf);
}
/*************************************************
* Name:        PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2
*
* Description: Sample a polynomial from the centered binomial
*              distribution used for eta2 by forwarding to cbd2.
*              NOTE(review): this presumes KYBER_ETA2 == 2 in
*              params.h -- confirm against the parameter set.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const __m256i *buf: pointer to aligned input byte array
**************************************************/
void PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) {
    cbd2(r, buf);
}