1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-24 00:11:27 +00:00
pqcrypto/crypto_kem/kyber1024/avx2/poly.c

357 lines
14 KiB
C
Raw Normal View History

#include "cbd.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stdint.h>
/*************************************************
* Name:        poly_compress
*
* Description: Compression and subsequent serialization of a polynomial.
*              Every coefficient is rounded to a 5-bit value and eight such
*              values are packed into 5 consecutive output bytes.
*
*              The rounding division by KYBER_Q is performed with a
*              multiply-and-shift rather than the `/` operator, so that no
*              variable-time division instruction is executed on
*              coefficients that may depend on secret data.
*
* Arguments:   - uint8_t *r: pointer to output byte array
*                            (5 bytes are written per 8 coefficients)
*              - poly *a:    pointer to input polynomial; it is passed
*                            through poly_csubq first and is therefore
*                            modified in place
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a) {
    uint8_t t[8];
    size_t i, j, k = 0;

    PQCLEAN_KYBER1024_AVX2_poly_csubq(a);

    for (i = 0; i < KYBER_N; i += 8) {
        for (j = 0; j < 8; j++) {
            /*
             * Computes ((((uint32_t)c << 5) + KYBER_Q / 2) / KYBER_Q) & 31
             * without a division. 40318 = ceil(2^27 / 3329), so the constant
             * is tied to KYBER_Q == 3329. The quotient is exact for
             * coefficients c in [0, KYBER_Q), which is the range assumed
             * after poly_csubq (verify against the csubq routine).
             * The 32-bit product may wrap; only bits 27..31 are used and
             * those equal the true quotient modulo 32.
             */
            uint32_t d = (uint32_t)a->coeffs[i + j] << 5;
            d += KYBER_Q / 2;
            d *= 40318u;
            d >>= 27;
            t[j] = (uint8_t)(d & 31);
        }

        /* Pack eight 5-bit values, least significant bits first. */
        r[k]     = (uint8_t)( t[0]       | (t[1] << 5));
        r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
        r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4));
        r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
        r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3));
        k += 5;
    }
}
/*************************************************
* Name:        poly_decompress
*
* Description: De-serialization and subsequent decompression of a polynomial;
*              approximate inverse of poly_compress.
*              Each group of 5 input bytes holds eight 5-bit values; every
*              value t is expanded to (t * KYBER_Q + 16) >> 5.
*
* Arguments:   - poly *r:          pointer to output polynomial
*              - const uint8_t *a: pointer to input byte array
*                                  (5 bytes are read per 8 coefficients)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a) {
    for (size_t base = 0; base < KYBER_N; base += 8, a += 5) {
        int t[8];

        /* Unpack eight 5-bit fields, least significant bits first. */
        t[0] = a[0] & 31;
        t[1] = (a[0] >> 5) | ((a[1] & 3) << 3);
        t[2] = (a[1] >> 2) & 31;
        t[3] = (a[1] >> 7) | ((a[2] & 15) << 1);
        t[4] = (a[2] >> 4) | ((a[3] & 1) << 4);
        t[5] = (a[3] >> 1) & 31;
        t[6] = (a[3] >> 6) | ((a[4] & 7) << 2);
        t[7] = a[4] >> 3;

        /* Scale back with rounding: adding 16 rounds the division by 32. */
        for (size_t j = 0; j < 8; j++) {
            r->coeffs[base + j] = (int16_t)(((t[j] * KYBER_Q) + 16) >> 5);
        }
    }
}
/*************************************************
* Name:        poly_tobytes
*
* Description: Serialization of a polynomial.
*              The work is delegated to ntttobytes_avx, called once per
*              half: 128 coefficients in, starting 192 bytes apart in the
*              output.
*
* Arguments:   - uint8_t *r: pointer to output byte array
*              - poly *a:    pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a) {
    for (size_t half = 0; half < 2; half++) {
        PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(&r[192 * half], &a->coeffs[128 * half]);
    }
}
/*************************************************
* Name:        poly_frombytes
*
* Description: De-serialization of a polynomial;
*              inverse of poly_tobytes.
*              The work is delegated to nttfrombytes_avx, called once per
*              half: input advances by 192 bytes, output by 128 coefficients.
*
* Arguments:   - poly *r:          pointer to output polynomial
*              - const uint8_t *a: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a) {
    for (size_t half = 0; half < 2; half++) {
        PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(&r->coeffs[128 * half], &a[192 * half]);
    }
}
/*************************************************
* Name:        poly_getnoise
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
*              with output polynomial close to centered binomial distribution
*              with parameter KYBER_ETA.
*              KYBER_ETA * KYBER_N / 4 bytes of PRF output are generated and
*              handed to the cbd routine.
*
* Arguments:   - poly *r:             pointer to output polynomial
*              - const uint8_t *seed: pointer to input seed
*              - uint8_t nonce:       one-byte input nonce
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) {
    uint8_t prf_out[KYBER_ETA * KYBER_N / 4];

    /* Fill the whole buffer with PRF(seed, nonce) output. */
    prf(prf_out, sizeof(prf_out), seed, nonce);

    PQCLEAN_KYBER1024_AVX2_cbd(r, prf_out);
}
/*************************************************
* Name:        poly_getnoise4x
*
* Description: Four-way variant of poly_getnoise: samples four polynomials
*              from one seed and four nonces. One call to shake256x4_prf
*              produces SHAKE256_RATE bytes per nonce; each buffer is then
*              passed to the cbd routine.
*
* Arguments:   - poly *r0, *r1, *r2, *r3: pointers to the output polynomials
*              - const uint8_t *seed:     pointer to input seed (shared)
*              - uint8_t nonce0..nonce3:  one-byte nonces; nonceK is used
*                                         for output rK
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
        poly *r1,
        poly *r2,
        poly *r3,
        const uint8_t *seed,
        uint8_t nonce0,
        uint8_t nonce1,
        uint8_t nonce2,
        uint8_t nonce3) {
    uint8_t stream[4][SHAKE256_RATE];
    poly *const out[4] = {r0, r1, r2, r3};

    PQCLEAN_KYBER1024_AVX2_shake256x4_prf(stream[0], stream[1], stream[2], stream[3],
                                          SHAKE256_RATE, seed,
                                          nonce0, nonce1, nonce2, nonce3);

    for (size_t lane = 0; lane < 4; lane++) {
        PQCLEAN_KYBER1024_AVX2_cbd(out[lane], stream[lane]);
    }
}
/*************************************************
* Name:        poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
*              a polynomial in place;
*              inputs assumed to be in normal order, output in bitreversed order.
*              Level 0 is run first (two calls, at coefficient offsets 0 and
*              64), followed by levels 1 to 6 on each 128-coefficient half.
*
* Arguments:   - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) {
    /* Level 0: both calls use the start of the zeta table. */
    for (size_t off = 0; off <= 64; off += 64) {
        PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(&r->coeffs[off], PQCLEAN_KYBER1024_AVX2_zetas_exp);
    }

    /* Levels 1..6: one call per half, each with its own zeta table offset. */
    PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(&r->coeffs[0], &PQCLEAN_KYBER1024_AVX2_zetas_exp[4]);
    PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(&r->coeffs[128], &PQCLEAN_KYBER1024_AVX2_zetas_exp[200]);
}
/*************************************************
* Name:        poly_invntt
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
*              a polynomial in place;
*              inputs assumed to be in bitreversed order, output in normal order.
*              Levels 0 to 5 are run on each 128-coefficient half, followed
*              by a single level 6 call on the whole polynomial.
*
* Arguments:   - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r) {
    /* Levels 0..5: one call per half, each with its own zeta table offset. */
    PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(&r->coeffs[0], &PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[0]);
    PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(&r->coeffs[128], &PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[196]);

    /* Level 6 combines the two halves. */
    PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(&r->coeffs[0], &PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[392]);
}
/*************************************************
* Name:        poly_nttunpack
*
* Description: Applies nttunpack_avx in place to each 128-coefficient half
*              of the polynomial.
*              NOTE(review): presumably converts the coefficient order used
*              by the vectorized NTT code; confirm against the
*              nttunpack_avx routine.
*
* Arguments:   - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) {
    for (size_t half = 0; half < 2; half++) {
        PQCLEAN_KYBER1024_AVX2_nttunpack_avx(&r->coeffs[128 * half]);
    }
}
/*************************************************
* Name:        poly_basemul
*
* Description: Multiplies two polynomials by calling basemul_avx on each of
*              the four 64-coefficient quarters, each with its own offset
*              into the zeta table.
*              NOTE(review): presumably both inputs are expected in the NTT
*              domain; confirm against the basemul_avx routine.
*
* Arguments:   - poly *r:       pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) {
    /* Zeta table offset used for quarter 0, 1, 2 and 3 respectively. */
    static const size_t zeta_offset[4] = {152, 184, 348, 380};

    for (size_t quarter = 0; quarter < 4; quarter++) {
        const size_t c = 64 * quarter;

        PQCLEAN_KYBER1024_AVX2_basemul_avx(&r->coeffs[c],
                                           &a->coeffs[c],
                                           &b->coeffs[c],
                                           &PQCLEAN_KYBER1024_AVX2_zetas_exp[zeta_offset[quarter]]);
    }
}
/*************************************************
* Name:        poly_frommont
*
* Description: Applies frommont_avx in place to each 128-coefficient half
*              of the polynomial.
*              NOTE(review): presumably converts coefficients out of the
*              Montgomery domain; confirm against the frommont_avx routine.
*
* Arguments:   - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r) {
    for (size_t half = 0; half < 2; half++) {
        PQCLEAN_KYBER1024_AVX2_frommont_avx(&r->coeffs[128 * half]);
    }
}
/*************************************************
* Name:        poly_reduce
*
* Description: Applies reduce_avx in place to each 128-coefficient half
*              of the polynomial.
*              NOTE(review): presumably reduces every coefficient modulo
*              KYBER_Q; the exact output range is defined by the reduce_avx
*              routine and should be confirmed there.
*
* Arguments:   - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) {
    for (size_t half = 0; half < 2; half++) {
        PQCLEAN_KYBER1024_AVX2_reduce_avx(&r->coeffs[128 * half]);
    }
}
/*************************************************
* Name:        poly_csubq
*
* Description: Applies csubq_avx in place to each 128-coefficient half
*              of the polynomial.
*              NOTE(review): presumably a conditional subtraction of KYBER_Q
*              from every coefficient; confirm against the csubq_avx routine.
*
* Arguments:   - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) {
    for (size_t half = 0; half < 2; half++) {
        PQCLEAN_KYBER1024_AVX2_csubq_avx(&r->coeffs[128 * half]);
    }
}
/*************************************************
* Name:        poly_add
*
* Description: Add two polynomials coefficient-wise, 16 coefficients per
*              step. The 16-bit sums wrap around; no modular reduction is
*              performed here. Aligned vector loads and stores are used, so
*              all three coefficient arrays must be 32-byte aligned.
*
* Arguments:   - poly *r:       pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
    for (size_t off = 0; off < KYBER_N; off += 16) {
        const __m256i lhs = _mm256_load_si256((const __m256i *)(a->coeffs + off));
        const __m256i rhs = _mm256_load_si256((const __m256i *)(b->coeffs + off));

        _mm256_store_si256((__m256i *)(r->coeffs + off), _mm256_add_epi16(lhs, rhs));
    }
}
/*************************************************
* Name:        poly_sub
*
* Description: Subtract two polynomials coefficient-wise (a - b), 16
*              coefficients per step. The 16-bit differences wrap around;
*              no modular reduction is performed here. Aligned vector loads
*              and stores are used, so all three coefficient arrays must be
*              32-byte aligned.
*
* Arguments:   - poly *r:       pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) {
    for (size_t off = 0; off < KYBER_N; off += 16) {
        const __m256i minuend = _mm256_load_si256((const __m256i *)(a->coeffs + off));
        const __m256i subtrahend = _mm256_load_si256((const __m256i *)(b->coeffs + off));

        _mm256_store_si256((__m256i *)(r->coeffs + off), _mm256_sub_epi16(minuend, subtrahend));
    }
}
/*************************************************
* Name:        poly_frommsg
*
* Description: Convert 32-byte message to polynomial.
*              Message bit k becomes coefficient k: a 0 bit gives 0 and a
*              1 bit gives (KYBER_Q + 1) / 2. The selection is done with
*              masks, without branching on message bits.
*              The message is consumed as two 16-byte halves; each half
*              fills 128 coefficients. Aligned vector stores are used, so
*              r->coeffs must be 32-byte aligned.
*
* Arguments:   - poly *r:            pointer to output polynomial
*              - const uint8_t *msg: pointer to input message
*                                    (KYBER_SYMBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) {
    const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    const __m256i zeros = _mm256_setzero_si256();
    const __m256i ones = _mm256_set1_epi32(1);
    const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2);

    for (size_t half = 0; half < 2; half++) {
        /* 16 message bytes = four 32-bit words, consumed one per iteration. */
        __m128i tmp = _mm_loadu_si128((const __m128i *)&msg[16 * half]);

        for (size_t i = 0; i < 4; i++) {
            __m256i d0, d1, d2, d3;

            /* Replicate the current 32-bit word into all eight lanes. */
            d0 = _mm256_broadcastd_epi32(tmp);
            tmp = _mm_srli_si128(tmp, 4);

            /* Lane j of dB ends up holding bit j of byte B of the word. */
            d0 = _mm256_srlv_epi32(d0, shift);
            d1 = _mm256_srli_epi32(d0, 8);
            d2 = _mm256_srli_epi32(d0, 16);
            d3 = _mm256_srli_epi32(d0, 24);
            d0 = _mm256_and_si256(d0, ones);
            d1 = _mm256_and_si256(d1, ones);
            d2 = _mm256_and_si256(d2, ones);
            d3 = _mm256_and_si256(d3, ones);

            /* 0 - bit turns each bit into an all-zero / all-one mask. */
            d0 = _mm256_sub_epi32(zeros, d0);
            d1 = _mm256_sub_epi32(zeros, d1);
            d2 = _mm256_sub_epi32(zeros, d2);
            d3 = _mm256_sub_epi32(zeros, d3);

            /* Select 0 or (KYBER_Q + 1) / 2 per lane. */
            d0 = _mm256_and_si256(hqs, d0);
            d1 = _mm256_and_si256(hqs, d1);
            d2 = _mm256_and_si256(hqs, d2);
            d3 = _mm256_and_si256(hqs, d3);

            /*
             * Narrow to 16 bits. The pack instruction works per 128-bit
             * lane, so the 64-bit quarters are permuted afterwards (0xD8)
             * to restore bit order within each byte pair.
             */
            d0 = _mm256_packus_epi32(d0, d1);
            d2 = _mm256_packus_epi32(d2, d3);
            d0 = _mm256_permute4x64_epi64(d0, 0xD8);
            d2 = _mm256_permute4x64_epi64(d2, 0xD8);

            _mm256_store_si256((__m256i *)&r->coeffs[128 * half + 32 * i + 0], d0);
            _mm256_store_si256((__m256i *)&r->coeffs[128 * half + 32 * i + 16], d2);
        }
    }
}
/*************************************************
* Name:        poly_tomsg
*
* Description: Convert polynomial to 32-byte message.
*              16 coefficients are processed per iteration and yield two
*              message bytes. With h = (KYBER_Q - 1) / 2 and
*              hh = (KYBER_Q - 5) / 4, a message bit is set when the
*              (one's-complement) magnitude of h - coefficient does not
*              exceed hh. Aligned vector loads are used, so a->coeffs must
*              be 32-byte aligned.
*
* Arguments:   - uint8_t *msg: pointer to output message
*                              (KYBER_SYMBYTES bytes)
*              - poly *a:      pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) {
    const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
    const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);

    for (size_t blk = 0; blk < KYBER_N / 16; blk++) {
        __m256i v, sign;
        uint32_t bits;

        v = _mm256_load_si256((const __m256i *)(a->coeffs + 16 * blk));

        /* v = h - c, then flip negative lanes (x -> ~x) via the sign mask. */
        v = _mm256_sub_epi16(hqs, v);
        sign = _mm256_srai_epi16(v, 15);
        v = _mm256_xor_si256(v, sign);

        /* Lane becomes negative exactly when the magnitude exceeds hh. */
        v = _mm256_sub_epi16(hhqs, v);

        /*
         * movemask yields one bit per byte; the odd-numbered bits
         * (mask 0xAAAAAAAA) are the sign bits of the 16-bit lanes.
         * Complementing turns "negative" into message bit 0.
         */
        bits = (uint32_t)_mm256_movemask_epi8(v);
        bits = ~_pext_u32(bits, 0xAAAAAAAA);

        msg[2 * blk + 0] = (uint8_t)bits;
        msg[2 * blk + 1] = (uint8_t)(bits >> 8);
    }
}