mirror of
https://github.com/henrydcase/pqc.git
synced 2024-11-23 16:08:59 +00:00
f4bd312180
* Integrate Kyber-AVX2 into PQClean * Fix types and formatting in Kyber * Workaround a valgrind crash * Remove comment in shuffle.s * Remove some extraneous truncations * fixup! Fix types and formatting in Kyber
356 lines
13 KiB
C
356 lines
13 KiB
C
#include "cbd.h"
|
|
#include "ntt.h"
|
|
#include "params.h"
|
|
#include "poly.h"
|
|
#include "reduce.h"
|
|
#include "symmetric.h"
|
|
|
|
#include <immintrin.h>
|
|
#include <stdint.h>
|
|
|
|
/*************************************************
|
|
* Name: poly_compress
|
|
*
|
|
* Description: Compression and subsequent serialization of a polynomial
|
|
*
|
|
* Arguments: - uint8_t *r: pointer to output byte array
|
|
* - const poly *a: pointer to input polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t *r, poly *a) {
|
|
uint8_t t[8];
|
|
size_t i, j, k = 0;
|
|
|
|
PQCLEAN_KYBER768_AVX2_poly_csubq(a);
|
|
|
|
for (i = 0; i < KYBER_N; i += 8) {
|
|
for (j = 0; j < 8; j++) {
|
|
t[j] = (uint8_t)(((((uint16_t)a->coeffs[i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15);
|
|
}
|
|
|
|
r[k] = (uint8_t)(t[0] | (t[1] << 4));
|
|
r[k + 1] = (uint8_t)(t[2] | (t[3] << 4));
|
|
r[k + 2] = (uint8_t)(t[4] | (t[5] << 4));
|
|
r[k + 3] = (uint8_t)(t[6] | (t[7] << 4));
|
|
k += 4;
|
|
}
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_decompress
|
|
*
|
|
* Description: De-serialization and subsequent decompression of a polynomial;
|
|
* approximate inverse of poly_compress
|
|
*
|
|
* Arguments: - poly *r: pointer to output polynomial
|
|
* - const uint8_t *a: pointer to input byte array
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *r, const uint8_t *a) {
|
|
size_t i;
|
|
for (i = 0; i < KYBER_N; i += 8) {
|
|
r->coeffs[i + 0] = (int16_t)((((a[0] & 15) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 1] = (int16_t)((((a[0] >> 4) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 2] = (int16_t)((((a[1] & 15) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 3] = (int16_t)((((a[1] >> 4) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 4] = (int16_t)((((a[2] & 15) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 5] = (int16_t)((((a[2] >> 4) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 6] = (int16_t)((((a[3] & 15) * KYBER_Q) + 8) >> 4);
|
|
r->coeffs[i + 7] = (int16_t)((((a[3] >> 4) * KYBER_Q) + 8) >> 4);
|
|
a += 4;
|
|
}
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_tobytes
|
|
*
|
|
* Description: Serialization of a polynomial
|
|
*
|
|
* Arguments: - uint8_t *r: pointer to output byte array
|
|
* - const poly *a: pointer to input polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t *r, poly *a) {
|
|
PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r, a->coeffs);
|
|
PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128);
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_frombytes
|
|
*
|
|
* Description: De-serialization of a polynomial;
|
|
* inverse of poly_tobytes
|
|
*
|
|
* Arguments: - poly *r: pointer to output polynomial
|
|
* - const uint8_t *a: pointer to input byte array
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t *a) {
|
|
PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->coeffs, a);
|
|
PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192);
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_getnoise
|
|
*
|
|
* Description: Sample a polynomial deterministically from a seed and a nonce,
|
|
* with output polynomial close to centered binomial distribution
|
|
* with parameter KYBER_ETA
|
|
*
|
|
* Arguments: - poly *r: pointer to output polynomial
|
|
* - const uint8_t *seed: pointer to input seed
|
|
* - uint8_t nonce: one-byte input nonce
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) {
|
|
uint8_t buf[KYBER_ETA * KYBER_N / 4];
|
|
|
|
prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce);
|
|
PQCLEAN_KYBER768_AVX2_cbd(r, buf);
|
|
}
|
|
|
|
// FIXME
|
|
void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0,
|
|
poly *r1,
|
|
poly *r2,
|
|
poly *r3,
|
|
const uint8_t *seed,
|
|
uint8_t nonce0,
|
|
uint8_t nonce1,
|
|
uint8_t nonce2,
|
|
uint8_t nonce3) {
|
|
uint8_t buf[4][SHAKE256_RATE];
|
|
|
|
PQCLEAN_KYBER768_AVX2_shake256x4_prf(buf[0], buf[1], buf[2], buf[3], SHAKE256_RATE, seed, nonce0, nonce1, nonce2, nonce3);
|
|
|
|
PQCLEAN_KYBER768_AVX2_cbd(r0, buf[0]);
|
|
PQCLEAN_KYBER768_AVX2_cbd(r1, buf[1]);
|
|
PQCLEAN_KYBER768_AVX2_cbd(r2, buf[2]);
|
|
PQCLEAN_KYBER768_AVX2_cbd(r3, buf[3]);
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_ntt
|
|
*
|
|
* Description: Computes negacyclic number-theoretic transform (NTT) of
|
|
* a polynomial in place;
|
|
* inputs assumed to be in normal order, output in bitreversed order
|
|
*
|
|
* Arguments: - uint16_t *r: pointer to in/output polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_ntt(poly *r) {
|
|
PQCLEAN_KYBER768_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_exp);
|
|
PQCLEAN_KYBER768_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER768_AVX2_zetas_exp);
|
|
PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_exp + 4);
|
|
PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER768_AVX2_zetas_exp + 200);
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_invntt
|
|
*
|
|
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
|
|
* a polynomial in place;
|
|
* inputs assumed to be in bitreversed order, output in normal order
|
|
*
|
|
* Arguments: - uint16_t *a: pointer to in/output polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_invntt(poly *r) {
|
|
PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_inv_exp);
|
|
PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER768_AVX2_zetas_inv_exp + 196);
|
|
PQCLEAN_KYBER768_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_inv_exp + 392);
|
|
}
|
|
|
|
// FIXME
|
|
void PQCLEAN_KYBER768_AVX2_poly_nttunpack(poly *r) {
|
|
PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->coeffs);
|
|
PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->coeffs + 128);
|
|
}
|
|
|
|
//XXX Add comment
|
|
void PQCLEAN_KYBER768_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) {
|
|
PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs,
|
|
a->coeffs,
|
|
b->coeffs,
|
|
PQCLEAN_KYBER768_AVX2_zetas_exp + 152);
|
|
PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs + 64,
|
|
a->coeffs + 64,
|
|
b->coeffs + 64,
|
|
PQCLEAN_KYBER768_AVX2_zetas_exp + 184);
|
|
PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs + 128,
|
|
a->coeffs + 128,
|
|
b->coeffs + 128,
|
|
PQCLEAN_KYBER768_AVX2_zetas_exp + 348);
|
|
PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs + 192,
|
|
a->coeffs + 192,
|
|
b->coeffs + 192,
|
|
PQCLEAN_KYBER768_AVX2_zetas_exp + 380);
|
|
}
|
|
|
|
// FIXME
|
|
void PQCLEAN_KYBER768_AVX2_poly_frommont(poly *r) {
|
|
PQCLEAN_KYBER768_AVX2_frommont_avx(r->coeffs);
|
|
PQCLEAN_KYBER768_AVX2_frommont_avx(r->coeffs + 128);
|
|
}
|
|
|
|
// FIXME
|
|
void PQCLEAN_KYBER768_AVX2_poly_reduce(poly *r) {
|
|
PQCLEAN_KYBER768_AVX2_reduce_avx(r->coeffs);
|
|
PQCLEAN_KYBER768_AVX2_reduce_avx(r->coeffs + 128);
|
|
}
|
|
|
|
// FIXME
|
|
void PQCLEAN_KYBER768_AVX2_poly_csubq(poly *r) {
|
|
PQCLEAN_KYBER768_AVX2_csubq_avx(r->coeffs);
|
|
PQCLEAN_KYBER768_AVX2_csubq_avx(r->coeffs + 128);
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_add
|
|
*
|
|
* Description: Add two polynomials
|
|
*
|
|
* Arguments: - poly *r: pointer to output polynomial
|
|
* - const poly *a: pointer to first input polynomial
|
|
* - const poly *b: pointer to second input polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
|
|
__m256i vec0, vec1;
|
|
|
|
for (size_t i = 0; i < KYBER_N; i += 16) {
|
|
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
|
|
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
|
|
vec0 = _mm256_add_epi16(vec0, vec1);
|
|
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0);
|
|
}
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_sub
|
|
*
|
|
* Description: Subtract two polynomials
|
|
*
|
|
* Arguments: - poly *r: pointer to output polynomial
|
|
* - const poly *a: pointer to first input polynomial
|
|
* - const poly *b: pointer to second input polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_sub(poly *r, const poly *a, const poly *b) {
|
|
__m256i vec0, vec1;
|
|
|
|
for (size_t i = 0; i < KYBER_N; i += 16) {
|
|
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
|
|
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
|
|
vec0 = _mm256_sub_epi16(vec0, vec1);
|
|
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0);
|
|
}
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_frommsg
|
|
*
|
|
* Description: Convert 32-byte message to polynomial
|
|
*
|
|
* Arguments: - poly *r: pointer to output polynomial
|
|
* - const uint8_t *msg: pointer to input message
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) {
|
|
__m128i tmp;
|
|
__m256i a[4], d0, d1, d2, d3;
|
|
const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
|
const __m256i zeros = _mm256_setzero_si256();
|
|
const __m256i ones = _mm256_set1_epi32(1);
|
|
const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2);
|
|
|
|
tmp = _mm_loadu_si128((__m128i *)msg);
|
|
for (size_t i = 0; i < 4; i++) {
|
|
a[i] = _mm256_broadcastd_epi32(tmp);
|
|
tmp = _mm_srli_si128(tmp, 4);
|
|
}
|
|
|
|
for (size_t i = 0; i < 4; i++) {
|
|
d0 = _mm256_srlv_epi32(a[i], shift);
|
|
d1 = _mm256_srli_epi32(d0, 8);
|
|
d2 = _mm256_srli_epi32(d0, 16);
|
|
d3 = _mm256_srli_epi32(d0, 24);
|
|
|
|
d0 = _mm256_and_si256(d0, ones);
|
|
d1 = _mm256_and_si256(d1, ones);
|
|
d2 = _mm256_and_si256(d2, ones);
|
|
d3 = _mm256_and_si256(d3, ones);
|
|
|
|
d0 = _mm256_sub_epi32(zeros, d0);
|
|
d1 = _mm256_sub_epi32(zeros, d1);
|
|
d2 = _mm256_sub_epi32(zeros, d2);
|
|
d3 = _mm256_sub_epi32(zeros, d3);
|
|
|
|
d0 = _mm256_and_si256(hqs, d0);
|
|
d1 = _mm256_and_si256(hqs, d1);
|
|
d2 = _mm256_and_si256(hqs, d2);
|
|
d3 = _mm256_and_si256(hqs, d3);
|
|
|
|
d0 = _mm256_packus_epi32(d0, d1);
|
|
d2 = _mm256_packus_epi32(d2, d3);
|
|
d0 = _mm256_permute4x64_epi64(d0, 0xD8);
|
|
d2 = _mm256_permute4x64_epi64(d2, 0xD8);
|
|
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0);
|
|
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2);
|
|
}
|
|
|
|
tmp = _mm_loadu_si128((__m128i *)&msg[16]);
|
|
for (size_t i = 0; i < 4; i++) {
|
|
a[i] = _mm256_broadcastd_epi32(tmp);
|
|
tmp = _mm_srli_si128(tmp, 4);
|
|
}
|
|
|
|
for (size_t i = 0; i < 4; i++) {
|
|
d0 = _mm256_srlv_epi32(a[i], shift);
|
|
d1 = _mm256_srli_epi32(d0, 8);
|
|
d2 = _mm256_srli_epi32(d0, 16);
|
|
d3 = _mm256_srli_epi32(d0, 24);
|
|
|
|
d0 = _mm256_and_si256(d0, ones);
|
|
d1 = _mm256_and_si256(d1, ones);
|
|
d2 = _mm256_and_si256(d2, ones);
|
|
d3 = _mm256_and_si256(d3, ones);
|
|
|
|
d0 = _mm256_sub_epi32(zeros, d0);
|
|
d1 = _mm256_sub_epi32(zeros, d1);
|
|
d2 = _mm256_sub_epi32(zeros, d2);
|
|
d3 = _mm256_sub_epi32(zeros, d3);
|
|
|
|
d0 = _mm256_and_si256(hqs, d0);
|
|
d1 = _mm256_and_si256(hqs, d1);
|
|
d2 = _mm256_and_si256(hqs, d2);
|
|
d3 = _mm256_and_si256(hqs, d3);
|
|
|
|
d0 = _mm256_packus_epi32(d0, d1);
|
|
d2 = _mm256_packus_epi32(d2, d3);
|
|
d0 = _mm256_permute4x64_epi64(d0, 0xD8);
|
|
d2 = _mm256_permute4x64_epi64(d2, 0xD8);
|
|
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0);
|
|
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2);
|
|
}
|
|
}
|
|
|
|
/*************************************************
|
|
* Name: poly_tomsg
|
|
*
|
|
* Description: Convert polynomial to 32-byte message
|
|
*
|
|
* Arguments: - uint8_t *msg: pointer to output message
|
|
* - const poly *a: pointer to input polynomial
|
|
**************************************************/
|
|
void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) {
|
|
uint32_t small;
|
|
__m256i vec, tmp;
|
|
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
|
|
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);
|
|
|
|
for (size_t i = 0; i < KYBER_N / 16; i++) {
|
|
vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]);
|
|
vec = _mm256_sub_epi16(hqs, vec);
|
|
tmp = _mm256_srai_epi16(vec, 15);
|
|
vec = _mm256_xor_si256(vec, tmp);
|
|
vec = _mm256_sub_epi16(hhqs, vec);
|
|
small = (uint32_t)_mm256_movemask_epi8(vec);
|
|
small = _pext_u32(small, 0xAAAAAAAA);
|
|
small = ~small;
|
|
msg[2 * i + 0] = (uint8_t)small;
|
|
msg[2 * i + 1] = (uint8_t)(small >> 8);
|
|
}
|
|
}
|