1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-22 23:48:58 +00:00

Replace avx2 'reduce' in gf2x

This commit is contained in:
John M. Schanck 2020-09-07 17:00:59 -04:00 committed by Kris Kwiatkowski
parent b0afb62c0e
commit cb7675a9b6
7 changed files with 177 additions and 310 deletions

View File

@ -14,25 +14,21 @@
#define T_TM3_3W_256 32 #define T_TM3_3W_256 32
#define T_TM3_3W_64 128 #define T_TM3_3W_64 128
#define VEC_N_ARRAY_SIZE_VEC CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/ #define VEC_N_SIZE_256 CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/
#define WORD 64 __m256i a1_times_a2[2 * VEC_N_SIZE_256 + 1];
#define LAST64 (PARAM_N >> 6)
uint64_t a1_times_a2[2 * VEC_N_256_SIZE_64 + 1];
uint64_t tmp_reduce[VEC_N_ARRAY_SIZE_VEC << 2];
__m256i *o256 = (__m256i *) tmp_reduce;
uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
static inline void reduce(uint64_t *o, const uint64_t *a); static inline void reduce(uint64_t *o, const uint64_t *a);
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B); static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B);
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B);
static inline void divByXplus1(__m256i *out, __m256i *in, int size); static inline void divByXplus1(__m256i *out, __m256i *in, int size);
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B);
@ -45,33 +41,16 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B);
* @param[in] a Pointer to the polynomial a(x) * @param[in] a Pointer to the polynomial a(x)
*/ */
static inline void reduce(uint64_t *o, const uint64_t *a) { static inline void reduce(uint64_t *o, const uint64_t *a) {
__m256i r256, carry256; uint64_t r;
__m256i *a256 = (__m256i *) a; uint64_t carry;
static const int32_t dec64 = PARAM_N & 0x3f;
static const int32_t d0 = WORD - dec64;
int32_t i, i2;
for (i = LAST64 ; i < (PARAM_N >> 5) - 4 ; i += 4) { for (uint32_t i = 0 ; i < VEC_N_SIZE_64 ; i++) {
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); r = a[i + VEC_N_SIZE_64 - 1] >> (PARAM_N & 63);
r256 = _mm256_srli_epi64(r256, dec64); carry = (uint64_t) (a[i + VEC_N_SIZE_64] << (64 - (PARAM_N & 63)));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1])); o[i] = a[i] ^ r ^ carry;
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
} }
r256 = (__m256i) { o[VEC_N_SIZE_64 - 1] &= RED_MASK;
a[i], a[i + 1], 0x0UL, 0x0UL
};
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1]));
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = (a256[i2] ^ r256);
tmp_reduce[LAST64] &= RED_MASK;
memcpy(o, tmp_reduce, VEC_N_SIZE_BYTES);
} }
/** /**
@ -83,7 +62,7 @@ static inline void reduce(uint64_t *o, const uint64_t *a) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) { static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
__m128i D1[2]; __m128i D1[2];
__m128i D0[2], D2[2]; __m128i D0[2], D2[2];
__m128i Al = _mm_loadu_si128(A); __m128i Al = _mm_loadu_si128(A);
@ -140,7 +119,7 @@ inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[2], D1[2], D2[2], SAA, SBB; __m256i D0[2], D1[2], D2[2], SAA, SBB;
__m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B; __m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B;
@ -170,7 +149,7 @@ inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[4], D1[4], D2[4], SAA[2], SBB[2]; __m256i D0[4], D1[4], D2[4], SAA[2], SBB[2];
karat_mult_2( D0, A, B); karat_mult_2( D0, A, B);
@ -207,7 +186,7 @@ inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
karat_mult_4( D0, A, B); karat_mult_4( D0, A, B);
@ -246,7 +225,7 @@ inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
karat_mult_8( D0, A, B); karat_mult_8( D0, A, B);
@ -285,7 +264,7 @@ inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
karat_mult_16( D0, A, B); karat_mult_16( D0, A, B);
@ -344,7 +323,7 @@ static inline void divByXplus1(__m256i *out, __m256i *in, int size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256]; static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256];
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
@ -533,7 +512,7 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -551,8 +530,8 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
*/ */
void PQCLEAN_HQC128_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) { void PQCLEAN_HQC128_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) {
TOOM3Mult(a1_times_a2, a1, a2); TOOM3Mult(a1_times_a2, a1, a2);
reduce(o, a1_times_a2); reduce(o, (uint64_t *)a1_times_a2);
// clear all // clear all
memset(a1_times_a2, 0, (VEC_N_SIZE_64 << 1) * sizeof(uint64_t)); memset(a1_times_a2, 0, (2 * VEC_N_SIZE_256 + 1) * sizeof(__m256i));
} }

View File

@ -14,26 +14,22 @@
#define T_TM3_3W_256 64 #define T_TM3_3W_256 64
#define T_TM3_3W_64 256 #define T_TM3_3W_64 256
#define VEC_N_ARRAY_SIZE_VEC CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/ #define VEC_N_SIZE_256 CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/
#define WORD 64 __m256i a1_times_a2[2 * VEC_N_SIZE_256 + 1];
#define LAST64 (PARAM_N >> 6)
uint64_t a1_times_a2[2 * VEC_N_256_SIZE_64 + 1];
uint64_t tmp_reduce[VEC_N_ARRAY_SIZE_VEC << 2];
__m256i *o256 = (__m256i *) tmp_reduce;
uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
static inline void reduce(uint64_t *o, const uint64_t *a); static inline void reduce(uint64_t *o, const uint64_t *a);
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B); static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B);
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_64(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B);
static inline void divByXplus1(__m256i *out, __m256i *in, int size); static inline void divByXplus1(__m256i *out, __m256i *in, int size);
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B);
@ -46,35 +42,18 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B);
* @param[in] a Pointer to the polynomial a(x) * @param[in] a Pointer to the polynomial a(x)
*/ */
static inline void reduce(uint64_t *o, const uint64_t *a) { static inline void reduce(uint64_t *o, const uint64_t *a) {
__m256i r256, carry256; uint64_t r;
__m256i *a256 = (__m256i *) a; uint64_t carry;
static const int32_t dec64 = PARAM_N & 0x3f;
static const int32_t d0 = WORD - dec64;
int32_t i, i2;
for (i = LAST64 ; i < (PARAM_N >> 5) - 4 ; i += 4) { for (uint32_t i = 0 ; i < VEC_N_SIZE_64 ; i++) {
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); r = a[i + VEC_N_SIZE_64 - 1] >> (PARAM_N & 63);
r256 = _mm256_srli_epi64(r256, dec64); carry = (uint64_t) (a[i + VEC_N_SIZE_64] << (64 - (PARAM_N & 63)));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1])); o[i] = a[i] ^ r ^ carry;
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
} }
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); o[VEC_N_SIZE_64 - 1] &= RED_MASK;
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1]));
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = (a256[i2] ^ r256);
tmp_reduce[LAST64] &= RED_MASK;
memcpy(o, tmp_reduce, VEC_N_SIZE_BYTES);
} }
/** /**
* @brief Compute C(x) = A(x)*B(x) * @brief Compute C(x) = A(x)*B(x)
* A(x) and B(x) are stored in 128-bit registers * A(x) and B(x) are stored in 128-bit registers
@ -84,7 +63,7 @@ static inline void reduce(uint64_t *o, const uint64_t *a) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) { static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
__m128i D1[2]; __m128i D1[2];
__m128i D0[2], D2[2]; __m128i D0[2], D2[2];
__m128i Al = _mm_loadu_si128(A); __m128i Al = _mm_loadu_si128(A);
@ -141,7 +120,7 @@ inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[2], D1[2], D2[2], SAA, SBB; __m256i D0[2], D1[2], D2[2], SAA, SBB;
__m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B; __m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B;
@ -171,7 +150,7 @@ inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[4], D1[4], D2[4], SAA[2], SBB[2]; __m256i D0[4], D1[4], D2[4], SAA[2], SBB[2];
karat_mult_2( D0, A, B); karat_mult_2( D0, A, B);
@ -208,7 +187,7 @@ inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
karat_mult_4( D0, A, B); karat_mult_4( D0, A, B);
@ -247,7 +226,7 @@ inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
karat_mult_8( D0, A, B); karat_mult_8( D0, A, B);
@ -286,7 +265,7 @@ inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
karat_mult_16( D0, A, B); karat_mult_16( D0, A, B);
@ -325,7 +304,7 @@ inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[64], D1[64], D2[64], SAA[32], SBB[32]; __m256i D0[64], D1[64], D2[64], SAA[32], SBB[32];
karat_mult_32( D0, A, B); karat_mult_32( D0, A, B);
@ -384,7 +363,7 @@ static inline void divByXplus1(__m256i *out, __m256i *in, int size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256]; static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256];
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
@ -572,7 +551,7 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -591,8 +570,8 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
*/ */
void PQCLEAN_HQC192_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) { void PQCLEAN_HQC192_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) {
TOOM3Mult(a1_times_a2, a1, a2); TOOM3Mult(a1_times_a2, a1, a2);
reduce(o, a1_times_a2); reduce(o, (uint64_t *)a1_times_a2);
// clear all // clear all
memset(a1_times_a2, 0, (VEC_N_SIZE_64 << 1) * sizeof(uint64_t)); memset(a1_times_a2, 0, (2 * VEC_N_SIZE_256 + 1) * sizeof(__m256i));
} }

View File

@ -18,27 +18,23 @@
#define T_TM3R_3W_256 93 #define T_TM3R_3W_256 93
#define T_TM3R_3W_64 (T_TM3R_3W_256<<2) #define T_TM3R_3W_64 (T_TM3R_3W_256<<2)
#define VEC_N_ARRAY_SIZE_VEC CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/ #define VEC_N_SIZE_256 CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/
#define WORD 64 __m256i a1_times_a2[2 * VEC_N_SIZE_256 + 1];
#define LAST64 (PARAM_N >> 6)
uint64_t a1_times_a2[2 * VEC_N_256_SIZE_64 + 1];
uint64_t tmp_reduce[VEC_N_ARRAY_SIZE_VEC << 2];
__m256i *o256 = (__m256i *) tmp_reduce;
uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
static inline void reduce(uint64_t *o, const uint64_t *a); static inline void reduce(uint64_t *o, const uint64_t *a);
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B); static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B);
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B);
static inline void divByXplus1(__m256i *out, __m256i *in, int size); static inline void divByXplus1(__m256i *out, __m256i *in, int size);
static inline void divByXplus1_256(__m256i *out, __m256i *in, int size); static inline void divByXplus1_256(__m256i *out, __m256i *in, int size);
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B);
static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B);
@ -51,35 +47,18 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B);
* @param[in] a Pointer to the polynomial a(x) * @param[in] a Pointer to the polynomial a(x)
*/ */
static inline void reduce(uint64_t *o, const uint64_t *a) { static inline void reduce(uint64_t *o, const uint64_t *a) {
__m256i r256, carry256; uint64_t r;
__m256i *a256 = (__m256i *) a; uint64_t carry;
static const int32_t dec64 = PARAM_N & 0x3f;
static const int32_t d0 = WORD - dec64;
int32_t i, i2;
for (i = LAST64 ; i < (PARAM_N >> 5) - 4 ; i += 4) { for (uint32_t i = 0 ; i < VEC_N_SIZE_64 ; i++) {
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); r = a[i + VEC_N_SIZE_64 - 1] >> (PARAM_N & 63);
r256 = _mm256_srli_epi64(r256, dec64); carry = (uint64_t) (a[i + VEC_N_SIZE_64] << (64 - (PARAM_N & 63)));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1])); o[i] = a[i] ^ r ^ carry;
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
} }
r256 = _mm256_lddqu_si256((__m256i const *)(& a[i])); o[VEC_N_SIZE_64 - 1] &= RED_MASK;
carry256 = _mm256_lddqu_si256((__m256i const *)(& a[i + 1]));
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
tmp_reduce[LAST64] &= RED_MASK;
memcpy(o, tmp_reduce, VEC_N_SIZE_BYTES);
} }
/** /**
* @brief Compute C(x) = A(x)*B(x) * @brief Compute C(x) = A(x)*B(x)
* A(x) and B(x) are stored in 128-bit registers * A(x) and B(x) are stored in 128-bit registers
@ -89,7 +68,7 @@ static inline void reduce(uint64_t *o, const uint64_t *a) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) { static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
__m128i D1[2]; __m128i D1[2];
__m128i D0[2], D2[2]; __m128i D0[2], D2[2];
__m128i Al = _mm_loadu_si128(A); __m128i Al = _mm_loadu_si128(A);
@ -146,7 +125,7 @@ inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[2], D1[2], D2[2], SAA, SBB; __m256i D0[2], D1[2], D2[2], SAA, SBB;
__m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B; __m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B;
@ -176,7 +155,7 @@ inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[4], D1[4], D2[4], SAA[2], SBB[2]; __m256i D0[4], D1[4], D2[4], SAA[2], SBB[2];
karat_mult_2( D0, A, B); karat_mult_2( D0, A, B);
@ -213,7 +192,7 @@ inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
karat_mult_4( D0, A, B); karat_mult_4( D0, A, B);
@ -252,7 +231,7 @@ inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
karat_mult_8( D0, A, B); karat_mult_8( D0, A, B);
@ -291,7 +270,7 @@ inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
karat_mult_16( D0, A, B); karat_mult_16( D0, A, B);
@ -351,7 +330,7 @@ static inline void divByXplus1(__m256i *out, __m256i *in, int size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256]; static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256];
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
@ -540,7 +519,7 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -572,7 +551,7 @@ static inline void divByXplus1_256(__m256i *out, __m256i *in, int32_t size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i U0[T_TM3R_3W_256 + 2], V0[T_TM3R_3W_256 + 2], U1[T_TM3R_3W_256 + 2], V1[T_TM3R_3W_256 + 2], U2[T_TM3R_3W_256 + 2], V2[T_TM3R_3W_256 + 2]; __m256i U0[T_TM3R_3W_256 + 2], V0[T_TM3R_3W_256 + 2], U1[T_TM3R_3W_256 + 2], V1[T_TM3R_3W_256 + 2], U2[T_TM3R_3W_256 + 2], V2[T_TM3R_3W_256 + 2];
__m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)]; __m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)];
__m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3]; __m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3];
@ -617,7 +596,7 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
//W1 = W2 * W3 //W1 = W2 * W3
TOOM3Mult((uint64_t *) W1, (uint64_t *) W2, (uint64_t *) W3); TOOM3Mult(W1, (uint64_t *) W2, (uint64_t *) W3);
//W0 =(U1 + U2*x)*x ; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 + 2 !) //W0 =(U1 + U2*x)*x ; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 + 2 !)
W0[0] = zero; W0[0] = zero;
W4[0] = zero; W4[0] = zero;
@ -645,17 +624,17 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
//W3 = W3 * W2 ; W2 = W0 * W4 //W3 = W3 * W2 ; W2 = W0 * W4
TOOM3Mult((uint64_t *) tmp, (uint64_t *) W3, (uint64_t *) W2); TOOM3Mult(tmp, (uint64_t *) W3, (uint64_t *) W2);
for (int32_t i = 0 ; i < 2 * (T_TM3R_3W_256 + 2) ; i++) { for (int32_t i = 0 ; i < 2 * (T_TM3R_3W_256 + 2) ; i++) {
W3[i] = tmp[i]; W3[i] = tmp[i];
} }
TOOM3Mult((uint64_t *) W2, (uint64_t *) W0, (uint64_t *) W4); TOOM3Mult(W2, (uint64_t *) W0, (uint64_t *) W4);
//W4 = U2 * V2 ; W0 = U0 * V0 //W4 = U2 * V2 ; W0 = U0 * V0
TOOM3Mult((uint64_t *) W4, (uint64_t *) U2, (uint64_t *) V2); TOOM3Mult(W4, (uint64_t *) U2, (uint64_t *) V2);
TOOM3Mult((uint64_t *) W0, (uint64_t *) U0, (uint64_t *) V0); TOOM3Mult(W0, (uint64_t *) U0, (uint64_t *) V0);
//Interpolation phase //Interpolation phase
//9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
@ -737,7 +716,7 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = 0 ; i < 6 * T_TM3R_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3R_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -756,8 +735,8 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
*/ */
void PQCLEAN_HQC256_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) { void PQCLEAN_HQC256_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) {
TOOM3RecMult(a1_times_a2, a1, a2); TOOM3RecMult(a1_times_a2, a1, a2);
reduce(o, a1_times_a2); reduce(o, (uint64_t *)a1_times_a2);
// clear all // clear all
memset(a1_times_a2, 0, (VEC_N_SIZE_64 << 1) * sizeof(uint64_t)); memset(a1_times_a2, 0, (2 * VEC_N_SIZE_256 + 1) * sizeof(__m256i));
} }

View File

@ -14,25 +14,21 @@
#define T_TM3_3W_256 32 #define T_TM3_3W_256 32
#define T_TM3_3W_64 128 #define T_TM3_3W_64 128
#define VEC_N_ARRAY_SIZE_VEC CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/ #define VEC_N_SIZE_256 CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/
#define WORD 64 __m256i a1_times_a2[2 * VEC_N_SIZE_256 + 1];
#define LAST64 (PARAM_N >> 6)
uint64_t a1_times_a2[2 * VEC_N_256_SIZE_64 + 1];
uint64_t tmp_reduce[VEC_N_ARRAY_SIZE_VEC << 2];
__m256i *o256 = (__m256i *) tmp_reduce;
uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
static inline void reduce(uint64_t *o, const uint64_t *a); static inline void reduce(uint64_t *o, const uint64_t *a);
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B); static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B);
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B);
static inline void divByXplus1(__m256i *out, __m256i *in, int size); static inline void divByXplus1(__m256i *out, __m256i *in, int size);
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B);
@ -45,33 +41,16 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B);
* @param[in] a Pointer to the polynomial a(x) * @param[in] a Pointer to the polynomial a(x)
*/ */
static inline void reduce(uint64_t *o, const uint64_t *a) { static inline void reduce(uint64_t *o, const uint64_t *a) {
__m256i r256, carry256; uint64_t r;
__m256i *a256 = (__m256i *) a; uint64_t carry;
static const int32_t dec64 = PARAM_N & 0x3f;
static const int32_t d0 = WORD - dec64;
int32_t i, i2;
for (i = LAST64 ; i < (PARAM_N >> 5) - 4 ; i += 4) { for (uint32_t i = 0 ; i < VEC_N_SIZE_64 ; i++) {
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); r = a[i + VEC_N_SIZE_64 - 1] >> (PARAM_N & 63);
r256 = _mm256_srli_epi64(r256, dec64); carry = (uint64_t) (a[i + VEC_N_SIZE_64] << (64 - (PARAM_N & 63)));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1])); o[i] = a[i] ^ r ^ carry;
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
} }
r256 = (__m256i) { o[VEC_N_SIZE_64 - 1] &= RED_MASK;
a[i], a[i + 1], 0x0UL, 0x0UL
};
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1]));
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = (a256[i2] ^ r256);
tmp_reduce[LAST64] &= RED_MASK;
memcpy(o, tmp_reduce, VEC_N_SIZE_BYTES);
} }
/** /**
@ -83,7 +62,7 @@ static inline void reduce(uint64_t *o, const uint64_t *a) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) { static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
__m128i D1[2]; __m128i D1[2];
__m128i D0[2], D2[2]; __m128i D0[2], D2[2];
__m128i Al = _mm_loadu_si128(A); __m128i Al = _mm_loadu_si128(A);
@ -140,7 +119,7 @@ inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[2], D1[2], D2[2], SAA, SBB; __m256i D0[2], D1[2], D2[2], SAA, SBB;
__m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B; __m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B;
@ -170,7 +149,7 @@ inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[4], D1[4], D2[4], SAA[2], SBB[2]; __m256i D0[4], D1[4], D2[4], SAA[2], SBB[2];
karat_mult_2( D0, A, B); karat_mult_2( D0, A, B);
@ -207,7 +186,7 @@ inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
karat_mult_4( D0, A, B); karat_mult_4( D0, A, B);
@ -246,7 +225,7 @@ inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
karat_mult_8( D0, A, B); karat_mult_8( D0, A, B);
@ -285,7 +264,7 @@ inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
karat_mult_16( D0, A, B); karat_mult_16( D0, A, B);
@ -344,7 +323,7 @@ static inline void divByXplus1(__m256i *out, __m256i *in, int size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256]; static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256];
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
@ -533,7 +512,7 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -551,8 +530,8 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
*/ */
void PQCLEAN_HQCRMRS128_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) { void PQCLEAN_HQCRMRS128_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) {
TOOM3Mult(a1_times_a2, a1, a2); TOOM3Mult(a1_times_a2, a1, a2);
reduce(o, a1_times_a2); reduce(o, (uint64_t *)a1_times_a2);
// clear all // clear all
memset(a1_times_a2, 0, (VEC_N_SIZE_64 << 1) * sizeof(uint64_t)); memset(a1_times_a2, 0, (2 * VEC_N_SIZE_256 + 1) * sizeof(__m256i));
} }

View File

@ -14,26 +14,22 @@
#define T_TM3_3W_256 64 #define T_TM3_3W_256 64
#define T_TM3_3W_64 256 #define T_TM3_3W_64 256
#define VEC_N_ARRAY_SIZE_VEC CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/ #define VEC_N_SIZE_256 CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/
#define WORD 64 __m256i a1_times_a2[2 * VEC_N_SIZE_256 + 1];
#define LAST64 (PARAM_N >> 6)
uint64_t a1_times_a2[2 * VEC_N_256_SIZE_64 + 1];
uint64_t tmp_reduce[VEC_N_ARRAY_SIZE_VEC << 2];
__m256i *o256 = (__m256i *) tmp_reduce;
uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
static inline void reduce(uint64_t *o, const uint64_t *a); static inline void reduce(uint64_t *o, const uint64_t *a);
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B); static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B);
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_64(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B);
static inline void divByXplus1(__m256i *out, __m256i *in, int size); static inline void divByXplus1(__m256i *out, __m256i *in, int size);
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B);
@ -46,43 +42,18 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B);
* @param[in] a Pointer to the polynomial a(x) * @param[in] a Pointer to the polynomial a(x)
*/ */
static inline void reduce(uint64_t *o, const uint64_t *a) { static inline void reduce(uint64_t *o, const uint64_t *a) {
__m256i r256, carry256; uint64_t r;
__m256i *a256 = (__m256i *) a; uint64_t carry;
static const int32_t dec64 = PARAM_N & 0x3f;
static const int32_t d0 = WORD - dec64;
int32_t i, i2;
for (i = LAST64 ; i < (PARAM_N >> 5) - 4 ; i += 4) { for (uint32_t i = 0 ; i < VEC_N_SIZE_64 ; i++) {
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); r = a[i + VEC_N_SIZE_64 - 1] >> (PARAM_N & 63);
r256 = _mm256_srli_epi64(r256, dec64); carry = (uint64_t) (a[i + VEC_N_SIZE_64] << (64 - (PARAM_N & 63)));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1])); o[i] = a[i] ^ r ^ carry;
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
} }
r256 = _mm256_lddqu_si256((__m256i const *)(& a[i])); o[VEC_N_SIZE_64 - 1] &= RED_MASK;
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_lddqu_si256((__m256i const *)(& a[i + 1]));
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
i += 4;
r256 = _mm256_lddqu_si256((__m256i const *)(& a[i]));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1]));
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = (a256[i2] ^ r256);
tmp_reduce[LAST64] &= RED_MASK;
memcpy(o, tmp_reduce, VEC_N_SIZE_BYTES);
} }
/** /**
* @brief Compute C(x) = A(x)*B(x) * @brief Compute C(x) = A(x)*B(x)
* A(x) and B(x) are stored in 128-bit registers * A(x) and B(x) are stored in 128-bit registers
@ -92,7 +63,7 @@ static inline void reduce(uint64_t *o, const uint64_t *a) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) { static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
__m128i D1[2]; __m128i D1[2];
__m128i D0[2], D2[2]; __m128i D0[2], D2[2];
__m128i Al = _mm_loadu_si128(A); __m128i Al = _mm_loadu_si128(A);
@ -149,7 +120,7 @@ inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[2], D1[2], D2[2], SAA, SBB; __m256i D0[2], D1[2], D2[2], SAA, SBB;
__m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B; __m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B;
@ -179,7 +150,7 @@ inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[4], D1[4], D2[4], SAA[2], SBB[2]; __m256i D0[4], D1[4], D2[4], SAA[2], SBB[2];
karat_mult_2( D0, A, B); karat_mult_2( D0, A, B);
@ -216,7 +187,7 @@ inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
karat_mult_4( D0, A, B); karat_mult_4( D0, A, B);
@ -255,7 +226,7 @@ inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
karat_mult_8( D0, A, B); karat_mult_8( D0, A, B);
@ -294,7 +265,7 @@ inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
karat_mult_16( D0, A, B); karat_mult_16( D0, A, B);
@ -333,7 +304,7 @@ inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[64], D1[64], D2[64], SAA[32], SBB[32]; __m256i D0[64], D1[64], D2[64], SAA[32], SBB[32];
karat_mult_32( D0, A, B); karat_mult_32( D0, A, B);
@ -392,7 +363,7 @@ static inline void divByXplus1(__m256i *out, __m256i *in, int size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256]; static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256];
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
@ -580,7 +551,7 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -599,8 +570,8 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
*/ */
void PQCLEAN_HQCRMRS192_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) { void PQCLEAN_HQCRMRS192_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) {
TOOM3Mult(a1_times_a2, a1, a2); TOOM3Mult(a1_times_a2, a1, a2);
reduce(o, a1_times_a2); reduce(o, (uint64_t *)a1_times_a2);
// clear all // clear all
memset(a1_times_a2, 0, (VEC_N_SIZE_64 << 1) * sizeof(uint64_t)); memset(a1_times_a2, 0, (2 * VEC_N_SIZE_256 + 1) * sizeof(__m256i));
} }

View File

@ -18,27 +18,23 @@
#define T_TM3R_3W_256 93 #define T_TM3R_3W_256 93
#define T_TM3R_3W_64 (T_TM3R_3W_256<<2) #define T_TM3R_3W_64 (T_TM3R_3W_256<<2)
#define VEC_N_ARRAY_SIZE_VEC CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/ #define VEC_N_SIZE_256 CEIL_DIVIDE(PARAM_N, 256) /*!< The number of needed vectors to store PARAM_N bits*/
#define WORD 64 __m256i a1_times_a2[2 * VEC_N_SIZE_256 + 1];
#define LAST64 (PARAM_N >> 6)
uint64_t a1_times_a2[2 * VEC_N_256_SIZE_64 + 1];
uint64_t tmp_reduce[VEC_N_ARRAY_SIZE_VEC << 2];
__m256i *o256 = (__m256i *) tmp_reduce;
uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bloc64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight uint64_t bit64[PARAM_OMEGA_R]; // Allocation with the biggest possible weight
static inline void reduce(uint64_t *o, const uint64_t *a); static inline void reduce(uint64_t *o, const uint64_t *a);
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B); static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B);
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B);
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B); static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B);
static inline void divByXplus1(__m256i *out, __m256i *in, int size); static inline void divByXplus1(__m256i *out, __m256i *in, int size);
static inline void divByXplus1_256(__m256i *out, __m256i *in, int size); static inline void divByXplus1_256(__m256i *out, __m256i *in, int size);
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B);
static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B); static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B);
@ -51,35 +47,18 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B);
* @param[in] a Pointer to the polynomial a(x) * @param[in] a Pointer to the polynomial a(x)
*/ */
static inline void reduce(uint64_t *o, const uint64_t *a) { static inline void reduce(uint64_t *o, const uint64_t *a) {
__m256i r256, carry256; uint64_t r;
__m256i *a256 = (__m256i *) a; uint64_t carry;
static const int32_t dec64 = PARAM_N & 0x3f;
static const int32_t d0 = WORD - dec64;
int32_t i, i2;
for (i = LAST64 ; i < (PARAM_N >> 5) - 4 ; i += 4) { for (uint32_t i = 0 ; i < VEC_N_SIZE_64 ; i++) {
r256 = _mm256_lddqu_si256((__m256i const *) (& a[i])); r = a[i + VEC_N_SIZE_64 - 1] >> (PARAM_N & 63);
r256 = _mm256_srli_epi64(r256, dec64); carry = (uint64_t) (a[i + VEC_N_SIZE_64] << (64 - (PARAM_N & 63)));
carry256 = _mm256_lddqu_si256((__m256i const *) (& a[i + 1])); o[i] = a[i] ^ r ^ carry;
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
} }
r256 = _mm256_lddqu_si256((__m256i const *)(& a[i])); o[VEC_N_SIZE_64 - 1] &= RED_MASK;
carry256 = _mm256_lddqu_si256((__m256i const *)(& a[i + 1]));
r256 = _mm256_srli_epi64(r256, dec64);
carry256 = _mm256_slli_epi64(carry256, d0);
r256 ^= carry256;
i2 = (i - LAST64) >> 2;
o256[i2] = a256[i2] ^ r256;
tmp_reduce[LAST64] &= RED_MASK;
memcpy(o, tmp_reduce, VEC_N_SIZE_BYTES);
} }
/** /**
* @brief Compute C(x) = A(x)*B(x) * @brief Compute C(x) = A(x)*B(x)
* A(x) and B(x) are stored in 128-bit registers * A(x) and B(x) are stored in 128-bit registers
@ -89,7 +68,7 @@ static inline void reduce(uint64_t *o, const uint64_t *a) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) { static inline void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
__m128i D1[2]; __m128i D1[2];
__m128i D0[2], D2[2]; __m128i D0[2], D2[2];
__m128i Al = _mm_loadu_si128(A); __m128i Al = _mm_loadu_si128(A);
@ -146,7 +125,7 @@ inline static void karat_mult_1(__m128i *C, __m128i *A, __m128i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[2], D1[2], D2[2], SAA, SBB; __m256i D0[2], D1[2], D2[2], SAA, SBB;
__m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B; __m128i *A128 = (__m128i *)A, *B128 = (__m128i *)B;
@ -176,7 +155,7 @@ inline static void karat_mult_2(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[4], D1[4], D2[4], SAA[2], SBB[2]; __m256i D0[4], D1[4], D2[4], SAA[2], SBB[2];
karat_mult_2( D0, A, B); karat_mult_2( D0, A, B);
@ -213,7 +192,7 @@ inline static void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
karat_mult_4( D0, A, B); karat_mult_4( D0, A, B);
@ -252,7 +231,7 @@ inline static void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
karat_mult_8( D0, A, B); karat_mult_8( D0, A, B);
@ -291,7 +270,7 @@ inline static void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
inline static void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
karat_mult_16( D0, A, B); karat_mult_16( D0, A, B);
@ -351,7 +330,7 @@ static inline void divByXplus1(__m256i *out, __m256i *in, int size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256]; static __m256i U0[T_TM3_3W_256], V0[T_TM3_3W_256], U1[T_TM3_3W_256], V1[T_TM3_3W_256], U2[T_TM3_3W_256], V2[T_TM3_3W_256];
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
@ -540,7 +519,7 @@ static void TOOM3Mult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -572,7 +551,7 @@ static inline void divByXplus1_256(__m256i *out, __m256i *in, int32_t size) {
* @param[in] A Pointer to the polynomial A(x) * @param[in] A Pointer to the polynomial A(x)
* @param[in] B Pointer to the polynomial B(x) * @param[in] B Pointer to the polynomial B(x)
*/ */
static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) { static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i U0[T_TM3R_3W_256 + 2], V0[T_TM3R_3W_256 + 2], U1[T_TM3R_3W_256 + 2], V1[T_TM3R_3W_256 + 2], U2[T_TM3R_3W_256 + 2], V2[T_TM3R_3W_256 + 2]; __m256i U0[T_TM3R_3W_256 + 2], V0[T_TM3R_3W_256 + 2], U1[T_TM3R_3W_256 + 2], V1[T_TM3R_3W_256 + 2], U2[T_TM3R_3W_256 + 2], V2[T_TM3R_3W_256 + 2];
__m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)]; __m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)];
__m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3]; __m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3];
@ -617,7 +596,7 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
//W1 = W2 * W3 //W1 = W2 * W3
TOOM3Mult((uint64_t *) W1, (uint64_t *) W2, (uint64_t *) W3); TOOM3Mult(W1, (uint64_t *) W2, (uint64_t *) W3);
//W0 =(U1 + U2*x)*x ; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 + 2 !) //W0 =(U1 + U2*x)*x ; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 + 2 !)
W0[0] = zero; W0[0] = zero;
W4[0] = zero; W4[0] = zero;
@ -645,17 +624,17 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
} }
//W3 = W3 * W2 ; W2 = W0 * W4 //W3 = W3 * W2 ; W2 = W0 * W4
TOOM3Mult((uint64_t *) tmp, (uint64_t *) W3, (uint64_t *) W2); TOOM3Mult(tmp, (uint64_t *) W3, (uint64_t *) W2);
for (int32_t i = 0 ; i < 2 * (T_TM3R_3W_256 + 2) ; i++) { for (int32_t i = 0 ; i < 2 * (T_TM3R_3W_256 + 2) ; i++) {
W3[i] = tmp[i]; W3[i] = tmp[i];
} }
TOOM3Mult((uint64_t *) W2, (uint64_t *) W0, (uint64_t *) W4); TOOM3Mult(W2, (uint64_t *) W0, (uint64_t *) W4);
//W4 = U2 * V2 ; W0 = U0 * V0 //W4 = U2 * V2 ; W0 = U0 * V0
TOOM3Mult((uint64_t *) W4, (uint64_t *) U2, (uint64_t *) V2); TOOM3Mult(W4, (uint64_t *) U2, (uint64_t *) V2);
TOOM3Mult((uint64_t *) W0, (uint64_t *) U0, (uint64_t *) V0); TOOM3Mult(W0, (uint64_t *) U0, (uint64_t *) V0);
//Interpolation phase //Interpolation phase
//9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
@ -737,7 +716,7 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = 0 ; i < 6 * T_TM3R_3W_256 - 2 ; i++) { for (int32_t i = 0 ; i < 6 * T_TM3R_3W_256 - 2 ; i++) {
uint64_t *out64 = Out + (i << 2); uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
_mm256_storeu_si256((__m256i *)out64, ro256[i]); _mm256_storeu_si256((__m256i *)out64, ro256[i]);
} }
} }
@ -756,8 +735,8 @@ static void TOOM3RecMult(uint64_t *Out, const uint64_t *A, const uint64_t *B) {
*/ */
void PQCLEAN_HQCRMRS256_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) { void PQCLEAN_HQCRMRS256_AVX2_vect_mul(uint64_t *o, const uint64_t *a1, const uint64_t *a2) {
TOOM3RecMult(a1_times_a2, a1, a2); TOOM3RecMult(a1_times_a2, a1, a2);
reduce(o, a1_times_a2); reduce(o, (uint64_t *)a1_times_a2);
// clear all // clear all
memset(a1_times_a2, 0, (VEC_N_SIZE_64 << 1) * sizeof(uint64_t)); memset(a1_times_a2, 0, (2 * VEC_N_SIZE_256 + 1) * sizeof(__m256i));
} }

View File

@ -68,6 +68,7 @@ consistency_checks:
- hqc.h - hqc.h
- parsing.h - parsing.h
- vector.h - vector.h
- gf2x.c
- hqc.c - hqc.c
- kem.c - kem.c
- parsing.c - parsing.c