1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-26 09:21:28 +00:00

compiler warnings

This commit is contained in:
John M. Schanck 2020-09-10 16:26:03 -04:00 committed by Kris Kwiatkowski
parent 63d033cf02
commit 1f4fa5ec3e
27 changed files with 686 additions and 849 deletions

View File

@ -19,6 +19,7 @@
static void compute_fft_betas(uint16_t *betas); static void compute_fft_betas(uint16_t *betas);
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size); static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size);
static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f); static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas); static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas);
@ -28,7 +29,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[out] betas Array of size PARAM_M-1 * @param[out] betas Array of size PARAM_M-1
*/ */
// Side-by-side diff rows: each row shows the pre-commit text followed by the post-commit text.
// In both versions the function stores betas[i] = 1 << (PARAM_M - 1 - i) for every i in [0, PARAM_M - 1).
static void compute_fft_betas(uint16_t *betas) { static void compute_fft_betas(uint16_t *betas) {
// The only change in this hunk: the loop index is declared ahead of the loop instead of inside the for statement.
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { size_t i;
for (i = 0 ; i < PARAM_M - 1 ; ++i) {
betas[i] = 1 << (PARAM_M - 1 - i); betas[i] = 1 << (PARAM_M - 1 - i);
} }
} }
@ -46,10 +48,11 @@ static void compute_fft_betas(uint16_t *betas) {
* @param[in] set_size Size of the array set * @param[in] set_size Size of the array set
*/ */
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) { static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) {
size_t i, j;
subset_sums[0] = 0; subset_sums[0] = 0;
for (size_t i = 0 ; i < set_size ; ++i) { for (i = 0 ; i < set_size ; ++i) {
for (size_t j = 0 ; j < (1U << i) ; ++j) { for (j = 0 ; j < (1U << i) ; ++j) {
subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j]; subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j];
} }
} }
@ -89,7 +92,7 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[2] = f[3] ^ f1[1] ^ f0[3]; f1[2] = f[3] ^ f1[1] ^ f0[3];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 3: case 3:
f0[0] = f[0]; f0[0] = f[0];
@ -100,37 +103,43 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[3] = f[7]; f1[3] = f[7];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 2: case 2:
f0[0] = f[0]; f0[0] = f[0];
f0[1] = f[2] ^ f[3]; f0[1] = f[2] ^ f[3];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
f1[1] = f[3]; f1[1] = f[3];
return; break;
case 1: case 1:
f0[0] = f[0]; f0[0] = f[0];
f1[0] = f[1]; f1[0] = f[1];
return; break;
default: default:
; radix_big(f0, f1, f, m_f);
size_t n = 1 << (m_f - 2); break;
}
}
uint16_t Q[2 * (1 << (PARAM_FFT - 2))]; static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
uint16_t R[2 * (1 << (PARAM_FFT - 2))]; uint16_t Q[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t R[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t Q0[1 << (PARAM_FFT - 2)]; uint16_t Q0[1 << (PARAM_FFT - 2)] = {0};
uint16_t Q1[1 << (PARAM_FFT - 2)]; uint16_t Q1[1 << (PARAM_FFT - 2)] = {0};
uint16_t R0[1 << (PARAM_FFT - 2)]; uint16_t R0[1 << (PARAM_FFT - 2)] = {0};
uint16_t R1[1 << (PARAM_FFT - 2)]; uint16_t R1[1 << (PARAM_FFT - 2)] = {0};
size_t i, n;
n = 1 << (m_f - 2);
memcpy(Q, f + 3 * n, 2 * n); memcpy(Q, f + 3 * n, 2 * n);
memcpy(Q + n, f + 3 * n, 2 * n); memcpy(Q + n, f + 3 * n, 2 * n);
memcpy(R, f, 4 * n); memcpy(R, f, 4 * n);
for (size_t i = 0 ; i < n ; ++i) { for (i = 0 ; i < n ; ++i) {
Q[i] ^= f[2 * n + i]; Q[i] ^= f[2 * n + i];
R[n + i] ^= Q[i]; R[n + i] ^= Q[i];
} }
@ -142,7 +151,6 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
memcpy(f0 + n, Q0, 2 * n); memcpy(f0 + n, Q0, 2 * n);
memcpy(f1, R1, 2 * n); memcpy(f1, R1, 2 * n);
memcpy(f1 + n, Q1, 2 * n); memcpy(f1 + n, Q1, 2 * n);
}
} }
@ -160,25 +168,27 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
* @param[in] betas FFT constants * @param[in] betas FFT constants
*/ */
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) { static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) {
uint16_t f0[1 << (PARAM_FFT - 2)]; uint16_t f0[1 << (PARAM_FFT - 2)] = {0};
uint16_t f1[1 << (PARAM_FFT - 2)]; uint16_t f1[1 << (PARAM_FFT - 2)] = {0};
uint16_t gammas[PARAM_M - 2]; uint16_t gammas[PARAM_M - 2] = {0};
uint16_t deltas[PARAM_M - 2]; uint16_t deltas[PARAM_M - 2] = {0};
size_t k = 1 << (m - 1); uint16_t gammas_sums[1 << (PARAM_M - 2)] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 2)];
uint16_t u[1 << (PARAM_M - 2)] = {0}; uint16_t u[1 << (PARAM_M - 2)] = {0};
uint16_t v[1 << (PARAM_M - 2)] = {0}; uint16_t v[1 << (PARAM_M - 2)] = {0};
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)] = {0};
uint16_t beta_m_pow;
size_t i, j, k;
// Step 1 // Step 1
if (m_f == 1) { if (m_f == 1) {
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)]; for (i = 0 ; i < m ; ++i) {
for (size_t i = 0 ; i < m ; ++i) {
tmp[i] = PQCLEAN_HQC128_AVX2_gf_mul(betas[i], f[1]); tmp[i] = PQCLEAN_HQC128_AVX2_gf_mul(betas[i], f[1]);
} }
w[0] = f[0]; w[0] = f[0];
for (size_t j = 0 ; j < m ; ++j) { for (j = 0 ; j < m ; ++j) {
for (size_t k = 0 ; k < (1U << j) ; ++k) { for (k = 0 ; k < (1U << j) ; ++k) {
w[(1 << j) + k] = w[k] ^ tmp[j]; w[(1 << j) + k] = w[k] ^ tmp[j];
} }
} }
@ -188,8 +198,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 2: compute g // Step 2: compute g
if (betas[m - 1] != 1) { if (betas[m - 1] != 1) {
uint16_t beta_m_pow = 1; beta_m_pow = 1;
for (size_t i = 1 ; i < (1U << m_f) ; ++i) { for (i = 1 ; i < (1U << m_f) ; ++i) {
beta_m_pow = PQCLEAN_HQC128_AVX2_gf_mul(beta_m_pow, betas[m - 1]); beta_m_pow = PQCLEAN_HQC128_AVX2_gf_mul(beta_m_pow, betas[m - 1]);
f[i] = PQCLEAN_HQC128_AVX2_gf_mul(beta_m_pow, f[i]); f[i] = PQCLEAN_HQC128_AVX2_gf_mul(beta_m_pow, f[i]);
} }
@ -199,7 +209,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
radix(f0, f1, f, m_f); radix(f0, f1, f, m_f);
// Step 4: compute gammas and deltas // Step 4: compute gammas and deltas
for (uint8_t i = 0 ; i < m - 1 ; ++i) { for (i = 0 ; i + 1 < m ; ++i) {
gammas[i] = PQCLEAN_HQC128_AVX2_gf_mul(betas[i], PQCLEAN_HQC128_AVX2_gf_inverse(betas[m - 1])); gammas[i] = PQCLEAN_HQC128_AVX2_gf_mul(betas[i], PQCLEAN_HQC128_AVX2_gf_inverse(betas[m - 1]));
deltas[i] = PQCLEAN_HQC128_AVX2_gf_square(gammas[i]) ^ gammas[i]; deltas[i] = PQCLEAN_HQC128_AVX2_gf_square(gammas[i]) ^ gammas[i];
} }
@ -210,10 +220,11 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 5 // Step 5
fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas);
k = 1 << ((m - 1) & 0xf); // &0xf is to let the compiler know that m-1 is small.
if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant
w[0] = u[0]; w[0] = u[0];
w[k] = u[0] ^ f1[0]; w[k] = u[0] ^ f1[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC128_AVX2_gf_mul(gammas_sums[i], f1[0]); w[i] = u[i] ^ PQCLEAN_HQC128_AVX2_gf_mul(gammas_sums[i], f1[0]);
w[k + i] = w[i] ^ f1[0]; w[k + i] = w[i] ^ f1[0];
} }
@ -224,7 +235,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
w[0] = u[0]; w[0] = u[0];
w[k] ^= u[0]; w[k] ^= u[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC128_AVX2_gf_mul(gammas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQC128_AVX2_gf_mul(gammas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -253,14 +264,15 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[in] f_coeffs Number of coefficients of f (i.e. deg(f)+1) * @param[in] f_coeffs Number of coefficients of f (i.e. deg(f)+1)
*/ */
void PQCLEAN_HQC128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) { void PQCLEAN_HQC128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
uint16_t betas[PARAM_M - 1]; uint16_t betas[PARAM_M - 1] = {0};
uint16_t betas_sums[1 << (PARAM_M - 1)]; uint16_t betas_sums[1 << (PARAM_M - 1)] = {0};
uint16_t f0[1 << (PARAM_FFT - 1)]; uint16_t f0[1 << (PARAM_FFT - 1)] = {0};
uint16_t f1[1 << (PARAM_FFT - 1)]; uint16_t f1[1 << (PARAM_FFT - 1)] = {0};
uint16_t deltas[PARAM_M - 1]; uint16_t deltas[PARAM_M - 1] = {0};
size_t k = 1 << (PARAM_M - 1); uint16_t u[1 << (PARAM_M - 1)] = {0};
uint16_t u[1 << (PARAM_M - 1)]; uint16_t v[1 << (PARAM_M - 1)] = {0};
uint16_t v[1 << (PARAM_M - 1)];
size_t i, k;
// Follows Gao and Mateer algorithm // Follows Gao and Mateer algorithm
compute_fft_betas(betas); compute_fft_betas(betas);
@ -276,7 +288,7 @@ void PQCLEAN_HQC128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
radix(f0, f1, f, PARAM_FFT); radix(f0, f1, f, PARAM_FFT);
// Step 4: Compute deltas // Step 4: Compute deltas
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { for (i = 0 ; i < PARAM_M - 1 ; ++i) {
deltas[i] = PQCLEAN_HQC128_AVX2_gf_square(betas[i]) ^ betas[i]; deltas[i] = PQCLEAN_HQC128_AVX2_gf_square(betas[i]) ^ betas[i];
} }
@ -284,6 +296,7 @@ void PQCLEAN_HQC128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
k = 1 << (PARAM_M - 1);
// Step 6, 7 and error polynomial computation // Step 6, 7 and error polynomial computation
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
@ -294,7 +307,7 @@ void PQCLEAN_HQC128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
w[k] ^= u[0]; w[k] ^= u[0];
// Find other roots // Find other roots
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC128_AVX2_gf_mul(betas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQC128_AVX2_gf_mul(betas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -309,25 +322,28 @@ void PQCLEAN_HQC128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
* @param[in] w Array of size 2^PARAM_M * @param[in] w Array of size 2^PARAM_M
*/ */
// Side-by-side diff rows: each row shows the pre-commit text followed by the post-commit text.
// Both versions XOR single bits into the uint64_t words of `error`, one bit per tested entry of `w`.
// The commit only hoists declarations to the top, zero-initialises the arrays and drops the (uint64_t) cast on the literal 1.
void PQCLEAN_HQC128_AVX2_fft_retrieve_bch_error_poly(uint64_t *error, const uint16_t *w) { void PQCLEAN_HQC128_AVX2_fft_retrieve_bch_error_poly(uint64_t *error, const uint16_t *w) {
uint16_t gammas[PARAM_M - 1]; uint16_t gammas[PARAM_M - 1] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 1)]; uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0};
size_t k = 1 << (PARAM_M - 1); uint64_t bit;
size_t index = PARAM_GF_MUL_ORDER; size_t i, k, index;
// gammas is filled by compute_fft_betas; gammas_sums receives the XOR subset sums of those PARAM_M - 1 values.
compute_fft_betas(gammas); compute_fft_betas(gammas);
compute_subset_sums(gammas_sums, gammas, PARAM_M - 1); compute_subset_sums(gammas_sums, gammas, PARAM_M - 1);
// `(uint16_t) -x >> 15` is 0 for x == 0 and 1 for x in 1..0x8000, so `1 ^ (...)` is 1 exactly when the tested entry is 0 within that range.
error[0] ^= ((uint64_t) 1) ^ ((uint16_t) - w[0] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15);
uint64_t bit = ((uint64_t) 1) ^ ((uint16_t) - w[k] >> 15);
k = 1 << (PARAM_M - 1);
index = PARAM_GF_MUL_ORDER;
bit = 1 ^ ((uint16_t) - w[k] >> 15);
// NOTE(review): this row selects the word with `index / 8`, while every row in the loop below uses `index / 64` with the same `index % 64` shift.
// Unchanged by this commit; confirm whether the divisor 8 is intended and that `error` is large enough for that word index.
error[index / 8] ^= bit << (index % 64); error[index / 8] ^= bit << (index % 64);
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
// Bit position for w[i]: PARAM_GF_MUL_ORDER minus the gf_log of gammas_sums[i].
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC128_AVX2_gf_log(gammas_sums[i]); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC128_AVX2_gf_log(gammas_sums[i]);
bit = ((uint64_t) 1) ^ ((uint16_t) - w[i] >> 15); bit = 1 ^ ((uint16_t) - w[i] >> 15);
error[index / 64] ^= bit << (index % 64); error[index / 64] ^= bit << (index % 64);
// Bit position for w[k + i]: same formula applied to gammas_sums[i] ^ 1.
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC128_AVX2_gf_log(gammas_sums[i] ^ 1); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC128_AVX2_gf_log(gammas_sums[i] ^ 1);
bit = ((uint64_t) 1) ^ ((uint16_t) - w[k + i] >> 15); bit = 1 ^ ((uint16_t) - w[k + i] >> 15);
error[index / 64] ^= bit << (index % 64); error[index / 64] ^= bit << (index % 64);
} }
} }

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -328,9 +328,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3_3W_64 << 1; int32_t T2 = T_TM3_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) { for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) {
@ -347,24 +345,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) { for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) {
int32_t i4 = i << 2; int32_t i4 = i << 2;
int32_t i41 = i4 + 1; int32_t i41 = i4 + 1;
U0[i] = (__m256i) { U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
A[i4], A[i41], 0x0ul, 0x0ul V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
}; U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
V0[i] = (__m256i) { V1[i] = _mm256_set_epi64x(0, 0, B[i41 + T_TM3_3W_64 - 2], B[i4 + T_TM3_3W_64 - 2]);
B[i4], B[i41], 0x0ul, 0x0ul U2[i] = _mm256_set_epi64x(0, 0, A[i4 - 3 + T2], A[i4 - 4 + T2]);
}; V2[i] = _mm256_set_epi64x(0, 0, B[i4 - 3 + T2], B[i4 - 4 + T2]);
U1[i] = (__m256i) {
A[i4 + T_TM3_3W_64 - 2], A[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
V1[i] = (__m256i) {
B[i4 + T_TM3_3W_64 - 2], B[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
U2[i] = (__m256i) {
A[i4 - 4 + T2], A[i4 - 3 + T2], 0x0ul, 0x0ul
};
V2[i] = (__m256i) {
B[i4 - 4 + T2], B[i4 - 3 + T2], 0x0ul, 0x0ul
};
} }
// Evaluation phase : x= X^64 // Evaluation phase : x= X^64
@ -452,9 +438,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W2 =(W2 + W3 + W4*(x^3+1))/(x+1) //W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
U1_64 = ((int64_t *) W4); U1_64 = ((int64_t *) W4);
__m256i *U1_256 = (__m256i *) (U1_64 + 1); __m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ (__m256i) { tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);
0x0ul, 0x0ul, 0x0ul, U1_64[0]
};
for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) { for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -19,6 +19,7 @@
static void compute_fft_betas(uint16_t *betas); static void compute_fft_betas(uint16_t *betas);
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size); static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size);
static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f); static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas); static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas);
@ -28,7 +29,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[out] betas Array of size PARAM_M-1 * @param[out] betas Array of size PARAM_M-1
*/ */
// Side-by-side diff rows: each row shows the pre-commit text followed by the post-commit text.
// In both versions the function stores betas[i] = 1 << (PARAM_M - 1 - i) for every i in [0, PARAM_M - 1).
static void compute_fft_betas(uint16_t *betas) { static void compute_fft_betas(uint16_t *betas) {
// The only change in this hunk: the loop index is declared ahead of the loop instead of inside the for statement.
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { size_t i;
for (i = 0 ; i < PARAM_M - 1 ; ++i) {
betas[i] = 1 << (PARAM_M - 1 - i); betas[i] = 1 << (PARAM_M - 1 - i);
} }
} }
@ -46,10 +48,11 @@ static void compute_fft_betas(uint16_t *betas) {
* @param[in] set_size Size of the array set * @param[in] set_size Size of the array set
*/ */
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) { static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) {
size_t i, j;
subset_sums[0] = 0; subset_sums[0] = 0;
for (size_t i = 0 ; i < set_size ; ++i) { for (i = 0 ; i < set_size ; ++i) {
for (size_t j = 0 ; j < (1U << i) ; ++j) { for (j = 0 ; j < (1U << i) ; ++j) {
subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j]; subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j];
} }
} }
@ -89,7 +92,7 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[2] = f[3] ^ f1[1] ^ f0[3]; f1[2] = f[3] ^ f1[1] ^ f0[3];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 3: case 3:
f0[0] = f[0]; f0[0] = f[0];
@ -100,37 +103,43 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[3] = f[7]; f1[3] = f[7];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 2: case 2:
f0[0] = f[0]; f0[0] = f[0];
f0[1] = f[2] ^ f[3]; f0[1] = f[2] ^ f[3];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
f1[1] = f[3]; f1[1] = f[3];
return; break;
case 1: case 1:
f0[0] = f[0]; f0[0] = f[0];
f1[0] = f[1]; f1[0] = f[1];
return; break;
default: default:
; radix_big(f0, f1, f, m_f);
size_t n = 1 << (m_f - 2); break;
}
}
uint16_t Q[2 * (1 << (PARAM_FFT - 2))]; static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
uint16_t R[2 * (1 << (PARAM_FFT - 2))]; uint16_t Q[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t R[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t Q0[1 << (PARAM_FFT - 2)]; uint16_t Q0[1 << (PARAM_FFT - 2)] = {0};
uint16_t Q1[1 << (PARAM_FFT - 2)]; uint16_t Q1[1 << (PARAM_FFT - 2)] = {0};
uint16_t R0[1 << (PARAM_FFT - 2)]; uint16_t R0[1 << (PARAM_FFT - 2)] = {0};
uint16_t R1[1 << (PARAM_FFT - 2)]; uint16_t R1[1 << (PARAM_FFT - 2)] = {0};
size_t i, n;
n = 1 << (m_f - 2);
memcpy(Q, f + 3 * n, 2 * n); memcpy(Q, f + 3 * n, 2 * n);
memcpy(Q + n, f + 3 * n, 2 * n); memcpy(Q + n, f + 3 * n, 2 * n);
memcpy(R, f, 4 * n); memcpy(R, f, 4 * n);
for (size_t i = 0 ; i < n ; ++i) { for (i = 0 ; i < n ; ++i) {
Q[i] ^= f[2 * n + i]; Q[i] ^= f[2 * n + i];
R[n + i] ^= Q[i]; R[n + i] ^= Q[i];
} }
@ -142,7 +151,6 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
memcpy(f0 + n, Q0, 2 * n); memcpy(f0 + n, Q0, 2 * n);
memcpy(f1, R1, 2 * n); memcpy(f1, R1, 2 * n);
memcpy(f1 + n, Q1, 2 * n); memcpy(f1 + n, Q1, 2 * n);
}
} }
@ -160,25 +168,27 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
* @param[in] betas FFT constants * @param[in] betas FFT constants
*/ */
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) { static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) {
uint16_t f0[1 << (PARAM_FFT - 2)]; uint16_t f0[1 << (PARAM_FFT - 2)] = {0};
uint16_t f1[1 << (PARAM_FFT - 2)]; uint16_t f1[1 << (PARAM_FFT - 2)] = {0};
uint16_t gammas[PARAM_M - 2]; uint16_t gammas[PARAM_M - 2] = {0};
uint16_t deltas[PARAM_M - 2]; uint16_t deltas[PARAM_M - 2] = {0};
size_t k = 1 << (m - 1); uint16_t gammas_sums[1 << (PARAM_M - 2)] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 2)];
uint16_t u[1 << (PARAM_M - 2)] = {0}; uint16_t u[1 << (PARAM_M - 2)] = {0};
uint16_t v[1 << (PARAM_M - 2)] = {0}; uint16_t v[1 << (PARAM_M - 2)] = {0};
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)] = {0};
uint16_t beta_m_pow;
size_t i, j, k;
// Step 1 // Step 1
if (m_f == 1) { if (m_f == 1) {
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)]; for (i = 0 ; i < m ; ++i) {
for (size_t i = 0 ; i < m ; ++i) {
tmp[i] = PQCLEAN_HQC192_AVX2_gf_mul(betas[i], f[1]); tmp[i] = PQCLEAN_HQC192_AVX2_gf_mul(betas[i], f[1]);
} }
w[0] = f[0]; w[0] = f[0];
for (size_t j = 0 ; j < m ; ++j) { for (j = 0 ; j < m ; ++j) {
for (size_t k = 0 ; k < (1U << j) ; ++k) { for (k = 0 ; k < (1U << j) ; ++k) {
w[(1 << j) + k] = w[k] ^ tmp[j]; w[(1 << j) + k] = w[k] ^ tmp[j];
} }
} }
@ -188,8 +198,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 2: compute g // Step 2: compute g
if (betas[m - 1] != 1) { if (betas[m - 1] != 1) {
uint16_t beta_m_pow = 1; beta_m_pow = 1;
for (size_t i = 1 ; i < (1U << m_f) ; ++i) { for (i = 1 ; i < (1U << m_f) ; ++i) {
beta_m_pow = PQCLEAN_HQC192_AVX2_gf_mul(beta_m_pow, betas[m - 1]); beta_m_pow = PQCLEAN_HQC192_AVX2_gf_mul(beta_m_pow, betas[m - 1]);
f[i] = PQCLEAN_HQC192_AVX2_gf_mul(beta_m_pow, f[i]); f[i] = PQCLEAN_HQC192_AVX2_gf_mul(beta_m_pow, f[i]);
} }
@ -199,7 +209,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
radix(f0, f1, f, m_f); radix(f0, f1, f, m_f);
// Step 4: compute gammas and deltas // Step 4: compute gammas and deltas
for (uint8_t i = 0 ; i < m - 1 ; ++i) { for (i = 0 ; i + 1 < m ; ++i) {
gammas[i] = PQCLEAN_HQC192_AVX2_gf_mul(betas[i], PQCLEAN_HQC192_AVX2_gf_inverse(betas[m - 1])); gammas[i] = PQCLEAN_HQC192_AVX2_gf_mul(betas[i], PQCLEAN_HQC192_AVX2_gf_inverse(betas[m - 1]));
deltas[i] = PQCLEAN_HQC192_AVX2_gf_square(gammas[i]) ^ gammas[i]; deltas[i] = PQCLEAN_HQC192_AVX2_gf_square(gammas[i]) ^ gammas[i];
} }
@ -210,10 +220,11 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 5 // Step 5
fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas);
k = 1 << ((m - 1) & 0xf); // &0xf is to let the compiler know that m-1 is small.
if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant
w[0] = u[0]; w[0] = u[0];
w[k] = u[0] ^ f1[0]; w[k] = u[0] ^ f1[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC192_AVX2_gf_mul(gammas_sums[i], f1[0]); w[i] = u[i] ^ PQCLEAN_HQC192_AVX2_gf_mul(gammas_sums[i], f1[0]);
w[k + i] = w[i] ^ f1[0]; w[k + i] = w[i] ^ f1[0];
} }
@ -224,7 +235,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
w[0] = u[0]; w[0] = u[0];
w[k] ^= u[0]; w[k] ^= u[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC192_AVX2_gf_mul(gammas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQC192_AVX2_gf_mul(gammas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -253,14 +264,15 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[in] f_coeffs Number of coefficients of f (i.e. deg(f)+1) * @param[in] f_coeffs Number of coefficients of f (i.e. deg(f)+1)
*/ */
void PQCLEAN_HQC192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) { void PQCLEAN_HQC192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
uint16_t betas[PARAM_M - 1]; uint16_t betas[PARAM_M - 1] = {0};
uint16_t betas_sums[1 << (PARAM_M - 1)]; uint16_t betas_sums[1 << (PARAM_M - 1)] = {0};
uint16_t f0[1 << (PARAM_FFT - 1)]; uint16_t f0[1 << (PARAM_FFT - 1)] = {0};
uint16_t f1[1 << (PARAM_FFT - 1)]; uint16_t f1[1 << (PARAM_FFT - 1)] = {0};
uint16_t deltas[PARAM_M - 1]; uint16_t deltas[PARAM_M - 1] = {0};
size_t k = 1 << (PARAM_M - 1); uint16_t u[1 << (PARAM_M - 1)] = {0};
uint16_t u[1 << (PARAM_M - 1)]; uint16_t v[1 << (PARAM_M - 1)] = {0};
uint16_t v[1 << (PARAM_M - 1)];
size_t i, k;
// Follows Gao and Mateer algorithm // Follows Gao and Mateer algorithm
compute_fft_betas(betas); compute_fft_betas(betas);
@ -276,7 +288,7 @@ void PQCLEAN_HQC192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
radix(f0, f1, f, PARAM_FFT); radix(f0, f1, f, PARAM_FFT);
// Step 4: Compute deltas // Step 4: Compute deltas
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { for (i = 0 ; i < PARAM_M - 1 ; ++i) {
deltas[i] = PQCLEAN_HQC192_AVX2_gf_square(betas[i]) ^ betas[i]; deltas[i] = PQCLEAN_HQC192_AVX2_gf_square(betas[i]) ^ betas[i];
} }
@ -284,6 +296,7 @@ void PQCLEAN_HQC192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
k = 1 << (PARAM_M - 1);
// Step 6, 7 and error polynomial computation // Step 6, 7 and error polynomial computation
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
@ -294,7 +307,7 @@ void PQCLEAN_HQC192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
w[k] ^= u[0]; w[k] ^= u[0];
// Find other roots // Find other roots
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC192_AVX2_gf_mul(betas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQC192_AVX2_gf_mul(betas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -309,25 +322,28 @@ void PQCLEAN_HQC192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
* @param[in] w Array of size 2^PARAM_M * @param[in] w Array of size 2^PARAM_M
*/ */
// Side-by-side diff rows: each row shows the pre-commit text followed by the post-commit text.
// Both versions XOR single bits into the uint64_t words of `error`, one bit per tested entry of `w`.
// The commit only hoists declarations to the top, zero-initialises the arrays and drops the (uint64_t) cast on the literal 1.
void PQCLEAN_HQC192_AVX2_fft_retrieve_bch_error_poly(uint64_t *error, const uint16_t *w) { void PQCLEAN_HQC192_AVX2_fft_retrieve_bch_error_poly(uint64_t *error, const uint16_t *w) {
uint16_t gammas[PARAM_M - 1]; uint16_t gammas[PARAM_M - 1] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 1)]; uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0};
size_t k = 1 << (PARAM_M - 1); uint64_t bit;
size_t index = PARAM_GF_MUL_ORDER; size_t i, k, index;
// gammas is filled by compute_fft_betas; gammas_sums receives the XOR subset sums of those PARAM_M - 1 values.
compute_fft_betas(gammas); compute_fft_betas(gammas);
compute_subset_sums(gammas_sums, gammas, PARAM_M - 1); compute_subset_sums(gammas_sums, gammas, PARAM_M - 1);
// `(uint16_t) -x >> 15` is 0 for x == 0 and 1 for x in 1..0x8000, so `1 ^ (...)` is 1 exactly when the tested entry is 0 within that range.
error[0] ^= ((uint64_t) 1) ^ ((uint16_t) - w[0] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15);
uint64_t bit = ((uint64_t) 1) ^ ((uint16_t) - w[k] >> 15);
k = 1 << (PARAM_M - 1);
index = PARAM_GF_MUL_ORDER;
bit = 1 ^ ((uint16_t) - w[k] >> 15);
// NOTE(review): this row selects the word with `index / 8`, while every row in the loop below uses `index / 64` with the same `index % 64` shift.
// Unchanged by this commit; confirm whether the divisor 8 is intended and that `error` is large enough for that word index.
error[index / 8] ^= bit << (index % 64); error[index / 8] ^= bit << (index % 64);
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
// Bit position for w[i]: PARAM_GF_MUL_ORDER minus the gf_log of gammas_sums[i].
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC192_AVX2_gf_log(gammas_sums[i]); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC192_AVX2_gf_log(gammas_sums[i]);
bit = ((uint64_t) 1) ^ ((uint16_t) - w[i] >> 15); bit = 1 ^ ((uint16_t) - w[i] >> 15);
error[index / 64] ^= bit << (index % 64); error[index / 64] ^= bit << (index % 64);
// Bit position for w[k + i]: same formula applied to gammas_sums[i] ^ 1.
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC192_AVX2_gf_log(gammas_sums[i] ^ 1); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC192_AVX2_gf_log(gammas_sums[i] ^ 1);
bit = ((uint64_t) 1) ^ ((uint16_t) - w[k + i] >> 15); bit = 1 ^ ((uint16_t) - w[k + i] >> 15);
error[index / 64] ^= bit << (index % 64); error[index / 64] ^= bit << (index % 64);
} }
} }

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -368,9 +368,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3_3W_64 << 1; int32_t T2 = T_TM3_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) { for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) {
@ -387,24 +385,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) { for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) {
int32_t i4 = i << 2; int32_t i4 = i << 2;
int32_t i41 = i4 + 1; int32_t i41 = i4 + 1;
U0[i] = (__m256i) { U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
A[i4], A[i41], 0x0ul, 0x0ul V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
}; U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
V0[i] = (__m256i) { V1[i] = _mm256_set_epi64x(0, 0, B[i41 + T_TM3_3W_64 - 2], B[i4 + T_TM3_3W_64 - 2]);
B[i4], B[i41], 0x0ul, 0x0ul U2[i] = _mm256_set_epi64x(0, 0, A[i4 - 3 + T2], A[i4 - 4 + T2]);
}; V2[i] = _mm256_set_epi64x(0, 0, B[i4 - 3 + T2], B[i4 - 4 + T2]);
U1[i] = (__m256i) {
A[i4 + T_TM3_3W_64 - 2], A[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
V1[i] = (__m256i) {
B[i4 + T_TM3_3W_64 - 2], B[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
U2[i] = (__m256i) {
A[i4 - 4 + T2], A[i4 - 3 + T2], 0x0ul, 0x0ul
};
V2[i] = (__m256i) {
B[i4 - 4 + T2], B[i4 - 3 + T2], 0x0ul, 0x0ul
};
} }
// Evaluation phase : x= X^64 // Evaluation phase : x= X^64
@ -492,9 +478,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W2 =(W2 + W3 + W4*(x^3+1))/(x+1) //W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
U1_64 = ((int64_t *) W4); U1_64 = ((int64_t *) W4);
__m256i *U1_256 = (__m256i *) (U1_64 + 1); __m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ (__m256i) { tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);
0x0ul, 0x0ul, 0x0ul, U1_64[0]
};
for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) { for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -19,6 +19,7 @@
static void compute_fft_betas(uint16_t *betas); static void compute_fft_betas(uint16_t *betas);
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size); static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size);
static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f); static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas); static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas);
@ -28,7 +29,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[out] betas Array of size PARAM_M-1 * @param[out] betas Array of size PARAM_M-1
*/ */
// Side-by-side diff rows: each row shows the pre-commit text followed by the post-commit text.
// In both versions the function stores betas[i] = 1 << (PARAM_M - 1 - i) for every i in [0, PARAM_M - 1).
static void compute_fft_betas(uint16_t *betas) { static void compute_fft_betas(uint16_t *betas) {
// The only change in this hunk: the loop index is declared ahead of the loop instead of inside the for statement.
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { size_t i;
for (i = 0 ; i < PARAM_M - 1 ; ++i) {
betas[i] = 1 << (PARAM_M - 1 - i); betas[i] = 1 << (PARAM_M - 1 - i);
} }
} }
@ -46,10 +48,11 @@ static void compute_fft_betas(uint16_t *betas) {
* @param[in] set_size Size of the array set * @param[in] set_size Size of the array set
*/ */
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) { static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) {
size_t i, j;
subset_sums[0] = 0; subset_sums[0] = 0;
for (size_t i = 0 ; i < set_size ; ++i) { for (i = 0 ; i < set_size ; ++i) {
for (size_t j = 0 ; j < (1U << i) ; ++j) { for (j = 0 ; j < (1U << i) ; ++j) {
subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j]; subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j];
} }
} }
@ -89,7 +92,7 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[2] = f[3] ^ f1[1] ^ f0[3]; f1[2] = f[3] ^ f1[1] ^ f0[3];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 3: case 3:
f0[0] = f[0]; f0[0] = f[0];
@ -100,37 +103,43 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[3] = f[7]; f1[3] = f[7];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 2: case 2:
f0[0] = f[0]; f0[0] = f[0];
f0[1] = f[2] ^ f[3]; f0[1] = f[2] ^ f[3];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
f1[1] = f[3]; f1[1] = f[3];
return; break;
case 1: case 1:
f0[0] = f[0]; f0[0] = f[0];
f1[0] = f[1]; f1[0] = f[1];
return; break;
default: default:
; radix_big(f0, f1, f, m_f);
size_t n = 1 << (m_f - 2); break;
}
}
uint16_t Q[2 * (1 << (PARAM_FFT - 2))]; static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
uint16_t R[2 * (1 << (PARAM_FFT - 2))]; uint16_t Q[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t R[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t Q0[1 << (PARAM_FFT - 2)]; uint16_t Q0[1 << (PARAM_FFT - 2)] = {0};
uint16_t Q1[1 << (PARAM_FFT - 2)]; uint16_t Q1[1 << (PARAM_FFT - 2)] = {0};
uint16_t R0[1 << (PARAM_FFT - 2)]; uint16_t R0[1 << (PARAM_FFT - 2)] = {0};
uint16_t R1[1 << (PARAM_FFT - 2)]; uint16_t R1[1 << (PARAM_FFT - 2)] = {0};
size_t i, n;
n = 1 << (m_f - 2);
memcpy(Q, f + 3 * n, 2 * n); memcpy(Q, f + 3 * n, 2 * n);
memcpy(Q + n, f + 3 * n, 2 * n); memcpy(Q + n, f + 3 * n, 2 * n);
memcpy(R, f, 4 * n); memcpy(R, f, 4 * n);
for (size_t i = 0 ; i < n ; ++i) { for (i = 0 ; i < n ; ++i) {
Q[i] ^= f[2 * n + i]; Q[i] ^= f[2 * n + i];
R[n + i] ^= Q[i]; R[n + i] ^= Q[i];
} }
@ -142,7 +151,6 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
memcpy(f0 + n, Q0, 2 * n); memcpy(f0 + n, Q0, 2 * n);
memcpy(f1, R1, 2 * n); memcpy(f1, R1, 2 * n);
memcpy(f1 + n, Q1, 2 * n); memcpy(f1 + n, Q1, 2 * n);
}
} }
@ -160,25 +168,27 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
* @param[in] betas FFT constants * @param[in] betas FFT constants
*/ */
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) { static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) {
uint16_t f0[1 << (PARAM_FFT - 2)]; uint16_t f0[1 << (PARAM_FFT - 2)] = {0};
uint16_t f1[1 << (PARAM_FFT - 2)]; uint16_t f1[1 << (PARAM_FFT - 2)] = {0};
uint16_t gammas[PARAM_M - 2]; uint16_t gammas[PARAM_M - 2] = {0};
uint16_t deltas[PARAM_M - 2]; uint16_t deltas[PARAM_M - 2] = {0};
size_t k = 1 << (m - 1); uint16_t gammas_sums[1 << (PARAM_M - 2)] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 2)];
uint16_t u[1 << (PARAM_M - 2)] = {0}; uint16_t u[1 << (PARAM_M - 2)] = {0};
uint16_t v[1 << (PARAM_M - 2)] = {0}; uint16_t v[1 << (PARAM_M - 2)] = {0};
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)] = {0};
uint16_t beta_m_pow;
size_t i, j, k;
// Step 1 // Step 1
if (m_f == 1) { if (m_f == 1) {
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)]; for (i = 0 ; i < m ; ++i) {
for (size_t i = 0 ; i < m ; ++i) {
tmp[i] = PQCLEAN_HQC256_AVX2_gf_mul(betas[i], f[1]); tmp[i] = PQCLEAN_HQC256_AVX2_gf_mul(betas[i], f[1]);
} }
w[0] = f[0]; w[0] = f[0];
for (size_t j = 0 ; j < m ; ++j) { for (j = 0 ; j < m ; ++j) {
for (size_t k = 0 ; k < (1U << j) ; ++k) { for (k = 0 ; k < (1U << j) ; ++k) {
w[(1 << j) + k] = w[k] ^ tmp[j]; w[(1 << j) + k] = w[k] ^ tmp[j];
} }
} }
@ -188,8 +198,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 2: compute g // Step 2: compute g
if (betas[m - 1] != 1) { if (betas[m - 1] != 1) {
uint16_t beta_m_pow = 1; beta_m_pow = 1;
for (size_t i = 1 ; i < (1U << m_f) ; ++i) { for (i = 1 ; i < (1U << m_f) ; ++i) {
beta_m_pow = PQCLEAN_HQC256_AVX2_gf_mul(beta_m_pow, betas[m - 1]); beta_m_pow = PQCLEAN_HQC256_AVX2_gf_mul(beta_m_pow, betas[m - 1]);
f[i] = PQCLEAN_HQC256_AVX2_gf_mul(beta_m_pow, f[i]); f[i] = PQCLEAN_HQC256_AVX2_gf_mul(beta_m_pow, f[i]);
} }
@ -199,7 +209,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
radix(f0, f1, f, m_f); radix(f0, f1, f, m_f);
// Step 4: compute gammas and deltas // Step 4: compute gammas and deltas
for (uint8_t i = 0 ; i < m - 1 ; ++i) { for (i = 0 ; i + 1 < m ; ++i) {
gammas[i] = PQCLEAN_HQC256_AVX2_gf_mul(betas[i], PQCLEAN_HQC256_AVX2_gf_inverse(betas[m - 1])); gammas[i] = PQCLEAN_HQC256_AVX2_gf_mul(betas[i], PQCLEAN_HQC256_AVX2_gf_inverse(betas[m - 1]));
deltas[i] = PQCLEAN_HQC256_AVX2_gf_square(gammas[i]) ^ gammas[i]; deltas[i] = PQCLEAN_HQC256_AVX2_gf_square(gammas[i]) ^ gammas[i];
} }
@ -210,10 +220,11 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 5 // Step 5
fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas);
k = 1 << ((m - 1) & 0xf); // &0xf is to let the compiler know that m-1 is small.
if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant
w[0] = u[0]; w[0] = u[0];
w[k] = u[0] ^ f1[0]; w[k] = u[0] ^ f1[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC256_AVX2_gf_mul(gammas_sums[i], f1[0]); w[i] = u[i] ^ PQCLEAN_HQC256_AVX2_gf_mul(gammas_sums[i], f1[0]);
w[k + i] = w[i] ^ f1[0]; w[k + i] = w[i] ^ f1[0];
} }
@ -224,7 +235,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
w[0] = u[0]; w[0] = u[0];
w[k] ^= u[0]; w[k] ^= u[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC256_AVX2_gf_mul(gammas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQC256_AVX2_gf_mul(gammas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -253,14 +264,15 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1) * @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1)
*/ */
void PQCLEAN_HQC256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) { void PQCLEAN_HQC256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
uint16_t betas[PARAM_M - 1]; uint16_t betas[PARAM_M - 1] = {0};
uint16_t betas_sums[1 << (PARAM_M - 1)]; uint16_t betas_sums[1 << (PARAM_M - 1)] = {0};
uint16_t f0[1 << (PARAM_FFT - 1)]; uint16_t f0[1 << (PARAM_FFT - 1)] = {0};
uint16_t f1[1 << (PARAM_FFT - 1)]; uint16_t f1[1 << (PARAM_FFT - 1)] = {0};
uint16_t deltas[PARAM_M - 1]; uint16_t deltas[PARAM_M - 1] = {0};
size_t k = 1 << (PARAM_M - 1); uint16_t u[1 << (PARAM_M - 1)] = {0};
uint16_t u[1 << (PARAM_M - 1)]; uint16_t v[1 << (PARAM_M - 1)] = {0};
uint16_t v[1 << (PARAM_M - 1)];
size_t i, k;
// Follows Gao and Mateer algorithm // Follows Gao and Mateer algorithm
compute_fft_betas(betas); compute_fft_betas(betas);
@ -276,7 +288,7 @@ void PQCLEAN_HQC256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
radix(f0, f1, f, PARAM_FFT); radix(f0, f1, f, PARAM_FFT);
// Step 4: Compute deltas // Step 4: Compute deltas
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { for (i = 0 ; i < PARAM_M - 1 ; ++i) {
deltas[i] = PQCLEAN_HQC256_AVX2_gf_square(betas[i]) ^ betas[i]; deltas[i] = PQCLEAN_HQC256_AVX2_gf_square(betas[i]) ^ betas[i];
} }
@ -284,6 +296,7 @@ void PQCLEAN_HQC256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
k = 1 << (PARAM_M - 1);
// Step 6, 7 and error polynomial computation // Step 6, 7 and error polynomial computation
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
@ -294,7 +307,7 @@ void PQCLEAN_HQC256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
w[k] ^= u[0]; w[k] ^= u[0];
// Find other roots // Find other roots
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQC256_AVX2_gf_mul(betas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQC256_AVX2_gf_mul(betas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -309,25 +322,28 @@ void PQCLEAN_HQC256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
* @param[in] w Array of size 2^PARAM_M * @param[in] w Array of size 2^PARAM_M
*/ */
void PQCLEAN_HQC256_AVX2_fft_retrieve_bch_error_poly(uint64_t *error, const uint16_t *w) { void PQCLEAN_HQC256_AVX2_fft_retrieve_bch_error_poly(uint64_t *error, const uint16_t *w) {
uint16_t gammas[PARAM_M - 1]; uint16_t gammas[PARAM_M - 1] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 1)]; uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0};
size_t k = 1 << (PARAM_M - 1); uint64_t bit;
size_t index = PARAM_GF_MUL_ORDER; size_t i, k, index;
compute_fft_betas(gammas); compute_fft_betas(gammas);
compute_subset_sums(gammas_sums, gammas, PARAM_M - 1); compute_subset_sums(gammas_sums, gammas, PARAM_M - 1);
error[0] ^= ((uint64_t) 1) ^ ((uint16_t) - w[0] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15);
uint64_t bit = ((uint64_t) 1) ^ ((uint16_t) - w[k] >> 15);
k = 1 << (PARAM_M - 1);
index = PARAM_GF_MUL_ORDER;
bit = 1 ^ ((uint16_t) - w[k] >> 15);
error[index / 8] ^= bit << (index % 64); error[index / 8] ^= bit << (index % 64);
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC256_AVX2_gf_log(gammas_sums[i]); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC256_AVX2_gf_log(gammas_sums[i]);
bit = ((uint64_t) 1) ^ ((uint16_t) - w[i] >> 15); bit = 1 ^ ((uint16_t) - w[i] >> 15);
error[index / 64] ^= bit << (index % 64); error[index / 64] ^= bit << (index % 64);
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC256_AVX2_gf_log(gammas_sums[i] ^ 1); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQC256_AVX2_gf_log(gammas_sums[i] ^ 1);
bit = ((uint64_t) 1) ^ ((uint16_t) - w[k + i] >> 15); bit = 1 ^ ((uint16_t) - w[k + i] >> 15);
error[index / 64] ^= bit << (index % 64); error[index / 64] ^= bit << (index % 64);
} }
} }

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -335,9 +335,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3_3W_64 << 1; int32_t T2 = T_TM3_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) { for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) {
@ -354,24 +352,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) { for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) {
int32_t i4 = i << 2; int32_t i4 = i << 2;
int32_t i41 = i4 + 1; int32_t i41 = i4 + 1;
U0[i] = (__m256i) { U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
A[i4], A[i41], 0x0ul, 0x0ul V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
}; U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
V0[i] = (__m256i) { V1[i] = _mm256_set_epi64x(0, 0, B[i41 + T_TM3_3W_64 - 2], B[i4 + T_TM3_3W_64 - 2]);
B[i4], B[i41], 0x0ul, 0x0ul U2[i] = _mm256_set_epi64x(0, 0, A[i4 - 3 + T2], A[i4 - 4 + T2]);
}; V2[i] = _mm256_set_epi64x(0, 0, B[i4 - 3 + T2], B[i4 - 4 + T2]);
U1[i] = (__m256i) {
A[i4 + T_TM3_3W_64 - 2], A[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
V1[i] = (__m256i) {
B[i4 + T_TM3_3W_64 - 2], B[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
U2[i] = (__m256i) {
A[i4 - 4 + T2], A[i4 - 3 + T2], 0x0ul, 0x0ul
};
V2[i] = (__m256i) {
B[i4 - 4 + T2], B[i4 - 3 + T2], 0x0ul, 0x0ul
};
} }
// Evaluation phase : x= X^64 // Evaluation phase : x= X^64
@ -459,9 +445,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W2 =(W2 + W3 + W4*(x^3+1))/(x+1) //W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
U1_64 = ((int64_t *) W4); U1_64 = ((int64_t *) W4);
__m256i *U1_256 = (__m256i *) (U1_64 + 1); __m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ (__m256i) { tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);
0x0ul, 0x0ul, 0x0ul, U1_64[0]
};
for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) { for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
@ -555,9 +539,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)]; __m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)];
__m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3]; __m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3];
__m256i ro256[tTM3R / 2]; __m256i ro256[tTM3R / 2];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3R_3W_64 << 1; int32_t T2 = T_TM3R_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3R_3W_256 ; i++) { for (int32_t i = 0 ; i < T_TM3R_3W_256 ; i++) {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -18,6 +18,7 @@
static void compute_fft_betas(uint16_t *betas); static void compute_fft_betas(uint16_t *betas);
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size); static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size);
static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f); static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas); static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas);
@ -27,7 +28,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[out] betas Array of size PARAM_M-1 * @param[out] betas Array of size PARAM_M-1
*/ */
static void compute_fft_betas(uint16_t *betas) { static void compute_fft_betas(uint16_t *betas) {
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { size_t i;
for (i = 0 ; i < PARAM_M - 1 ; ++i) {
betas[i] = 1 << (PARAM_M - 1 - i); betas[i] = 1 << (PARAM_M - 1 - i);
} }
} }
@ -45,10 +47,11 @@ static void compute_fft_betas(uint16_t *betas) {
* @param[in] set_size Size of the array set * @param[in] set_size Size of the array set
*/ */
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) { static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) {
size_t i, j;
subset_sums[0] = 0; subset_sums[0] = 0;
for (size_t i = 0 ; i < set_size ; ++i) { for (i = 0 ; i < set_size ; ++i) {
for (size_t j = 0 ; j < (1U << i) ; ++j) { for (j = 0 ; j < (1U << i) ; ++j) {
subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j]; subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j];
} }
} }
@ -88,7 +91,7 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[2] = f[3] ^ f1[1] ^ f0[3]; f1[2] = f[3] ^ f1[1] ^ f0[3];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 3: case 3:
f0[0] = f[0]; f0[0] = f[0];
@ -99,37 +102,43 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[3] = f[7]; f1[3] = f[7];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 2: case 2:
f0[0] = f[0]; f0[0] = f[0];
f0[1] = f[2] ^ f[3]; f0[1] = f[2] ^ f[3];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
f1[1] = f[3]; f1[1] = f[3];
return; break;
case 1: case 1:
f0[0] = f[0]; f0[0] = f[0];
f1[0] = f[1]; f1[0] = f[1];
return; break;
default: default:
; radix_big(f0, f1, f, m_f);
size_t n = 1 << (m_f - 2); break;
}
}
uint16_t Q[2 * (1 << (PARAM_FFT - 2))]; static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
uint16_t R[2 * (1 << (PARAM_FFT - 2))]; uint16_t Q[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t R[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t Q0[1 << (PARAM_FFT - 2)]; uint16_t Q0[1 << (PARAM_FFT - 2)] = {0};
uint16_t Q1[1 << (PARAM_FFT - 2)]; uint16_t Q1[1 << (PARAM_FFT - 2)] = {0};
uint16_t R0[1 << (PARAM_FFT - 2)]; uint16_t R0[1 << (PARAM_FFT - 2)] = {0};
uint16_t R1[1 << (PARAM_FFT - 2)]; uint16_t R1[1 << (PARAM_FFT - 2)] = {0};
size_t i, n;
n = 1 << (m_f - 2);
memcpy(Q, f + 3 * n, 2 * n); memcpy(Q, f + 3 * n, 2 * n);
memcpy(Q + n, f + 3 * n, 2 * n); memcpy(Q + n, f + 3 * n, 2 * n);
memcpy(R, f, 4 * n); memcpy(R, f, 4 * n);
for (size_t i = 0 ; i < n ; ++i) { for (i = 0 ; i < n ; ++i) {
Q[i] ^= f[2 * n + i]; Q[i] ^= f[2 * n + i];
R[n + i] ^= Q[i]; R[n + i] ^= Q[i];
} }
@ -141,7 +150,6 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
memcpy(f0 + n, Q0, 2 * n); memcpy(f0 + n, Q0, 2 * n);
memcpy(f1, R1, 2 * n); memcpy(f1, R1, 2 * n);
memcpy(f1 + n, Q1, 2 * n); memcpy(f1 + n, Q1, 2 * n);
}
} }
@ -159,25 +167,27 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
* @param[in] betas FFT constants * @param[in] betas FFT constants
*/ */
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) { static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) {
uint16_t f0[1 << (PARAM_FFT - 2)]; uint16_t f0[1 << (PARAM_FFT - 2)] = {0};
uint16_t f1[1 << (PARAM_FFT - 2)]; uint16_t f1[1 << (PARAM_FFT - 2)] = {0};
uint16_t gammas[PARAM_M - 2]; uint16_t gammas[PARAM_M - 2] = {0};
uint16_t deltas[PARAM_M - 2]; uint16_t deltas[PARAM_M - 2] = {0};
size_t k = 1 << (m - 1); uint16_t gammas_sums[1 << (PARAM_M - 2)] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 2)];
uint16_t u[1 << (PARAM_M - 2)] = {0}; uint16_t u[1 << (PARAM_M - 2)] = {0};
uint16_t v[1 << (PARAM_M - 2)] = {0}; uint16_t v[1 << (PARAM_M - 2)] = {0};
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)] = {0};
uint16_t beta_m_pow;
size_t i, j, k;
// Step 1 // Step 1
if (m_f == 1) { if (m_f == 1) {
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)]; for (i = 0 ; i < m ; ++i) {
for (size_t i = 0 ; i < m ; ++i) {
tmp[i] = PQCLEAN_HQCRMRS128_AVX2_gf_mul(betas[i], f[1]); tmp[i] = PQCLEAN_HQCRMRS128_AVX2_gf_mul(betas[i], f[1]);
} }
w[0] = f[0]; w[0] = f[0];
for (size_t j = 0 ; j < m ; ++j) { for (j = 0 ; j < m ; ++j) {
for (size_t k = 0 ; k < (1U << j) ; ++k) { for (k = 0 ; k < (1U << j) ; ++k) {
w[(1 << j) + k] = w[k] ^ tmp[j]; w[(1 << j) + k] = w[k] ^ tmp[j];
} }
} }
@ -187,8 +197,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 2: compute g // Step 2: compute g
if (betas[m - 1] != 1) { if (betas[m - 1] != 1) {
uint16_t beta_m_pow = 1; beta_m_pow = 1;
for (size_t i = 1 ; i < (1U << m_f) ; ++i) { for (i = 1 ; i < (1U << m_f) ; ++i) {
beta_m_pow = PQCLEAN_HQCRMRS128_AVX2_gf_mul(beta_m_pow, betas[m - 1]); beta_m_pow = PQCLEAN_HQCRMRS128_AVX2_gf_mul(beta_m_pow, betas[m - 1]);
f[i] = PQCLEAN_HQCRMRS128_AVX2_gf_mul(beta_m_pow, f[i]); f[i] = PQCLEAN_HQCRMRS128_AVX2_gf_mul(beta_m_pow, f[i]);
} }
@ -198,7 +208,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
radix(f0, f1, f, m_f); radix(f0, f1, f, m_f);
// Step 4: compute gammas and deltas // Step 4: compute gammas and deltas
for (uint8_t i = 0 ; i < m - 1 ; ++i) { for (i = 0 ; i + 1 < m ; ++i) {
gammas[i] = PQCLEAN_HQCRMRS128_AVX2_gf_mul(betas[i], PQCLEAN_HQCRMRS128_AVX2_gf_inverse(betas[m - 1])); gammas[i] = PQCLEAN_HQCRMRS128_AVX2_gf_mul(betas[i], PQCLEAN_HQCRMRS128_AVX2_gf_inverse(betas[m - 1]));
deltas[i] = PQCLEAN_HQCRMRS128_AVX2_gf_square(gammas[i]) ^ gammas[i]; deltas[i] = PQCLEAN_HQCRMRS128_AVX2_gf_square(gammas[i]) ^ gammas[i];
} }
@ -209,10 +219,11 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 5 // Step 5
fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas);
k = 1 << ((m - 1) & 0xf); // &0xf is to let the compiler know that m-1 is small.
if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant
w[0] = u[0]; w[0] = u[0];
w[k] = u[0] ^ f1[0]; w[k] = u[0] ^ f1[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(gammas_sums[i], f1[0]); w[i] = u[i] ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(gammas_sums[i], f1[0]);
w[k + i] = w[i] ^ f1[0]; w[k + i] = w[i] ^ f1[0];
} }
@ -223,7 +234,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
w[0] = u[0]; w[0] = u[0];
w[k] ^= u[0]; w[k] ^= u[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(gammas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(gammas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -252,14 +263,15 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1) * @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1)
*/ */
void PQCLEAN_HQCRMRS128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) { void PQCLEAN_HQCRMRS128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
uint16_t betas[PARAM_M - 1]; uint16_t betas[PARAM_M - 1] = {0};
uint16_t betas_sums[1 << (PARAM_M - 1)]; uint16_t betas_sums[1 << (PARAM_M - 1)] = {0};
uint16_t f0[1 << (PARAM_FFT - 1)]; uint16_t f0[1 << (PARAM_FFT - 1)] = {0};
uint16_t f1[1 << (PARAM_FFT - 1)]; uint16_t f1[1 << (PARAM_FFT - 1)] = {0};
uint16_t deltas[PARAM_M - 1]; uint16_t deltas[PARAM_M - 1] = {0};
size_t k = 1 << (PARAM_M - 1); uint16_t u[1 << (PARAM_M - 1)] = {0};
uint16_t u[1 << (PARAM_M - 1)]; uint16_t v[1 << (PARAM_M - 1)] = {0};
uint16_t v[1 << (PARAM_M - 1)];
size_t i, k;
// Follows Gao and Mateer algorithm // Follows Gao and Mateer algorithm
compute_fft_betas(betas); compute_fft_betas(betas);
@ -275,7 +287,7 @@ void PQCLEAN_HQCRMRS128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
radix(f0, f1, f, PARAM_FFT); radix(f0, f1, f, PARAM_FFT);
// Step 4: Compute deltas // Step 4: Compute deltas
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { for (i = 0 ; i < PARAM_M - 1 ; ++i) {
deltas[i] = PQCLEAN_HQCRMRS128_AVX2_gf_square(betas[i]) ^ betas[i]; deltas[i] = PQCLEAN_HQCRMRS128_AVX2_gf_square(betas[i]) ^ betas[i];
} }
@ -283,6 +295,7 @@ void PQCLEAN_HQCRMRS128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
k = 1 << (PARAM_M - 1);
// Step 6, 7 and error polynomial computation // Step 6, 7 and error polynomial computation
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
@ -293,7 +306,7 @@ void PQCLEAN_HQCRMRS128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
w[k] ^= u[0]; w[k] ^= u[0];
// Find other roots // Find other roots
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(betas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(betas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -311,17 +324,16 @@ void PQCLEAN_HQCRMRS128_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
void PQCLEAN_HQCRMRS128_AVX2_fft_retrieve_error_poly(uint8_t *error, const uint16_t *w) { void PQCLEAN_HQCRMRS128_AVX2_fft_retrieve_error_poly(uint8_t *error, const uint16_t *w) {
uint16_t gammas[PARAM_M - 1] = {0}; uint16_t gammas[PARAM_M - 1] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0}; uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0};
size_t k = 1 << (PARAM_M - 1); size_t i, k, index;
compute_fft_betas(gammas); compute_fft_betas(gammas);
compute_subset_sums(gammas_sums, gammas, PARAM_M - 1); compute_subset_sums(gammas_sums, gammas, PARAM_M - 1);
k = 1 << (PARAM_M - 1);
error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15);
error[0] ^= 1 ^ ((uint16_t) - w[k] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[k] >> 15);
size_t index = PARAM_GF_MUL_ORDER; for (i = 1 ; i < k ; ++i) {
for (size_t i = 1 ; i < k ; ++i) {
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQCRMRS128_AVX2_gf_log(gammas_sums[i]); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQCRMRS128_AVX2_gf_log(gammas_sums[i]);
error[index] ^= 1 ^ ((uint16_t) - w[i] >> 15); error[index] ^= 1 ^ ((uint16_t) - w[i] >> 15);

View File

@ -30,29 +30,28 @@ uint16_t PQCLEAN_HQCRMRS128_AVX2_gf_log(uint16_t elt) {
* @param[in] deg_x The degree of polynomial x * @param[in] deg_x The degree of polynomial x
*/ */
static uint16_t gf_reduce(uint64_t x, size_t deg_x) { static uint16_t gf_reduce(uint64_t x, size_t deg_x) {
// Compute the distance between the primitive polynomial first two set bits uint16_t z1, z2, rmdr, dist;
size_t lz1 = __builtin_clz(PARAM_GF_POLY); uint64_t mod;
size_t lz2 = __builtin_clz(PARAM_GF_POLY ^ 1 << PARAM_M); size_t steps, i, j;
size_t dist = lz2 - lz1;
// Deduce the number of steps of reduction // Deduce the number of steps of reduction
size_t steps = CEIL_DIVIDE(deg_x - (PARAM_M - 1), dist); steps = CEIL_DIVIDE(deg_x - (PARAM_M - 1), PARAM_GF_POLY_M2);
// Reduce // Reduce
for (size_t i = 0; i < steps; ++i) { for (i = 0; i < steps; ++i) {
uint64_t mod = x >> PARAM_M; mod = x >> PARAM_M;
x &= (1 << PARAM_M) - 1; x &= (1 << PARAM_M) - 1;
x ^= mod; x ^= mod;
size_t tz1 = 0; z1 = 0;
uint16_t rmdr = PARAM_GF_POLY ^ 1; rmdr = PARAM_GF_POLY ^ 1;
for (size_t j = __builtin_popcount(PARAM_GF_POLY) - 2; j; --j) { for (j = PARAM_GF_POLY_WT - 2; j; --j) {
size_t tz2 = __builtin_ctz(rmdr); z2 = __tzcnt_u16(rmdr);
size_t shift = tz2 - tz1; dist = (uint16_t) (z2 - z1);
mod <<= shift; mod <<= dist;
x ^= mod; x ^= mod;
rmdr ^= 1 << tz2; rmdr ^= 1 << z2;
tz1 = tz2; z1 = z2;
} }
} }

View File

@ -328,9 +328,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3_3W_64 << 1; int32_t T2 = T_TM3_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) { for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) {
@ -347,24 +345,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) { for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) {
int32_t i4 = i << 2; int32_t i4 = i << 2;
int32_t i41 = i4 + 1; int32_t i41 = i4 + 1;
U0[i] = (__m256i) { U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
A[i4], A[i41], 0x0ul, 0x0ul V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
}; U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
V0[i] = (__m256i) { V1[i] = _mm256_set_epi64x(0, 0, B[i41 + T_TM3_3W_64 - 2], B[i4 + T_TM3_3W_64 - 2]);
B[i4], B[i41], 0x0ul, 0x0ul U2[i] = _mm256_set_epi64x(0, 0, A[i4 - 3 + T2], A[i4 - 4 + T2]);
}; V2[i] = _mm256_set_epi64x(0, 0, B[i4 - 3 + T2], B[i4 - 4 + T2]);
U1[i] = (__m256i) {
A[i4 + T_TM3_3W_64 - 2], A[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
V1[i] = (__m256i) {
B[i4 + T_TM3_3W_64 - 2], B[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
U2[i] = (__m256i) {
A[i4 - 4 + T2], A[i4 - 3 + T2], 0x0ul, 0x0ul
};
V2[i] = (__m256i) {
B[i4 - 4 + T2], B[i4 - 3 + T2], 0x0ul, 0x0ul
};
} }
// Evaluation phase : x= X^64 // Evaluation phase : x= X^64
@ -452,9 +438,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W2 =(W2 + W3 + W4*(x^3+1))/(x+1) //W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
U1_64 = ((int64_t *) W4); U1_64 = ((int64_t *) W4);
__m256i *U1_256 = (__m256i *) (U1_64 + 1); __m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ (__m256i) { tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);
0x0ul, 0x0ul, 0x0ul, U1_64[0]
};
for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) { for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);

View File

@ -18,6 +18,7 @@
static void compute_fft_betas(uint16_t *betas); static void compute_fft_betas(uint16_t *betas);
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size); static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size);
static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f); static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas); static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas);
@ -27,7 +28,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[out] betas Array of size PARAM_M-1 * @param[out] betas Array of size PARAM_M-1
*/ */
static void compute_fft_betas(uint16_t *betas) { static void compute_fft_betas(uint16_t *betas) {
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { size_t i;
for (i = 0 ; i < PARAM_M - 1 ; ++i) {
betas[i] = 1 << (PARAM_M - 1 - i); betas[i] = 1 << (PARAM_M - 1 - i);
} }
} }
@ -45,10 +47,11 @@ static void compute_fft_betas(uint16_t *betas) {
* @param[in] set_size Size of the array set * @param[in] set_size Size of the array set
*/ */
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) { static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) {
size_t i, j;
subset_sums[0] = 0; subset_sums[0] = 0;
for (size_t i = 0 ; i < set_size ; ++i) { for (i = 0 ; i < set_size ; ++i) {
for (size_t j = 0 ; j < (1U << i) ; ++j) { for (j = 0 ; j < (1U << i) ; ++j) {
subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j]; subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j];
} }
} }
@ -88,7 +91,7 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[2] = f[3] ^ f1[1] ^ f0[3]; f1[2] = f[3] ^ f1[1] ^ f0[3];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 3: case 3:
f0[0] = f[0]; f0[0] = f[0];
@ -99,37 +102,43 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[3] = f[7]; f1[3] = f[7];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 2: case 2:
f0[0] = f[0]; f0[0] = f[0];
f0[1] = f[2] ^ f[3]; f0[1] = f[2] ^ f[3];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
f1[1] = f[3]; f1[1] = f[3];
return; break;
case 1: case 1:
f0[0] = f[0]; f0[0] = f[0];
f1[0] = f[1]; f1[0] = f[1];
return; break;
default: default:
; radix_big(f0, f1, f, m_f);
size_t n = 1 << (m_f - 2); break;
}
}
uint16_t Q[2 * (1 << (PARAM_FFT - 2))]; static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
uint16_t R[2 * (1 << (PARAM_FFT - 2))]; uint16_t Q[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t R[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t Q0[1 << (PARAM_FFT - 2)]; uint16_t Q0[1 << (PARAM_FFT - 2)] = {0};
uint16_t Q1[1 << (PARAM_FFT - 2)]; uint16_t Q1[1 << (PARAM_FFT - 2)] = {0};
uint16_t R0[1 << (PARAM_FFT - 2)]; uint16_t R0[1 << (PARAM_FFT - 2)] = {0};
uint16_t R1[1 << (PARAM_FFT - 2)]; uint16_t R1[1 << (PARAM_FFT - 2)] = {0};
size_t i, n;
n = 1 << (m_f - 2);
memcpy(Q, f + 3 * n, 2 * n); memcpy(Q, f + 3 * n, 2 * n);
memcpy(Q + n, f + 3 * n, 2 * n); memcpy(Q + n, f + 3 * n, 2 * n);
memcpy(R, f, 4 * n); memcpy(R, f, 4 * n);
for (size_t i = 0 ; i < n ; ++i) { for (i = 0 ; i < n ; ++i) {
Q[i] ^= f[2 * n + i]; Q[i] ^= f[2 * n + i];
R[n + i] ^= Q[i]; R[n + i] ^= Q[i];
} }
@ -141,7 +150,6 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
memcpy(f0 + n, Q0, 2 * n); memcpy(f0 + n, Q0, 2 * n);
memcpy(f1, R1, 2 * n); memcpy(f1, R1, 2 * n);
memcpy(f1 + n, Q1, 2 * n); memcpy(f1 + n, Q1, 2 * n);
}
} }
@ -159,25 +167,27 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
* @param[in] betas FFT constants * @param[in] betas FFT constants
*/ */
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) { static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) {
uint16_t f0[1 << (PARAM_FFT - 2)]; uint16_t f0[1 << (PARAM_FFT - 2)] = {0};
uint16_t f1[1 << (PARAM_FFT - 2)]; uint16_t f1[1 << (PARAM_FFT - 2)] = {0};
uint16_t gammas[PARAM_M - 2]; uint16_t gammas[PARAM_M - 2] = {0};
uint16_t deltas[PARAM_M - 2]; uint16_t deltas[PARAM_M - 2] = {0};
size_t k = 1 << (m - 1); uint16_t gammas_sums[1 << (PARAM_M - 2)] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 2)];
uint16_t u[1 << (PARAM_M - 2)] = {0}; uint16_t u[1 << (PARAM_M - 2)] = {0};
uint16_t v[1 << (PARAM_M - 2)] = {0}; uint16_t v[1 << (PARAM_M - 2)] = {0};
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)] = {0};
uint16_t beta_m_pow;
size_t i, j, k;
// Step 1 // Step 1
if (m_f == 1) { if (m_f == 1) {
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)]; for (i = 0 ; i < m ; ++i) {
for (size_t i = 0 ; i < m ; ++i) {
tmp[i] = PQCLEAN_HQCRMRS192_AVX2_gf_mul(betas[i], f[1]); tmp[i] = PQCLEAN_HQCRMRS192_AVX2_gf_mul(betas[i], f[1]);
} }
w[0] = f[0]; w[0] = f[0];
for (size_t j = 0 ; j < m ; ++j) { for (j = 0 ; j < m ; ++j) {
for (size_t k = 0 ; k < (1U << j) ; ++k) { for (k = 0 ; k < (1U << j) ; ++k) {
w[(1 << j) + k] = w[k] ^ tmp[j]; w[(1 << j) + k] = w[k] ^ tmp[j];
} }
} }
@ -187,8 +197,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 2: compute g // Step 2: compute g
if (betas[m - 1] != 1) { if (betas[m - 1] != 1) {
uint16_t beta_m_pow = 1; beta_m_pow = 1;
for (size_t i = 1 ; i < (1U << m_f) ; ++i) { for (i = 1 ; i < (1U << m_f) ; ++i) {
beta_m_pow = PQCLEAN_HQCRMRS192_AVX2_gf_mul(beta_m_pow, betas[m - 1]); beta_m_pow = PQCLEAN_HQCRMRS192_AVX2_gf_mul(beta_m_pow, betas[m - 1]);
f[i] = PQCLEAN_HQCRMRS192_AVX2_gf_mul(beta_m_pow, f[i]); f[i] = PQCLEAN_HQCRMRS192_AVX2_gf_mul(beta_m_pow, f[i]);
} }
@ -198,7 +208,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
radix(f0, f1, f, m_f); radix(f0, f1, f, m_f);
// Step 4: compute gammas and deltas // Step 4: compute gammas and deltas
for (uint8_t i = 0 ; i < m - 1 ; ++i) { for (i = 0 ; i + 1 < m ; ++i) {
gammas[i] = PQCLEAN_HQCRMRS192_AVX2_gf_mul(betas[i], PQCLEAN_HQCRMRS192_AVX2_gf_inverse(betas[m - 1])); gammas[i] = PQCLEAN_HQCRMRS192_AVX2_gf_mul(betas[i], PQCLEAN_HQCRMRS192_AVX2_gf_inverse(betas[m - 1]));
deltas[i] = PQCLEAN_HQCRMRS192_AVX2_gf_square(gammas[i]) ^ gammas[i]; deltas[i] = PQCLEAN_HQCRMRS192_AVX2_gf_square(gammas[i]) ^ gammas[i];
} }
@ -209,10 +219,11 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 5 // Step 5
fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas);
k = 1 << ((m - 1) & 0xf); // &0xf is to let the compiler know that m-1 is small.
if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant
w[0] = u[0]; w[0] = u[0];
w[k] = u[0] ^ f1[0]; w[k] = u[0] ^ f1[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(gammas_sums[i], f1[0]); w[i] = u[i] ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(gammas_sums[i], f1[0]);
w[k + i] = w[i] ^ f1[0]; w[k + i] = w[i] ^ f1[0];
} }
@ -223,7 +234,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
w[0] = u[0]; w[0] = u[0];
w[k] ^= u[0]; w[k] ^= u[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(gammas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(gammas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -252,14 +263,15 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1) * @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1)
*/ */
void PQCLEAN_HQCRMRS192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) { void PQCLEAN_HQCRMRS192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
uint16_t betas[PARAM_M - 1]; uint16_t betas[PARAM_M - 1] = {0};
uint16_t betas_sums[1 << (PARAM_M - 1)]; uint16_t betas_sums[1 << (PARAM_M - 1)] = {0};
uint16_t f0[1 << (PARAM_FFT - 1)]; uint16_t f0[1 << (PARAM_FFT - 1)] = {0};
uint16_t f1[1 << (PARAM_FFT - 1)]; uint16_t f1[1 << (PARAM_FFT - 1)] = {0};
uint16_t deltas[PARAM_M - 1]; uint16_t deltas[PARAM_M - 1] = {0};
size_t k = 1 << (PARAM_M - 1); uint16_t u[1 << (PARAM_M - 1)] = {0};
uint16_t u[1 << (PARAM_M - 1)]; uint16_t v[1 << (PARAM_M - 1)] = {0};
uint16_t v[1 << (PARAM_M - 1)];
size_t i, k;
// Follows Gao and Mateer algorithm // Follows Gao and Mateer algorithm
compute_fft_betas(betas); compute_fft_betas(betas);
@ -275,7 +287,7 @@ void PQCLEAN_HQCRMRS192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
radix(f0, f1, f, PARAM_FFT); radix(f0, f1, f, PARAM_FFT);
// Step 4: Compute deltas // Step 4: Compute deltas
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { for (i = 0 ; i < PARAM_M - 1 ; ++i) {
deltas[i] = PQCLEAN_HQCRMRS192_AVX2_gf_square(betas[i]) ^ betas[i]; deltas[i] = PQCLEAN_HQCRMRS192_AVX2_gf_square(betas[i]) ^ betas[i];
} }
@ -283,6 +295,7 @@ void PQCLEAN_HQCRMRS192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
k = 1 << (PARAM_M - 1);
// Step 6, 7 and error polynomial computation // Step 6, 7 and error polynomial computation
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
@ -293,7 +306,7 @@ void PQCLEAN_HQCRMRS192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
w[k] ^= u[0]; w[k] ^= u[0];
// Find other roots // Find other roots
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(betas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(betas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -311,17 +324,16 @@ void PQCLEAN_HQCRMRS192_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
void PQCLEAN_HQCRMRS192_AVX2_fft_retrieve_error_poly(uint8_t *error, const uint16_t *w) { void PQCLEAN_HQCRMRS192_AVX2_fft_retrieve_error_poly(uint8_t *error, const uint16_t *w) {
uint16_t gammas[PARAM_M - 1] = {0}; uint16_t gammas[PARAM_M - 1] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0}; uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0};
size_t k = 1 << (PARAM_M - 1); size_t i, k, index;
compute_fft_betas(gammas); compute_fft_betas(gammas);
compute_subset_sums(gammas_sums, gammas, PARAM_M - 1); compute_subset_sums(gammas_sums, gammas, PARAM_M - 1);
k = 1 << (PARAM_M - 1);
error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15);
error[0] ^= 1 ^ ((uint16_t) - w[k] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[k] >> 15);
size_t index = PARAM_GF_MUL_ORDER; for (i = 1 ; i < k ; ++i) {
for (size_t i = 1 ; i < k ; ++i) {
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQCRMRS192_AVX2_gf_log(gammas_sums[i]); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQCRMRS192_AVX2_gf_log(gammas_sums[i]);
error[index] ^= 1 ^ ((uint16_t) - w[i] >> 15); error[index] ^= 1 ^ ((uint16_t) - w[i] >> 15);

View File

@ -30,29 +30,28 @@ uint16_t PQCLEAN_HQCRMRS192_AVX2_gf_log(uint16_t elt) {
* @param[in] deg_x The degree of polynomial x * @param[in] deg_x The degree of polynomial x
*/ */
static uint16_t gf_reduce(uint64_t x, size_t deg_x) { static uint16_t gf_reduce(uint64_t x, size_t deg_x) {
// Compute the distance between the primitive polynomial first two set bits uint16_t z1, z2, rmdr, dist;
size_t lz1 = __builtin_clz(PARAM_GF_POLY); uint64_t mod;
size_t lz2 = __builtin_clz(PARAM_GF_POLY ^ 1 << PARAM_M); size_t steps, i, j;
size_t dist = lz2 - lz1;
// Deduce the number of steps of reduction // Deduce the number of steps of reduction
size_t steps = CEIL_DIVIDE(deg_x - (PARAM_M - 1), dist); steps = CEIL_DIVIDE(deg_x - (PARAM_M - 1), PARAM_GF_POLY_M2);
// Reduce // Reduce
for (size_t i = 0; i < steps; ++i) { for (i = 0; i < steps; ++i) {
uint64_t mod = x >> PARAM_M; mod = x >> PARAM_M;
x &= (1 << PARAM_M) - 1; x &= (1 << PARAM_M) - 1;
x ^= mod; x ^= mod;
size_t tz1 = 0; z1 = 0;
uint16_t rmdr = PARAM_GF_POLY ^ 1; rmdr = PARAM_GF_POLY ^ 1;
for (size_t j = __builtin_popcount(PARAM_GF_POLY) - 2; j; --j) { for (j = PARAM_GF_POLY_WT - 2; j; --j) {
size_t tz2 = __builtin_ctz(rmdr); z2 = __tzcnt_u16(rmdr);
size_t shift = tz2 - tz1; dist = (uint16_t) (z2 - z1);
mod <<= shift; mod <<= dist;
x ^= mod; x ^= mod;
rmdr ^= 1 << tz2; rmdr ^= 1 << z2;
tz1 = tz2; z1 = z2;
} }
} }

View File

@ -368,9 +368,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3_3W_64 << 1; int32_t T2 = T_TM3_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) { for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) {
@ -387,24 +385,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) { for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) {
int32_t i4 = i << 2; int32_t i4 = i << 2;
int32_t i41 = i4 + 1; int32_t i41 = i4 + 1;
U0[i] = (__m256i) { U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
A[i4], A[i41], 0x0ul, 0x0ul V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
}; U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
V0[i] = (__m256i) { V1[i] = _mm256_set_epi64x(0, 0, B[i41 + T_TM3_3W_64 - 2], B[i4 + T_TM3_3W_64 - 2]);
B[i4], B[i41], 0x0ul, 0x0ul U2[i] = _mm256_set_epi64x(0, 0, A[i4 - 3 + T2], A[i4 - 4 + T2]);
}; V2[i] = _mm256_set_epi64x(0, 0, B[i4 - 3 + T2], B[i4 - 4 + T2]);
U1[i] = (__m256i) {
A[i4 + T_TM3_3W_64 - 2], A[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
V1[i] = (__m256i) {
B[i4 + T_TM3_3W_64 - 2], B[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
U2[i] = (__m256i) {
A[i4 - 4 + T2], A[i4 - 3 + T2], 0x0ul, 0x0ul
};
V2[i] = (__m256i) {
B[i4 - 4 + T2], B[i4 - 3 + T2], 0x0ul, 0x0ul
};
} }
// Evaluation phase : x= X^64 // Evaluation phase : x= X^64
@ -492,9 +478,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W2 =(W2 + W3 + W4*(x^3+1))/(x+1) //W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
U1_64 = ((int64_t *) W4); U1_64 = ((int64_t *) W4);
__m256i *U1_256 = (__m256i *) (U1_64 + 1); __m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ (__m256i) { tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);
0x0ul, 0x0ul, 0x0ul, U1_64[0]
};
for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) { for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);

View File

@ -18,6 +18,7 @@
static void compute_fft_betas(uint16_t *betas); static void compute_fft_betas(uint16_t *betas);
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size); static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size);
static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f); static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f);
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas); static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas);
@ -27,7 +28,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[out] betas Array of size PARAM_M-1 * @param[out] betas Array of size PARAM_M-1
*/ */
static void compute_fft_betas(uint16_t *betas) { static void compute_fft_betas(uint16_t *betas) {
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { size_t i;
for (i = 0 ; i < PARAM_M - 1 ; ++i) {
betas[i] = 1 << (PARAM_M - 1 - i); betas[i] = 1 << (PARAM_M - 1 - i);
} }
} }
@ -45,10 +47,11 @@ static void compute_fft_betas(uint16_t *betas) {
* @param[in] set_size Size of the array set * @param[in] set_size Size of the array set
*/ */
static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) { static void compute_subset_sums(uint16_t *subset_sums, const uint16_t *set, size_t set_size) {
size_t i, j;
subset_sums[0] = 0; subset_sums[0] = 0;
for (size_t i = 0 ; i < set_size ; ++i) { for (i = 0 ; i < set_size ; ++i) {
for (size_t j = 0 ; j < (1U << i) ; ++j) { for (j = 0 ; j < (1U << i) ; ++j) {
subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j]; subset_sums[(1 << i) + j] = set[i] ^ subset_sums[j];
} }
} }
@ -88,7 +91,7 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[2] = f[3] ^ f1[1] ^ f0[3]; f1[2] = f[3] ^ f1[1] ^ f0[3];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 3: case 3:
f0[0] = f[0]; f0[0] = f[0];
@ -99,37 +102,43 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
f1[3] = f[7]; f1[3] = f[7];
f0[1] = f[2] ^ f0[2] ^ f1[1]; f0[1] = f[2] ^ f0[2] ^ f1[1];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
return; break;
case 2: case 2:
f0[0] = f[0]; f0[0] = f[0];
f0[1] = f[2] ^ f[3]; f0[1] = f[2] ^ f[3];
f1[0] = f[1] ^ f0[1]; f1[0] = f[1] ^ f0[1];
f1[1] = f[3]; f1[1] = f[3];
return; break;
case 1: case 1:
f0[0] = f[0]; f0[0] = f[0];
f1[0] = f[1]; f1[0] = f[1];
return; break;
default: default:
; radix_big(f0, f1, f, m_f);
size_t n = 1 << (m_f - 2); break;
}
}
uint16_t Q[2 * (1 << (PARAM_FFT - 2))]; static void radix_big(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
uint16_t R[2 * (1 << (PARAM_FFT - 2))]; uint16_t Q[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t R[2 * (1 << (PARAM_FFT - 2))] = {0};
uint16_t Q0[1 << (PARAM_FFT - 2)]; uint16_t Q0[1 << (PARAM_FFT - 2)] = {0};
uint16_t Q1[1 << (PARAM_FFT - 2)]; uint16_t Q1[1 << (PARAM_FFT - 2)] = {0};
uint16_t R0[1 << (PARAM_FFT - 2)]; uint16_t R0[1 << (PARAM_FFT - 2)] = {0};
uint16_t R1[1 << (PARAM_FFT - 2)]; uint16_t R1[1 << (PARAM_FFT - 2)] = {0};
size_t i, n;
n = 1 << (m_f - 2);
memcpy(Q, f + 3 * n, 2 * n); memcpy(Q, f + 3 * n, 2 * n);
memcpy(Q + n, f + 3 * n, 2 * n); memcpy(Q + n, f + 3 * n, 2 * n);
memcpy(R, f, 4 * n); memcpy(R, f, 4 * n);
for (size_t i = 0 ; i < n ; ++i) { for (i = 0 ; i < n ; ++i) {
Q[i] ^= f[2 * n + i]; Q[i] ^= f[2 * n + i];
R[n + i] ^= Q[i]; R[n + i] ^= Q[i];
} }
@ -141,7 +150,6 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
memcpy(f0 + n, Q0, 2 * n); memcpy(f0 + n, Q0, 2 * n);
memcpy(f1, R1, 2 * n); memcpy(f1, R1, 2 * n);
memcpy(f1 + n, Q1, 2 * n); memcpy(f1 + n, Q1, 2 * n);
}
} }
@ -159,25 +167,27 @@ static void radix(uint16_t *f0, uint16_t *f1, const uint16_t *f, uint32_t m_f) {
* @param[in] betas FFT constants * @param[in] betas FFT constants
*/ */
static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) { static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32_t m_f, const uint16_t *betas) {
uint16_t f0[1 << (PARAM_FFT - 2)]; uint16_t f0[1 << (PARAM_FFT - 2)] = {0};
uint16_t f1[1 << (PARAM_FFT - 2)]; uint16_t f1[1 << (PARAM_FFT - 2)] = {0};
uint16_t gammas[PARAM_M - 2]; uint16_t gammas[PARAM_M - 2] = {0};
uint16_t deltas[PARAM_M - 2]; uint16_t deltas[PARAM_M - 2] = {0};
size_t k = 1 << (m - 1); uint16_t gammas_sums[1 << (PARAM_M - 2)] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 2)];
uint16_t u[1 << (PARAM_M - 2)] = {0}; uint16_t u[1 << (PARAM_M - 2)] = {0};
uint16_t v[1 << (PARAM_M - 2)] = {0}; uint16_t v[1 << (PARAM_M - 2)] = {0};
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)] = {0};
uint16_t beta_m_pow;
size_t i, j, k;
// Step 1 // Step 1
if (m_f == 1) { if (m_f == 1) {
uint16_t tmp[PARAM_M - (PARAM_FFT - 1)]; for (i = 0 ; i < m ; ++i) {
for (size_t i = 0 ; i < m ; ++i) {
tmp[i] = PQCLEAN_HQCRMRS256_AVX2_gf_mul(betas[i], f[1]); tmp[i] = PQCLEAN_HQCRMRS256_AVX2_gf_mul(betas[i], f[1]);
} }
w[0] = f[0]; w[0] = f[0];
for (size_t j = 0 ; j < m ; ++j) { for (j = 0 ; j < m ; ++j) {
for (size_t k = 0 ; k < (1U << j) ; ++k) { for (k = 0 ; k < (1U << j) ; ++k) {
w[(1 << j) + k] = w[k] ^ tmp[j]; w[(1 << j) + k] = w[k] ^ tmp[j];
} }
} }
@ -187,8 +197,8 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 2: compute g // Step 2: compute g
if (betas[m - 1] != 1) { if (betas[m - 1] != 1) {
uint16_t beta_m_pow = 1; beta_m_pow = 1;
for (size_t i = 1 ; i < (1U << m_f) ; ++i) { for (i = 1 ; i < (1U << m_f) ; ++i) {
beta_m_pow = PQCLEAN_HQCRMRS256_AVX2_gf_mul(beta_m_pow, betas[m - 1]); beta_m_pow = PQCLEAN_HQCRMRS256_AVX2_gf_mul(beta_m_pow, betas[m - 1]);
f[i] = PQCLEAN_HQCRMRS256_AVX2_gf_mul(beta_m_pow, f[i]); f[i] = PQCLEAN_HQCRMRS256_AVX2_gf_mul(beta_m_pow, f[i]);
} }
@ -198,7 +208,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
radix(f0, f1, f, m_f); radix(f0, f1, f, m_f);
// Step 4: compute gammas and deltas // Step 4: compute gammas and deltas
for (uint8_t i = 0 ; i < m - 1 ; ++i) { for (i = 0 ; i + 1 < m ; ++i) {
gammas[i] = PQCLEAN_HQCRMRS256_AVX2_gf_mul(betas[i], PQCLEAN_HQCRMRS256_AVX2_gf_inverse(betas[m - 1])); gammas[i] = PQCLEAN_HQCRMRS256_AVX2_gf_mul(betas[i], PQCLEAN_HQCRMRS256_AVX2_gf_inverse(betas[m - 1]));
deltas[i] = PQCLEAN_HQCRMRS256_AVX2_gf_square(gammas[i]) ^ gammas[i]; deltas[i] = PQCLEAN_HQCRMRS256_AVX2_gf_square(gammas[i]) ^ gammas[i];
} }
@ -209,10 +219,11 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
// Step 5 // Step 5
fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, m - 1, m_f - 1, deltas);
k = 1 << ((m - 1) & 0xf); // &0xf is to let the compiler know that m-1 is small.
if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant if (f_coeffs <= 3) { // 3-coefficient polynomial f case: f1 is constant
w[0] = u[0]; w[0] = u[0];
w[k] = u[0] ^ f1[0]; w[k] = u[0] ^ f1[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(gammas_sums[i], f1[0]); w[i] = u[i] ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(gammas_sums[i], f1[0]);
w[k + i] = w[i] ^ f1[0]; w[k + i] = w[i] ^ f1[0];
} }
@ -223,7 +234,7 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
w[0] = u[0]; w[0] = u[0];
w[k] ^= u[0]; w[k] ^= u[0];
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(gammas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(gammas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -252,14 +263,15 @@ static void fft_rec(uint16_t *w, uint16_t *f, size_t f_coeffs, uint8_t m, uint32
* @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1) * @param[in] f_coeffs Number coefficients of f (i.e. deg(f)+1)
*/ */
void PQCLEAN_HQCRMRS256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) { void PQCLEAN_HQCRMRS256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs) {
uint16_t betas[PARAM_M - 1]; uint16_t betas[PARAM_M - 1] = {0};
uint16_t betas_sums[1 << (PARAM_M - 1)]; uint16_t betas_sums[1 << (PARAM_M - 1)] = {0};
uint16_t f0[1 << (PARAM_FFT - 1)]; uint16_t f0[1 << (PARAM_FFT - 1)] = {0};
uint16_t f1[1 << (PARAM_FFT - 1)]; uint16_t f1[1 << (PARAM_FFT - 1)] = {0};
uint16_t deltas[PARAM_M - 1]; uint16_t deltas[PARAM_M - 1] = {0};
size_t k = 1 << (PARAM_M - 1); uint16_t u[1 << (PARAM_M - 1)] = {0};
uint16_t u[1 << (PARAM_M - 1)]; uint16_t v[1 << (PARAM_M - 1)] = {0};
uint16_t v[1 << (PARAM_M - 1)];
size_t i, k;
// Follows Gao and Mateer algorithm // Follows Gao and Mateer algorithm
compute_fft_betas(betas); compute_fft_betas(betas);
@ -275,7 +287,7 @@ void PQCLEAN_HQCRMRS256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
radix(f0, f1, f, PARAM_FFT); radix(f0, f1, f, PARAM_FFT);
// Step 4: Compute deltas // Step 4: Compute deltas
for (size_t i = 0 ; i < PARAM_M - 1 ; ++i) { for (i = 0 ; i < PARAM_M - 1 ; ++i) {
deltas[i] = PQCLEAN_HQCRMRS256_AVX2_gf_square(betas[i]) ^ betas[i]; deltas[i] = PQCLEAN_HQCRMRS256_AVX2_gf_square(betas[i]) ^ betas[i];
} }
@ -283,6 +295,7 @@ void PQCLEAN_HQCRMRS256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(u, f0, (f_coeffs + 1) / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas); fft_rec(v, f1, f_coeffs / 2, PARAM_M - 1, PARAM_FFT - 1, deltas);
k = 1 << (PARAM_M - 1);
// Step 6, 7 and error polynomial computation // Step 6, 7 and error polynomial computation
memcpy(w + k, v, 2 * k); memcpy(w + k, v, 2 * k);
@ -293,7 +306,7 @@ void PQCLEAN_HQCRMRS256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
w[k] ^= u[0]; w[k] ^= u[0];
// Find other roots // Find other roots
for (size_t i = 1 ; i < k ; ++i) { for (i = 1 ; i < k ; ++i) {
w[i] = u[i] ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(betas_sums[i], v[i]); w[i] = u[i] ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(betas_sums[i], v[i]);
w[k + i] ^= w[i]; w[k + i] ^= w[i];
} }
@ -311,17 +324,16 @@ void PQCLEAN_HQCRMRS256_AVX2_fft(uint16_t *w, const uint16_t *f, size_t f_coeffs
void PQCLEAN_HQCRMRS256_AVX2_fft_retrieve_error_poly(uint8_t *error, const uint16_t *w) { void PQCLEAN_HQCRMRS256_AVX2_fft_retrieve_error_poly(uint8_t *error, const uint16_t *w) {
uint16_t gammas[PARAM_M - 1] = {0}; uint16_t gammas[PARAM_M - 1] = {0};
uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0}; uint16_t gammas_sums[1 << (PARAM_M - 1)] = {0};
size_t k = 1 << (PARAM_M - 1); size_t i, k, index;
compute_fft_betas(gammas); compute_fft_betas(gammas);
compute_subset_sums(gammas_sums, gammas, PARAM_M - 1); compute_subset_sums(gammas_sums, gammas, PARAM_M - 1);
k = 1 << (PARAM_M - 1);
error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[0] >> 15);
error[0] ^= 1 ^ ((uint16_t) - w[k] >> 15); error[0] ^= 1 ^ ((uint16_t) - w[k] >> 15);
size_t index = PARAM_GF_MUL_ORDER; for (i = 1 ; i < k ; ++i) {
for (size_t i = 1 ; i < k ; ++i) {
index = PARAM_GF_MUL_ORDER - PQCLEAN_HQCRMRS256_AVX2_gf_log(gammas_sums[i]); index = PARAM_GF_MUL_ORDER - PQCLEAN_HQCRMRS256_AVX2_gf_log(gammas_sums[i]);
error[index] ^= 1 ^ ((uint16_t) - w[i] >> 15); error[index] ^= 1 ^ ((uint16_t) - w[i] >> 15);

View File

@ -30,29 +30,28 @@ uint16_t PQCLEAN_HQCRMRS256_AVX2_gf_log(uint16_t elt) {
* @param[in] deg_x The degree of polynomial x * @param[in] deg_x The degree of polynomial x
*/ */
static uint16_t gf_reduce(uint64_t x, size_t deg_x) { static uint16_t gf_reduce(uint64_t x, size_t deg_x) {
// Compute the distance between the primitive polynomial first two set bits uint16_t z1, z2, rmdr, dist;
size_t lz1 = __builtin_clz(PARAM_GF_POLY); uint64_t mod;
size_t lz2 = __builtin_clz(PARAM_GF_POLY ^ 1 << PARAM_M); size_t steps, i, j;
size_t dist = lz2 - lz1;
// Deduce the number of steps of reduction // Deduce the number of steps of reduction
size_t steps = CEIL_DIVIDE(deg_x - (PARAM_M - 1), dist); steps = CEIL_DIVIDE(deg_x - (PARAM_M - 1), PARAM_GF_POLY_M2);
// Reduce // Reduce
for (size_t i = 0; i < steps; ++i) { for (i = 0; i < steps; ++i) {
uint64_t mod = x >> PARAM_M; mod = x >> PARAM_M;
x &= (1 << PARAM_M) - 1; x &= (1 << PARAM_M) - 1;
x ^= mod; x ^= mod;
size_t tz1 = 0; z1 = 0;
uint16_t rmdr = PARAM_GF_POLY ^ 1; rmdr = PARAM_GF_POLY ^ 1;
for (size_t j = __builtin_popcount(PARAM_GF_POLY) - 2; j; --j) { for (j = PARAM_GF_POLY_WT - 2; j; --j) {
size_t tz2 = __builtin_ctz(rmdr); z2 = __tzcnt_u16(rmdr);
size_t shift = tz2 - tz1; dist = (uint16_t) (z2 - z1);
mod <<= shift; mod <<= dist;
x ^= mod; x ^= mod;
rmdr ^= 1 << tz2; rmdr ^= 1 << z2;
tz1 = tz2; z1 = z2;
} }
} }

View File

@ -335,9 +335,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)]; static __m256i W0[2 * (T_TM3_3W_256)], W1[2 * (T_TM3_3W_256)], W2[2 * (T_TM3_3W_256)], W3[2 * (T_TM3_3W_256)], W4[2 * (T_TM3_3W_256)];
static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3_3W_64 << 1; int32_t T2 = T_TM3_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) { for (int32_t i = 0 ; i < T_TM3_3W_256 - 1 ; i++) {
@ -354,24 +352,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) { for (int32_t i = T_TM3_3W_256 - 1 ; i < T_TM3_3W_256 ; i++) {
int32_t i4 = i << 2; int32_t i4 = i << 2;
int32_t i41 = i4 + 1; int32_t i41 = i4 + 1;
U0[i] = (__m256i) { U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
A[i4], A[i41], 0x0ul, 0x0ul V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
}; U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
V0[i] = (__m256i) { V1[i] = _mm256_set_epi64x(0, 0, B[i41 + T_TM3_3W_64 - 2], B[i4 + T_TM3_3W_64 - 2]);
B[i4], B[i41], 0x0ul, 0x0ul U2[i] = _mm256_set_epi64x(0, 0, A[i4 - 3 + T2], A[i4 - 4 + T2]);
}; V2[i] = _mm256_set_epi64x(0, 0, B[i4 - 3 + T2], B[i4 - 4 + T2]);
U1[i] = (__m256i) {
A[i4 + T_TM3_3W_64 - 2], A[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
V1[i] = (__m256i) {
B[i4 + T_TM3_3W_64 - 2], B[i41 + T_TM3_3W_64 - 2], 0x0ul, 0x0ul
};
U2[i] = (__m256i) {
A[i4 - 4 + T2], A[i4 - 3 + T2], 0x0ul, 0x0ul
};
V2[i] = (__m256i) {
B[i4 - 4 + T2], B[i4 - 3 + T2], 0x0ul, 0x0ul
};
} }
// Evaluation phase : x= X^64 // Evaluation phase : x= X^64
@ -459,9 +445,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W2 =(W2 + W3 + W4*(x^3+1))/(x+1) //W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
U1_64 = ((int64_t *) W4); U1_64 = ((int64_t *) W4);
__m256i *U1_256 = (__m256i *) (U1_64 + 1); __m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ (__m256i) { tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);
0x0ul, 0x0ul, 0x0ul, U1_64[0]
};
for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) { for (int32_t i = 1 ; i < (T_TM3_3W_256 << 1) - 1 ; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
@ -555,9 +539,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)]; __m256i W0[2 * (T_TM3R_3W_256 + 2)], W1[2 * (T_TM3R_3W_256 + 2)], W2[2 * (T_TM3R_3W_256 + 2)], W3[2 * (T_TM3R_3W_256 + 2)], W4[2 * (T_TM3R_3W_256 + 2)];
__m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3]; __m256i tmp[2 * (T_TM3R_3W_256 + 2) + 3];
__m256i ro256[tTM3R / 2]; __m256i ro256[tTM3R / 2];
const __m256i zero = (__m256i) { const __m256i zero = _mm256_setzero_si256();
0ul, 0ul, 0ul, 0ul
};
int32_t T2 = T_TM3R_3W_64 << 1; int32_t T2 = T_TM3R_3W_64 << 1;
for (int32_t i = 0 ; i < T_TM3R_3W_256 ; i++) { for (int32_t i = 0 ; i < T_TM3R_3W_256 ; i++) {