From 629b89ba73b831dada6720a1a37f3f9405969993 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Tue, 15 Sep 2020 10:33:06 -0400 Subject: [PATCH] readability changes --- crypto_kem/hqc-128/avx2/bch.c | 50 +++---- crypto_kem/hqc-128/avx2/code.c | 41 +++--- crypto_kem/hqc-128/avx2/gf2x.c | 102 +++++++------- crypto_kem/hqc-128/avx2/vector.c | 60 ++++----- crypto_kem/hqc-128/clean/bch.c | 50 +++---- crypto_kem/hqc-128/clean/vector.c | 75 +++-------- crypto_kem/hqc-192/avx2/bch.c | 50 +++---- crypto_kem/hqc-192/avx2/code.c | 41 +++--- crypto_kem/hqc-192/avx2/gf2x.c | 116 ++++++++-------- crypto_kem/hqc-192/avx2/vector.c | 60 ++++----- crypto_kem/hqc-192/clean/bch.c | 46 ++++--- crypto_kem/hqc-192/clean/vector.c | 75 +++-------- crypto_kem/hqc-256/avx2/bch.c | 50 +++---- crypto_kem/hqc-256/avx2/code.c | 66 ++++----- crypto_kem/hqc-256/avx2/gf2x.c | 133 ++++++++++--------- crypto_kem/hqc-256/avx2/vector.c | 60 ++++----- crypto_kem/hqc-256/clean/bch.c | 46 ++++--- crypto_kem/hqc-256/clean/vector.c | 75 +++-------- crypto_kem/hqc-rmrs-128/avx2/gf2x.c | 102 +++++++------- crypto_kem/hqc-rmrs-128/avx2/reed_muller.c | 88 ++++++------ crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c | 44 +++--- crypto_kem/hqc-rmrs-128/avx2/vector.c | 60 ++++----- crypto_kem/hqc-rmrs-128/clean/reed_solomon.c | 44 +++--- crypto_kem/hqc-rmrs-128/clean/vector.c | 75 +++-------- crypto_kem/hqc-rmrs-192/avx2/gf2x.c | 116 ++++++++-------- crypto_kem/hqc-rmrs-192/avx2/reed_muller.c | 88 ++++++------ crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c | 44 +++--- crypto_kem/hqc-rmrs-192/avx2/vector.c | 60 ++++----- crypto_kem/hqc-rmrs-192/clean/reed_solomon.c | 44 +++--- crypto_kem/hqc-rmrs-192/clean/vector.c | 75 +++-------- crypto_kem/hqc-rmrs-256/avx2/gf2x.c | 133 ++++++++++--------- crypto_kem/hqc-rmrs-256/avx2/reed_muller.c | 88 ++++++------ crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c | 44 +++--- crypto_kem/hqc-rmrs-256/avx2/vector.c | 60 ++++----- crypto_kem/hqc-rmrs-256/clean/reed_solomon.c | 44 +++--- crypto_kem/hqc-rmrs-256/clean/vector.c | 75 +++-------- test/duplicate_consistency/hqc-128_clean.yml | 2 + 37 files changed, 1134 insertions(+), 1348 deletions(-) diff --git a/crypto_kem/hqc-128/avx2/bch.c b/crypto_kem/hqc-128/avx2/bch.c index 6b93c391..519c82c6 100644 --- a/crypto_kem/hqc-128/avx2/bch.c +++ b/crypto_kem/hqc-128/avx2/bch.c @@ -35,52 +35,54 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma); * @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes */ static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) { - sigma[0] = 1; - size_t deg_sigma = 0; - size_t deg_sigma_p = 0; uint16_t sigma_copy[PARAM_DELTA - 1] = {0}; - size_t deg_sigma_copy = 0; - uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1}; - int32_t pp = -1; // 2*rho - uint16_t d_p = 1; - uint16_t d = syndromes[0]; + uint16_t X_sigma_p[PARAM_DELTA + 1] = {0}; + uint16_t d_p, d, dd; + uint16_t mask; + int32_t pp; // 2*rho + size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p; + d = syndromes[0]; + sigma[0] = 1; + X_sigma_p[1] = 1; + deg_sigma = 0; + deg_sigma_p = 0; + d_p = 1; + pp = -1; for (size_t mu = 0; mu < PARAM_DELTA; ++mu) { // Save sigma in case we need it to update X_sigma_p memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1)); deg_sigma_copy = deg_sigma; - uint16_t dd = PQCLEAN_HQC128_AVX2_gf_mul(d, PQCLEAN_HQC128_AVX2_gf_inverse(d_p)); // 0 if(d == 0) + dd = PQCLEAN_HQC128_AVX2_gf_mul(d, PQCLEAN_HQC128_AVX2_gf_inverse(d_p)); // 0 if(d == 0) for (size_t i = 1; (i <= 2 * mu + 1) && (i <= 
PARAM_DELTA); ++i) { sigma[i] ^= PQCLEAN_HQC128_AVX2_gf_mul(dd, X_sigma_p[i]); } - size_t deg_X = 2 * mu - pp; // 2*(mu-rho) - size_t deg_X_sigma_p = deg_X + deg_sigma_p; + deg_X_sigma_p = 2 * mu - pp + deg_sigma_p; - // mask1 = 0xffff if(d != 0) and 0 otherwise - int16_t mask1 = -((uint16_t) - d >> 15); + // mask = 0xffff if(d != 0) and 0 otherwise + mask = -((uint16_t) - d >> 15); - // mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise - int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); + // mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise + mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); - // mask12 = 0xffff if the deg_sigma increased and 0 otherwise - int16_t mask12 = mask1 & mask2; - deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma); + deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p); if (mu == PARAM_DELTA - 1) { break; } // Update pp, d_p and X_sigma_p if needed - pp = (mask12 & (2 * mu)) ^ (~mask12 & pp); - d_p = (mask12 & d) ^ (~mask12 & d_p); + pp ^= mask & (pp ^ (2 * mu)); + d_p ^= mask & (d_p ^ d); for (size_t i = PARAM_DELTA - 1; i; --i) { - X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]); + X_sigma_p[i + 1] = X_sigma_p[i - 1]; + X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]); } X_sigma_p[1] = 0; X_sigma_p[0] = 0; - deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p); + deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy); // Compute the next discrepancy d = syndromes[2 * mu + 2]; @@ -145,6 +147,7 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) { uint32_t *aux; int16_t *alpha_tmp; uint32_t i; + uint32_t nzflag; // static variable so that it is stored in the DATA segment // not in the STACK segment static uint8_t tmp_array[PARAM_N1 + 4]; // +4 to control overflow due to management of 256 bits @@ -169,7 +172,8 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) { alpha_tmp = table_alpha_ij + (j << 4); for (size_t i = 0; i < PARAM_N1; ++i) { - tmp_repeat = _mm256_set1_epi64x((long long)(tmp_array[i] != 0)); + nzflag = ((-(int32_t) tmp_array[i]) >> 31) & 1; + tmp_repeat = _mm256_set1_epi64x(nzflag); L = _mm256_cmpeq_epi64(tmp_repeat, un_256); tmp_repeat = _mm256_lddqu_si256((__m256i *)(alpha_tmp + i * (PARAM_DELTA << 1))); L = _mm256_and_si256(L, tmp_repeat); diff --git a/crypto_kem/hqc-128/avx2/code.c b/crypto_kem/hqc-128/avx2/code.c index 34d70b2d..fe8204d6 100644 --- a/crypto_kem/hqc-128/avx2/code.c +++ b/crypto_kem/hqc-128/avx2/code.c @@ -34,15 +34,18 @@ static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) { * @param[in] m Pointer to an array that is the message */ void PQCLEAN_HQC128_AVX2_code_encode(uint64_t *em, const uint64_t *m) { - uint64_t res; - uint32_t i; - static const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFUL, 0x3FFFFFFFUL}}; + const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFUL, 0x3FFFFFFFUL}}; + size_t i, pos_r; + uint64_t bit; + uint64_t idx_r; + uint64_t select; __m256i *colonne, y, aux0; __m256i msg = _mm256_lddqu_si256((const __m256i *) m); colonne = ((__m256i *) gen_matrix); + pos_r = 0; for (i = 0; i < PARAM_N1 - PARAM_K; i++) { // y is the and operation between m and ith column of G y = _mm256_and_si256(colonne[i], msg); @@ -54,34 +57,32 @@ void PQCLEAN_HQC128_AVX2_code_encode(uint64_t *em, const uint64_t *m) { aux0 = _mm256_shuffle_epi32(y, 0x4e); // y = (y0^y1^y2^y3 repeated 4 times) y = _mm256_xor_si256(aux0, y); - res = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1; + bit 
= _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1; - uint16_t pos_r = PARAM_N2 * i; - uint16_t idx_r = (pos_r & 0x3f); - uint64_t *p64 = em; - p64 += pos_r >> 6; - uint64_t select = mux(mask[0][0], mask[1][0], res); - *p64 ^= select << idx_r; - select = mux(mask[0][1], mask[1][1], res); - *(p64 + 1) ^= select >> ((63 - idx_r)); + idx_r = (pos_r & 0x3f); + select = mux(mask[0][0], mask[1][0], bit); + em[(pos_r >> 6) + 0] ^= select << idx_r; + select = mux(mask[0][1], mask[1][1], bit); + em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r)); + pos_r += PARAM_N2; } /* now we add the message m */ /* systematic encoding */ + pos_r = PARAM_N2 * (PARAM_N1 - PARAM_K); for (int32_t i = 0; i < 4; i++) { for (int32_t j = 0; j < 64; j++) { - uint8_t bit = (m[i] >> j) & 0x1; - uint32_t pos_r = PARAM_N2 * ((PARAM_N1 - PARAM_K) + ((i << 6) + j)); - uint16_t idx_r = (pos_r & 0x3f); - uint64_t *p64 = em; + bit = (m[i] >> j) & 0x1; - p64 += pos_r >> 6; - uint64_t select = mux(mask[0][0], mask[1][0], bit); - *p64 ^= select << idx_r; + idx_r = (pos_r & 0x3f); + select = mux(mask[0][0], mask[1][0], bit); + em[(pos_r >> 6) + 0] ^= select << idx_r; select = mux(mask[0][1], mask[1][1], bit); - *(p64 + 1) ^= select >> ((63 - idx_r)); + em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r)); + + pos_r += PARAM_N2; } } diff --git a/crypto_kem/hqc-128/avx2/gf2x.c b/crypto_kem/hqc-128/avx2/gf2x.c index d9faf703..e1fa89de 100644 --- a/crypto_kem/hqc-128/avx2/gf2x.c +++ b/crypto_kem/hqc-128/avx2/gf2x.c @@ -188,22 +188,23 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; + int32_t i, is, is2, is3; karat_mult_4( D0, A, B); karat_mult_4(D2, A + 4, B + 4); - for (int32_t i = 0; i < 4; i++) { - int is = i + 4; + for (i = 0; i < 4; i++) { + is = i + 4; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_4(D1, SAA, SBB); - for (int32_t i = 0; i < 4; i++) { - int32_t is = i + 4; - int32_t is2 = is + 4; - int32_t is3 = is2 + 4; + for (i = 0; i < 4; i++) { + is = i + 4; + is2 = is + 4; + is3 = is2 + 4; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -227,22 +228,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; + int32_t i, is, is2, is3; karat_mult_8( D0, A, B); karat_mult_8(D2, A + 8, B + 8); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; + for (i = 0; i < 8; i++) { + is = i + 8; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_8( D1, SAA, SBB); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; - int32_t is2 = is + 8; - int32_t is3 = is2 + 8; + for (i = 0; i < 8; i++) { + is = i + 8; + is2 = is + 8; + is3 = is2 + 8; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -266,22 +268,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; + int32_t i, is, is2, is3; karat_mult_16( D0, A, B); karat_mult_16(D2, A + 16, B + 16); - for (int32_t i = 0; i < 16; i++) { - int is = i + 16; + for (i = 0; i < 16; i++) { + is = i + 16; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_16( D1, SAA, SBB); - for (int32_t i = 0; i < 16; i++) { - int32_t is = i + 16; - int32_t is2 = is + 16; - int32_t is3 = is2 + 16; + for (i = 0; i < 16; i++) { + is = i + 16; + is2 = is + 16; + is3 = 
is2 + 16; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -329,11 +332,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)]; const __m256i zero = _mm256_setzero_si256(); + int64_t *U1_64; + int64_t *U2_64; + int64_t *V1_64; + int64_t *V2_64; int32_t T2 = T_TM3_3W_64 << 1; + int32_t i, i4, i41, i42; - for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) { - int32_t i4 = i << 2; - int32_t i42 = i4 - 2; + for (i = 0; i < T_TM3_3W_256 - 1; i++) { + i4 = i << 2; + i42 = i4 - 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64])); @@ -342,9 +350,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4])); } - for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { - int32_t i4 = i << 2; - int32_t i41 = i4 + 1; + for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; + i41 = i4 + 1; U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]); V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]); U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]); @@ -357,7 +365,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty) // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } @@ -366,23 +374,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { karat_mult_32( W1, W2, W3); //W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !) 
- int64_t *U1_64 = ((int64_t *) U1); - int64_t *U2_64 = ((int64_t *) U2); - - int64_t *V1_64 = ((int64_t *) V1); - int64_t *V2_64 = ((int64_t *) V2); - - W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); - W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); - U1_64 = ((int64_t *) U1); U2_64 = ((int64_t *) U2); V1_64 = ((int64_t *) V1); V2_64 = ((int64_t *) V2); - for (int32_t i = 1; i < T_TM3_3W_256; i++) { - int i4 = i << 2; + W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); + W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); + + for (i = 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1])); W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2])); @@ -391,13 +393,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { } //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } @@ -405,7 +407,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //W3 = W3 * W2 ; W2 = W0 * W4 karat_mult_32(tmp, W3, W2); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] = tmp[i]; } @@ -417,20 +419,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Interpolation phase // 9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x -> x = X^64 U1_64 = ((int64_t *) W2); U2_64 = ((int64_t *) W0); - for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) { - int32_t i4 = i << 2; + for (i = 0; i < (T_TM3_3W_256 << 1); i++) { + i4 = i << 2; W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1])); W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1])); } @@ -440,7 +442,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i *U1_256 = (__m256i *) (U1_64 + 1); tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0); - for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); } @@ -454,7 +456,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = (int64_t *) W1; __m256i *U2_256 = (__m256i *) (U2_64 + 1); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { + for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]); } @@ -462,19 +464,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W3[2 * (T_TM3_3W_256) - 1] = zero; //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 
2*(T_TM3_3W_256) - for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { ro256[i] = W0[i]; ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i]; ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i]; @@ -490,12 +492,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]); U2_256 = (__m256i *) (U2_64 - 2); - for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) { + for (i = 0; i < T_TM3_3W_256 << 1; i++) { _mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i])); _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i])); } - for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { + for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } diff --git a/crypto_kem/hqc-128/avx2/vector.c b/crypto_kem/hqc-128/avx2/vector.c index 0a3fb074..4b66690a 100644 --- a/crypto_kem/hqc-128/avx2/vector.c +++ b/crypto_kem/hqc-128/avx2/vector.c @@ -32,72 +32,63 @@ void PQCLEAN_HQC128_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; __m256i bit256[PARAM_OMEGA_R]; __m256i bloc256[PARAM_OMEGA_R]; - static __m256i posCmp256 = (__m256i) { - 0UL, 1UL, 2UL, 3UL - }; -#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256) - - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + __m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0); + uint64_t bloc, pos, bit64; + uint8_t inc; + size_t i, j; + + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + tmp[i] = ((uint32_t) rand_bytes[j++]) << 16; + tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8; + tmp[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (tmp[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + tmp[i] = tmp[i] % PARAM_N; + inc = 1; for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; + if (tmp[k] == tmp[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } + i += inc; } - for (uint32_t i = 0; i < weight; i++) { + for (i = 0; i < weight; i++) { // we store the bloc number and bit position of each vb[i] - uint64_t bloc = tmp[i] >> 6; + bloc = tmp[i] >> 6; bloc256[i] = _mm256_set1_epi64x(bloc >> 2); - uint64_t pos = (bloc & 0x3UL); + pos = (bloc & 0x3UL); __m256i pos256 = _mm256_set1_epi64x(pos); __m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256); - uint64_t bit64 = 1ULL << (tmp[i] & 0x3f); + bit64 = 1ULL << (tmp[i] & 0x3f); __m256i bloc256 = _mm256_set1_epi64x(bit64); bit256[i] = bloc256 & mask256; } - for (uint32_t i = 0; i < LOOP_SIZE; i++) { + for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) { __m256i aux = _mm256_loadu_si256(((__m256i *)v) + i); __m256i i256 = _mm256_set1_epi64x(i); - for (uint32_t j = 0; j < weight; j++) { + for (j = 0; j < weight; j++) { __m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256); aux ^= bit256[j] & mask256; } _mm256_storeu_si256(((__m256i *)v) + i, aux); } -#undef LOOP_SIZE } @@ -182,10 +173,9 @@ uint8_t PQCLEAN_HQC128_AVX2_vect_compare(const uint8_t *v1, const 
uint8_t *v2, u * @param[in] size_v Integer that is the size of the input vector in bits */ void PQCLEAN_HQC128_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) { + uint64_t mask = 0x7FFFFFFFFFFFFFFF; + int8_t val = 0; if (size_o < size_v) { - uint64_t mask = 0x7FFFFFFFFFFFFFFF; - int8_t val = 0; - if (size_o % 64) { val = 64 - (size_o % 64); } diff --git a/crypto_kem/hqc-128/clean/bch.c b/crypto_kem/hqc-128/clean/bch.c index 28bd8c8f..f932fa6a 100644 --- a/crypto_kem/hqc-128/clean/bch.c +++ b/crypto_kem/hqc-128/clean/bch.c @@ -28,12 +28,12 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma); static void unpack_message(uint8_t *message_unpacked, const uint64_t *message) { for (size_t i = 0; i < (VEC_K_SIZE_64 - (PARAM_K % 64 != 0)); ++i) { for (size_t j = 0; j < 64; ++j) { - message_unpacked[j + 64 * i] = (message[i] >> j) & 1; + message_unpacked[j + 64 * i] = (message[i] >> j) & 0x0000000000000001; } } for (int8_t j = 0; j < PARAM_K % 64; ++j) { - message_unpacked[j + 64 * (VEC_K_SIZE_64 - 1)] = (message[VEC_K_SIZE_64 - 1] >> j) & 1; + message_unpacked[j + 64 * (VEC_K_SIZE_64 - 1)] = (message[VEC_K_SIZE_64 - 1] >> j) & 0x0000000000000001; } } @@ -121,52 +121,54 @@ void PQCLEAN_HQC128_CLEAN_bch_code_encode(uint64_t *codeword, const uint64_t *me * @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes */ static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) { - sigma[0] = 1; - size_t deg_sigma = 0; - size_t deg_sigma_p = 0; uint16_t sigma_copy[PARAM_DELTA - 1] = {0}; - size_t deg_sigma_copy = 0; - uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1}; - int32_t pp = -1; // 2*rho - uint16_t d_p = 1; - uint16_t d = syndromes[0]; + uint16_t X_sigma_p[PARAM_DELTA + 1] = {0}; + uint16_t d_p, d, dd; + uint16_t mask; + int32_t pp; // 2*rho + size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p; + d = syndromes[0]; + sigma[0] = 1; + X_sigma_p[1] = 1; + deg_sigma = 0; + deg_sigma_p = 0; + d_p = 1; + pp = -1; for (size_t mu = 0; mu < PARAM_DELTA; ++mu) { // Save sigma in case we need it to update X_sigma_p memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1)); deg_sigma_copy = deg_sigma; - uint16_t dd = PQCLEAN_HQC128_CLEAN_gf_mul(d, PQCLEAN_HQC128_CLEAN_gf_inverse(d_p)); // 0 if(d == 0) + dd = PQCLEAN_HQC128_CLEAN_gf_mul(d, PQCLEAN_HQC128_CLEAN_gf_inverse(d_p)); // 0 if(d == 0) for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) { sigma[i] ^= PQCLEAN_HQC128_CLEAN_gf_mul(dd, X_sigma_p[i]); } - size_t deg_X = 2 * mu - pp; // 2*(mu-rho) - size_t deg_X_sigma_p = deg_X + deg_sigma_p; + deg_X_sigma_p = 2 * mu - pp + deg_sigma_p; - // mask1 = 0xffff if(d != 0) and 0 otherwise - int16_t mask1 = -((uint16_t) - d >> 15); + // mask = 0xffff if(d != 0) and 0 otherwise + mask = -((uint16_t) - d >> 15); - // mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise - int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); + // mask2 &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise + mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); - // mask12 = 0xffff if the deg_sigma increased and 0 otherwise - int16_t mask12 = mask1 & mask2; - deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma); + deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p); if (mu == PARAM_DELTA - 1) { break; } // Update pp, d_p and X_sigma_p if needed - pp = (mask12 & (2 * mu)) ^ (~mask12 & pp); - d_p = (mask12 & d) ^ (~mask12 & d_p); + pp ^= mask & (pp ^ (2 * mu)); + d_p ^= mask & (d_p ^ d); for (size_t i = PARAM_DELTA - 
1; i; --i) { - X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]); + X_sigma_p[i + 1] = X_sigma_p[i - 1]; + X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]); } X_sigma_p[1] = 0; X_sigma_p[0] = 0; - deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p); + deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy); // Compute the next discrepancy d = syndromes[2 * mu + 2]; diff --git a/crypto_kem/hqc-128/clean/vector.c b/crypto_kem/hqc-128/clean/vector.c index 198ace94..80812899 100644 --- a/crypto_kem/hqc-128/clean/vector.c +++ b/crypto_kem/hqc-128/clean/vector.c @@ -31,39 +31,33 @@ void PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; - uint8_t exist = 0; - size_t j = 0; + uint8_t inc; + size_t i, j; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + v[i] = ((uint32_t) rand_bytes[j++]) << 16; + v[i] |= ((uint32_t) rand_bytes[j++]) << 8; + v[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (v[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + v[i] = v[i] % PARAM_N; - for (uint32_t k = 0; k < i; k++) { - if (v[k] == random_data) { - exist = 1; + inc = 1; + for (size_t k = 0; k < i; k++) { + if (v[k] == v[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - v[i] = random_data; - } + i += inc; } } @@ -86,46 +80,11 @@ void PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_st * @param[in] ctx Pointer to the context of the seed expander */ void PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { - - size_t random_bytes_size = 3 * weight; - uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; - do { - if (j == random_bytes_size) { - seedexpander(ctx, rand_bytes, random_bytes_size); - j = 0; - } - - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; - - } while (random_data >= UTILS_REJECTION_THRESHOLD); - - random_data = random_data % PARAM_N; - - for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; - } - } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } - } + PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight); - for (uint16_t i = 0; i < weight; ++i) { + for (size_t i = 0; i < weight; ++i) { int32_t index = tmp[i] / 64; int32_t pos = tmp[i] % 64; v[index] |= ((uint64_t) 1) << pos; diff --git a/crypto_kem/hqc-192/avx2/bch.c b/crypto_kem/hqc-192/avx2/bch.c index 5d564836..84edb762 100644 --- a/crypto_kem/hqc-192/avx2/bch.c +++ b/crypto_kem/hqc-192/avx2/bch.c @@ -35,52 +35,54 @@ static void compute_roots(uint64_t *error, const uint16_t 
*sigma); * @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes */ static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) { - sigma[0] = 1; - size_t deg_sigma = 0; - size_t deg_sigma_p = 0; uint16_t sigma_copy[PARAM_DELTA - 1] = {0}; - size_t deg_sigma_copy = 0; - uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1}; - int32_t pp = -1; // 2*rho - uint16_t d_p = 1; - uint16_t d = syndromes[0]; + uint16_t X_sigma_p[PARAM_DELTA + 1] = {0}; + uint16_t d_p, d, dd; + uint16_t mask; + int32_t pp; // 2*rho + size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p; + d = syndromes[0]; + sigma[0] = 1; + X_sigma_p[1] = 1; + deg_sigma = 0; + deg_sigma_p = 0; + d_p = 1; + pp = -1; for (size_t mu = 0; mu < PARAM_DELTA; ++mu) { // Save sigma in case we need it to update X_sigma_p memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1)); deg_sigma_copy = deg_sigma; - uint16_t dd = PQCLEAN_HQC192_AVX2_gf_mul(d, PQCLEAN_HQC192_AVX2_gf_inverse(d_p)); // 0 if(d == 0) + dd = PQCLEAN_HQC192_AVX2_gf_mul(d, PQCLEAN_HQC192_AVX2_gf_inverse(d_p)); // 0 if(d == 0) for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) { sigma[i] ^= PQCLEAN_HQC192_AVX2_gf_mul(dd, X_sigma_p[i]); } - size_t deg_X = 2 * mu - pp; // 2*(mu-rho) - size_t deg_X_sigma_p = deg_X + deg_sigma_p; + deg_X_sigma_p = 2 * mu - pp + deg_sigma_p; - // mask1 = 0xffff if(d != 0) and 0 otherwise - int16_t mask1 = -((uint16_t) - d >> 15); + // mask = 0xffff if(d != 0) and 0 otherwise + mask = -((uint16_t) - d >> 15); - // mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise - int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); + // mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise + mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); - // mask12 = 0xffff if the deg_sigma increased and 0 otherwise - int16_t mask12 = mask1 & mask2; - deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma); + deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p); if (mu == PARAM_DELTA - 1) { break; } // Update pp, d_p and X_sigma_p if needed - pp = (mask12 & (2 * mu)) ^ (~mask12 & pp); - d_p = (mask12 & d) ^ (~mask12 & d_p); + pp ^= mask & (pp ^ (2 * mu)); + d_p ^= mask & (d_p ^ d); for (size_t i = PARAM_DELTA - 1; i; --i) { - X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]); + X_sigma_p[i + 1] = X_sigma_p[i - 1]; + X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]); } X_sigma_p[1] = 0; X_sigma_p[0] = 0; - deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p); + deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy); // Compute the next discrepancy d = syndromes[2 * mu + 2]; @@ -145,6 +147,7 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) { uint32_t *aux; int16_t *alpha_tmp; uint32_t i; + uint32_t nzflag; // static variable so that it is stored in the DATA segment // not in the STACK segment static uint8_t tmp_array[PARAM_N1 + 4]; // +4 to control overflow due to management of 256 bits @@ -169,7 +172,8 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) { alpha_tmp = table_alpha_ij + (j << 4); for (size_t i = 0; i < PARAM_N1; ++i) { - tmp_repeat = _mm256_set1_epi64x((long long)(tmp_array[i] != 0)); + nzflag = ((-(int32_t) tmp_array[i]) >> 31) & 1; + tmp_repeat = _mm256_set1_epi64x(nzflag); L = _mm256_cmpeq_epi64(tmp_repeat, un_256); tmp_repeat = _mm256_lddqu_si256((__m256i *)(alpha_tmp + i * (PARAM_DELTA << 1))); L = _mm256_and_si256(L, tmp_repeat); diff --git a/crypto_kem/hqc-192/avx2/code.c 
b/crypto_kem/hqc-192/avx2/code.c index e5529347..ccc61bdb 100644 --- a/crypto_kem/hqc-192/avx2/code.c +++ b/crypto_kem/hqc-192/avx2/code.c @@ -34,15 +34,18 @@ static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) { * @param[in] m Pointer to an array that is the message */ void PQCLEAN_HQC192_AVX2_code_encode(uint64_t *em, const uint64_t *m) { - uint64_t res; - uint32_t i; - static const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFFFFFFFFUL, 0x3FFFFFFFFFFFFFFUL}}; + const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFFFFFFFFUL, 0x3FFFFFFFFFFFFFFUL}}; + size_t i, pos_r; + uint64_t bit; + uint16_t idx_r; + uint64_t select; __m256i *colonne, y, aux0; __m256i msg = _mm256_lddqu_si256((const __m256i *) m); colonne = ((__m256i *) gen_matrix); + pos_r = 0; for (i = 0; i < PARAM_N1 - PARAM_K; i++) { // y is the and operation between m and ith column of G y = _mm256_and_si256(colonne[i], msg); @@ -54,34 +57,32 @@ void PQCLEAN_HQC192_AVX2_code_encode(uint64_t *em, const uint64_t *m) { aux0 = _mm256_shuffle_epi32(y, 0x4e); // y = (y0^y1^y2^y3 repeated 4 times) y = _mm256_xor_si256(aux0, y); - res = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1; + bit = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1; - uint16_t pos_r = PARAM_N2 * i; - uint16_t idx_r = (pos_r & 0x3f); - uint64_t *p64 = em; - p64 += pos_r >> 6; - uint64_t select = mux(mask[0][0], mask[1][0], res); - *p64 ^= select << idx_r; - select = mux(mask[0][1], mask[1][1], res); - *(p64 + 1) ^= select >> ((63 - idx_r)); + idx_r = (pos_r & 0x3f); + select = mux(mask[0][0], mask[1][0], bit); + em[(pos_r >> 6) + 0] ^= select << idx_r; + select = mux(mask[0][1], mask[1][1], bit); + em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r)); + pos_r += PARAM_N2; } /* now we add the message m */ /* systematic encoding */ + pos_r = PARAM_N2 * (PARAM_N1 - PARAM_K); for (int32_t i = 0; i < 4; i++) { for (int32_t j = 0; j < 64; j++) { - uint8_t bit = (m[i] >> j) & 0x1; - uint32_t pos_r = PARAM_N2 * ((PARAM_N1 - PARAM_K) + ((i << 6) + j)); - uint16_t idx_r = (pos_r & 0x3f); - uint64_t *p64 = em; + bit = (m[i] >> j) & 0x1; - p64 += pos_r >> 6; - uint64_t select = mux(mask[0][0], mask[1][0], bit); - *p64 ^= select << idx_r; + idx_r = (pos_r & 0x3f); + select = mux(mask[0][0], mask[1][0], bit); + em[(pos_r >> 6) + 0] ^= select << idx_r; select = mux(mask[0][1], mask[1][1], bit); - *(p64 + 1) ^= select >> ((63 - idx_r)); + em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r)); + + pos_r += PARAM_N2; } } diff --git a/crypto_kem/hqc-192/avx2/gf2x.c b/crypto_kem/hqc-192/avx2/gf2x.c index 1688c050..ade1f48d 100644 --- a/crypto_kem/hqc-192/avx2/gf2x.c +++ b/crypto_kem/hqc-192/avx2/gf2x.c @@ -188,23 +188,24 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { * @param[in] B Pointer to the polynomial B(x) */ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { + int32_t i, is, is2, is3; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; karat_mult_4( D0, A, B); karat_mult_4(D2, A + 4, B + 4); - for (int32_t i = 0; i < 4; i++) { - int is = i + 4; + for (i = 0; i < 4; i++) { + is = i + 4; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_4(D1, SAA, SBB); - for (int32_t i = 0; i < 4; i++) { - int32_t is = i + 4; - int32_t is2 = is + 4; - int32_t is3 = is2 + 4; + for (i = 0; i < 4; i++) { + is = i + 4; + is2 = is + 4; + is3 = is2 + 4; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -228,22 +229,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_16(__m256i *C, __m256i 
*A, __m256i *B) { __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; + int32_t i, is, is2, is3; karat_mult_8( D0, A, B); karat_mult_8(D2, A + 8, B + 8); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; + for (i = 0; i < 8; i++) { + is = i + 8; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_8( D1, SAA, SBB); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; - int32_t is2 = is + 8; - int32_t is3 = is2 + 8; + for (i = 0; i < 8; i++) { + is = i + 8; + is2 = is + 8; + is3 = is2 + 8; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -267,22 +269,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; + int32_t i, is, is2, is3; karat_mult_16( D0, A, B); karat_mult_16(D2, A + 16, B + 16); - for (int32_t i = 0; i < 16; i++) { - int is = i + 16; + for (i = 0; i < 16; i++) { + is = i + 16; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_16( D1, SAA, SBB); - for (int32_t i = 0; i < 16; i++) { - int32_t is = i + 16; - int32_t is2 = is + 16; - int32_t is3 = is2 + 16; + for (i = 0; i < 16; i++) { + is = i + 16; + is2 = is + 16; + is3 = is2 + 16; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -306,21 +309,22 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[64], D1[64], D2[64], SAA[32], SBB[32]; + int32_t i, is, is2, is3; karat_mult_32( D0, A, B); karat_mult_32(D2, A + 32, B + 32); - for (int32_t i = 0; i < 32; i++) { - int32_t is = i + 32; + for (i = 0; i < 32; i++) { + is = i + 32; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_32( D1, SAA, SBB); - for (int32_t i = 0; i < 32; i++) { - int32_t is = i + 32; - int32_t is2 = is + 32; - int32_t is3 = is2 + 32; + for (i = 0; i < 32; i++) { + is = i + 32; + is2 = is + 32; + is3 = is2 + 32; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -369,11 +373,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)]; const __m256i zero = _mm256_setzero_si256(); + int64_t *U1_64; + int64_t *U2_64; + int64_t *V1_64; + int64_t *V2_64; int32_t T2 = T_TM3_3W_64 << 1; + int32_t i, i4, i41, i42; - for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) { - int32_t i4 = i << 2; - int32_t i42 = i4 - 2; + for (i = 0; i < T_TM3_3W_256 - 1; i++) { + i4 = i << 2; + i42 = i4 - 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64])); @@ -382,9 +391,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4])); } - for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { - int32_t i4 = i << 2; - int32_t i41 = i4 + 1; + for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; + i41 = i4 + 1; U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]); V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]); U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]); @@ -397,7 +406,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty) // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i 
< T_TM3_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } @@ -406,23 +415,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { karat_mult_64( W1, W2, W3); //W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !) - int64_t *U1_64 = ((int64_t *) U1); - int64_t *U2_64 = ((int64_t *) U2); - - int64_t *V1_64 = ((int64_t *) V1); - int64_t *V2_64 = ((int64_t *) V2); - - W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); - W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); - U1_64 = ((int64_t *) U1); U2_64 = ((int64_t *) U2); V1_64 = ((int64_t *) V1); V2_64 = ((int64_t *) V2); - for (int32_t i = 1; i < T_TM3_3W_256; i++) { - int i4 = i << 2; + W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); + W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); + + for (i = 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1])); W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2])); @@ -431,21 +434,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { } //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } - karat_mult_64(tmp, W3, W2); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] = tmp[i]; } @@ -457,20 +459,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Interpolation phase // 9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x -> x = X^64 U1_64 = ((int64_t *) W2); U2_64 = ((int64_t *) W0); - for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) { - int32_t i4 = i << 2; + for (i = 0; i < (T_TM3_3W_256 << 1); i++) { + i4 = i << 2; W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1])); W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1])); } @@ -480,7 +482,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i *U1_256 = (__m256i *) (U1_64 + 1); tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0); - for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); } @@ -494,7 +496,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = (int64_t *) W1; __m256i *U2_256 = (__m256i *) (U2_64 + 1); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { + for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]); } @@ -502,19 +504,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W3[2 * (T_TM3_3W_256) - 1] = zero; //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * 
(T_TM3_3W_256); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256) - for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { ro256[i] = W0[i]; ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i]; ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i]; @@ -530,12 +532,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]); U2_256 = (__m256i *) (U2_64 - 2); - for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) { + for (i = 0; i < T_TM3_3W_256 << 1; i++) { _mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i])); _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i])); } - for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { + for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } diff --git a/crypto_kem/hqc-192/avx2/vector.c b/crypto_kem/hqc-192/avx2/vector.c index 1eaf9571..53a021e0 100644 --- a/crypto_kem/hqc-192/avx2/vector.c +++ b/crypto_kem/hqc-192/avx2/vector.c @@ -32,72 +32,63 @@ void PQCLEAN_HQC192_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; __m256i bit256[PARAM_OMEGA_R]; __m256i bloc256[PARAM_OMEGA_R]; - static __m256i posCmp256 = (__m256i) { - 0UL, 1UL, 2UL, 3UL - }; -#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256) - - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + __m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0); + uint64_t bloc, pos, bit64; + uint8_t inc; + size_t i, j; + + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + tmp[i] = ((uint32_t) rand_bytes[j++]) << 16; + tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8; + tmp[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (tmp[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + tmp[i] = tmp[i] % PARAM_N; + inc = 1; for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; + if (tmp[k] == tmp[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } + i += inc; } - for (uint32_t i = 0; i < weight; i++) { + for (i = 0; i < weight; i++) { // we store the bloc number and bit position of each vb[i] - uint64_t bloc = tmp[i] >> 6; + bloc = tmp[i] >> 6; bloc256[i] = _mm256_set1_epi64x(bloc >> 2); - uint64_t pos = (bloc & 0x3UL); + pos = (bloc & 0x3UL); __m256i pos256 = _mm256_set1_epi64x(pos); __m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256); - uint64_t bit64 = 1ULL << (tmp[i] & 0x3f); + bit64 = 1ULL << (tmp[i] & 0x3f); __m256i bloc256 = _mm256_set1_epi64x(bit64); bit256[i] = bloc256 & mask256; } - for (uint32_t i = 0; i < LOOP_SIZE; i++) { + for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) { __m256i aux = _mm256_loadu_si256(((__m256i *)v) + i); __m256i i256 = _mm256_set1_epi64x(i); - for (uint32_t j = 0; j < weight; j++) { + for (j = 0; j < weight; j++) { __m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256); aux ^= bit256[j] & mask256; } 
_mm256_storeu_si256(((__m256i *)v) + i, aux); } -#undef LOOP_SIZE } @@ -182,10 +173,9 @@ uint8_t PQCLEAN_HQC192_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v2, u * @param[in] size_v Integer that is the size of the input vector in bits */ void PQCLEAN_HQC192_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) { + uint64_t mask = 0x7FFFFFFFFFFFFFFF; + int8_t val = 0; if (size_o < size_v) { - uint64_t mask = 0x7FFFFFFFFFFFFFFF; - int8_t val = 0; - if (size_o % 64) { val = 64 - (size_o % 64); } diff --git a/crypto_kem/hqc-192/clean/bch.c b/crypto_kem/hqc-192/clean/bch.c index 8f970a82..d092fc1e 100644 --- a/crypto_kem/hqc-192/clean/bch.c +++ b/crypto_kem/hqc-192/clean/bch.c @@ -121,52 +121,54 @@ void PQCLEAN_HQC192_CLEAN_bch_code_encode(uint64_t *codeword, const uint64_t *me * @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes */ static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) { - sigma[0] = 1; - size_t deg_sigma = 0; - size_t deg_sigma_p = 0; uint16_t sigma_copy[PARAM_DELTA - 1] = {0}; - size_t deg_sigma_copy = 0; - uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1}; - int32_t pp = -1; // 2*rho - uint16_t d_p = 1; - uint16_t d = syndromes[0]; + uint16_t X_sigma_p[PARAM_DELTA + 1] = {0}; + uint16_t d_p, d, dd; + uint16_t mask; + int32_t pp; // 2*rho + size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p; + d = syndromes[0]; + sigma[0] = 1; + X_sigma_p[1] = 1; + deg_sigma = 0; + deg_sigma_p = 0; + d_p = 1; + pp = -1; for (size_t mu = 0; mu < PARAM_DELTA; ++mu) { // Save sigma in case we need it to update X_sigma_p memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1)); deg_sigma_copy = deg_sigma; - uint16_t dd = PQCLEAN_HQC192_CLEAN_gf_mul(d, PQCLEAN_HQC192_CLEAN_gf_inverse(d_p)); // 0 if(d == 0) + dd = PQCLEAN_HQC192_CLEAN_gf_mul(d, PQCLEAN_HQC192_CLEAN_gf_inverse(d_p)); // 0 if(d == 0) for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) { sigma[i] ^= PQCLEAN_HQC192_CLEAN_gf_mul(dd, X_sigma_p[i]); } - size_t deg_X = 2 * mu - pp; // 2*(mu-rho) - size_t deg_X_sigma_p = deg_X + deg_sigma_p; + deg_X_sigma_p = 2 * mu - pp + deg_sigma_p; - // mask1 = 0xffff if(d != 0) and 0 otherwise - int16_t mask1 = -((uint16_t) - d >> 15); + // mask = 0xffff if(d != 0) and 0 otherwise + mask = -((uint16_t) - d >> 15); - // mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise - int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); + // mask2 &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise + mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); - // mask12 = 0xffff if the deg_sigma increased and 0 otherwise - int16_t mask12 = mask1 & mask2; - deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma); + deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p); if (mu == PARAM_DELTA - 1) { break; } // Update pp, d_p and X_sigma_p if needed - pp = (mask12 & (2 * mu)) ^ (~mask12 & pp); - d_p = (mask12 & d) ^ (~mask12 & d_p); + pp ^= mask & (pp ^ (2 * mu)); + d_p ^= mask & (d_p ^ d); for (size_t i = PARAM_DELTA - 1; i; --i) { - X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]); + X_sigma_p[i + 1] = X_sigma_p[i - 1]; + X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]); } X_sigma_p[1] = 0; X_sigma_p[0] = 0; - deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p); + deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy); // Compute the next discrepancy d = syndromes[2 * mu + 2]; diff --git a/crypto_kem/hqc-192/clean/vector.c 
b/crypto_kem/hqc-192/clean/vector.c index 33a95650..34fd819c 100644 --- a/crypto_kem/hqc-192/clean/vector.c +++ b/crypto_kem/hqc-192/clean/vector.c @@ -31,39 +31,33 @@ void PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; - uint8_t exist = 0; - size_t j = 0; + uint8_t inc; + size_t i, j; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + v[i] = ((uint32_t) rand_bytes[j++]) << 16; + v[i] |= ((uint32_t) rand_bytes[j++]) << 8; + v[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (v[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + v[i] = v[i] % PARAM_N; - for (uint32_t k = 0; k < i; k++) { - if (v[k] == random_data) { - exist = 1; + inc = 1; + for (size_t k = 0; k < i; k++) { + if (v[k] == v[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - v[i] = random_data; - } + i += inc; } } @@ -86,46 +80,11 @@ void PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_st * @param[in] ctx Pointer to the context of the seed expander */ void PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { - - size_t random_bytes_size = 3 * weight; - uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; - do { - if (j == random_bytes_size) { - seedexpander(ctx, rand_bytes, random_bytes_size); - j = 0; - } - - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; - - } while (random_data >= UTILS_REJECTION_THRESHOLD); - - random_data = random_data % PARAM_N; - - for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; - } - } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } - } + PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight); - for (uint16_t i = 0; i < weight; ++i) { + for (size_t i = 0; i < weight; ++i) { int32_t index = tmp[i] / 64; int32_t pos = tmp[i] % 64; v[index] |= ((uint64_t) 1) << pos; diff --git a/crypto_kem/hqc-256/avx2/bch.c b/crypto_kem/hqc-256/avx2/bch.c index 3fc0a467..e6ee3168 100644 --- a/crypto_kem/hqc-256/avx2/bch.c +++ b/crypto_kem/hqc-256/avx2/bch.c @@ -35,52 +35,54 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma); * @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes */ static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) { - sigma[0] = 1; - size_t deg_sigma = 0; - size_t deg_sigma_p = 0; uint16_t sigma_copy[PARAM_DELTA - 1] = {0}; - size_t deg_sigma_copy = 0; - uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1}; - int32_t pp = -1; // 2*rho - uint16_t d_p = 1; - uint16_t d = syndromes[0]; + uint16_t X_sigma_p[PARAM_DELTA + 1] = {0}; + 
uint16_t d_p, d, dd; + uint16_t mask; + int32_t pp; // 2*rho + size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p; + d = syndromes[0]; + sigma[0] = 1; + X_sigma_p[1] = 1; + deg_sigma = 0; + deg_sigma_p = 0; + d_p = 1; + pp = -1; for (size_t mu = 0; mu < PARAM_DELTA; ++mu) { // Save sigma in case we need it to update X_sigma_p memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1)); deg_sigma_copy = deg_sigma; - uint16_t dd = PQCLEAN_HQC256_AVX2_gf_mul(d, PQCLEAN_HQC256_AVX2_gf_inverse(d_p)); // 0 if(d == 0) + dd = PQCLEAN_HQC256_AVX2_gf_mul(d, PQCLEAN_HQC256_AVX2_gf_inverse(d_p)); // 0 if(d == 0) for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) { sigma[i] ^= PQCLEAN_HQC256_AVX2_gf_mul(dd, X_sigma_p[i]); } - size_t deg_X = 2 * mu - pp; // 2*(mu-rho) - size_t deg_X_sigma_p = deg_X + deg_sigma_p; + deg_X_sigma_p = 2 * mu - pp + deg_sigma_p; - // mask1 = 0xffff if(d != 0) and 0 otherwise - int16_t mask1 = -((uint16_t) - d >> 15); + // mask = 0xffff if(d != 0) and 0 otherwise + mask = -((uint16_t) - d >> 15); - // mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise - int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); + // mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise + mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); - // mask12 = 0xffff if the deg_sigma increased and 0 otherwise - int16_t mask12 = mask1 & mask2; - deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma); + deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p); if (mu == PARAM_DELTA - 1) { break; } // Update pp, d_p and X_sigma_p if needed - pp = (mask12 & (2 * mu)) ^ (~mask12 & pp); - d_p = (mask12 & d) ^ (~mask12 & d_p); + pp ^= mask & (pp ^ (2 * mu)); + d_p ^= mask & (d_p ^ d); for (size_t i = PARAM_DELTA - 1; i; --i) { - X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]); + X_sigma_p[i + 1] = X_sigma_p[i - 1]; + X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]); } X_sigma_p[1] = 0; X_sigma_p[0] = 0; - deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p); + deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy); // Compute the next discrepancy d = syndromes[2 * mu + 2]; @@ -145,6 +147,7 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) { uint32_t *aux; int16_t *alpha_tmp; uint32_t i; + uint32_t nzflag; // static variable so that it is stored in the DATA segment // not in the STACK segment static uint8_t tmp_array[PARAM_N1 + 4]; // +4 to control overflow due to management of 256 bits @@ -169,7 +172,8 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) { alpha_tmp = table_alpha_ij + (j << 4); for (size_t i = 0; i < PARAM_N1; ++i) { - tmp_repeat = _mm256_set1_epi64x((long long)(tmp_array[i] != 0)); + nzflag = ((-(int32_t) tmp_array[i]) >> 31) & 1; + tmp_repeat = _mm256_set1_epi64x(nzflag); L = _mm256_cmpeq_epi64(tmp_repeat, un_256); tmp_repeat = _mm256_lddqu_si256((__m256i *)(alpha_tmp + i * (PARAM_DELTA << 1))); L = _mm256_and_si256(L, tmp_repeat); diff --git a/crypto_kem/hqc-256/avx2/code.c b/crypto_kem/hqc-256/avx2/code.c index 0d22af5c..b5f4a6d7 100644 --- a/crypto_kem/hqc-256/avx2/code.c +++ b/crypto_kem/hqc-256/avx2/code.c @@ -34,15 +34,19 @@ static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) { * @param[in] m Pointer to an array that is the message */ void PQCLEAN_HQC256_AVX2_code_encode(uint64_t *em, const uint64_t *m) { - uint64_t res; - uint32_t i; - static const uint64_t mask[2][3] = {{0x0UL, 0x0UL, 0x0UL}, {0xFFFFFFFFFFFFFFFFUL, 0xFFFFFFFFFFFFFFFFUL, 
0x3FFFFFUL}}; + const uint64_t mask[2][3] = {{0x0UL, 0x0UL, 0x0UL}, {0xFFFFFFFFFFFFFFFFUL, 0xFFFFFFFFFFFFFFFFUL, 0x3FFFFFUL}}; + size_t i, pos_r; + uint64_t bit; + uint64_t idx_r; + uint64_t idx_2; + uint64_t select; __m256i *colonne, y, aux0; __m256i msg = _mm256_lddqu_si256((const __m256i *) m); colonne = ((__m256i *) gen_matrix); + pos_r = 0; for (i = 0; i < PARAM_N1 - PARAM_K; i++) { // y is the and operation between m and ith column of G y = _mm256_and_si256(colonne[i], msg); @@ -54,44 +58,40 @@ void PQCLEAN_HQC256_AVX2_code_encode(uint64_t *em, const uint64_t *m) { aux0 = _mm256_shuffle_epi32(y, 0x4e); // y = (y0^y1^y2^y3 repeated 4 times) y = _mm256_xor_si256(aux0, y); - res = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1; - - - uint16_t pos_r = PARAM_N2 * i; - uint16_t idx_r = (pos_r & 0x3f); - uint64_t *p64 = em; - p64 += pos_r >> 6; - uint64_t select = mux(mask[0][0], mask[1][0], res); - *p64 ^= select << idx_r; - int64_t aux = (41 - idx_r); - uint64_t aux2 = (aux > 0); - uint64_t idx2 = aux * aux2; - select = mux(mask[0][1], mask[1][1], res); - *(p64 + 1) ^= select >> idx2; - select = mux(mask[0][2], mask[1][2], res); - *(p64 + 2) ^= select >> ((63 - idx_r)); + bit = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1; + + + idx_r = (pos_r & 0x3f); + idx_2 = 41 - idx_r; + idx_2 &= (uint64_t) (-((int64_t)idx_2) >> 63); + select = mux(mask[0][0], mask[1][0], bit); + em[(pos_r >> 6) + 0] ^= select << idx_r; + select = mux(mask[0][1], mask[1][1], bit); + em[(pos_r >> 6) + 1] ^= select >> idx_2; + select = mux(mask[0][2], mask[1][2], bit); + em[(pos_r >> 6) + 2] ^= select >> ((63 - idx_r)); + pos_r += PARAM_N2; } /* now we add the message m */ /* systematic encoding */ + pos_r = PARAM_N2 * (PARAM_N1 - PARAM_K); for (int32_t i = 0; i < 4; i++) { for (int32_t j = 0; j < 64; j++) { - uint8_t bit = (m[i] >> j) & 0x1; - uint32_t pos_r = PARAM_N2 * ((PARAM_N1 - PARAM_K) + ((i << 6) + j)); - uint16_t idx_r = (pos_r & 0x3f); - uint64_t *p64 = em; - - - p64 += pos_r >> 6; - uint64_t select = mux(mask[0][0], mask[1][0], bit); - *p64 ^= select << idx_r; - int64_t aux = (41 - idx_r); - uint64_t aux2 = (aux > 0); - uint64_t idx2 = aux * aux2; + bit = (m[i] >> j) & 0x1; + + + idx_r = (pos_r & 0x3f); + idx_2 = 41 - idx_r; + idx_2 &= (uint64_t) (-((int64_t)idx_2) >> 63); + select = mux(mask[0][0], mask[1][0], bit); + em[(pos_r >> 6) + 0] ^= select << idx_r; select = mux(mask[0][1], mask[1][1], bit); - *(p64 + 1) ^= select >> idx2; + em[(pos_r >> 6) + 1] ^= select >> idx_2; select = mux(mask[0][2], mask[1][2], bit); - *(p64 + 2) ^= select >> ((63 - idx_r)); + em[(pos_r >> 6) + 2] ^= select >> ((63 - idx_r)); + + pos_r += PARAM_N2; } } diff --git a/crypto_kem/hqc-256/avx2/gf2x.c b/crypto_kem/hqc-256/avx2/gf2x.c index cfcaae31..55cc0528 100644 --- a/crypto_kem/hqc-256/avx2/gf2x.c +++ b/crypto_kem/hqc-256/avx2/gf2x.c @@ -232,23 +232,24 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { * @param[in] B Pointer to the polynomial B(x) */ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { + int32_t i, is, is2, is3; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; karat_mult_8( D0, A, B); karat_mult_8(D2, A + 8, B + 8); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; + for (i = 0; i < 8; i++) { + is = i + 8; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_8( D1, SAA, SBB); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; - int32_t is2 = is + 8; - int32_t is3 = is2 + 8; + for (i = 0; i < 8; i++) { + is = i + 8; + is2 = is + 8; + is3 = is2 + 8; 
__m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -272,22 +273,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; + int32_t i, is, is2, is3; karat_mult_16( D0, A, B); karat_mult_16(D2, A + 16, B + 16); - for (int32_t i = 0; i < 16; i++) { - int is = i + 16; + for (i = 0; i < 16; i++) { + is = i + 16; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_16( D1, SAA, SBB); - for (int32_t i = 0; i < 16; i++) { - int32_t is = i + 16; - int32_t is2 = is + 16; - int32_t is3 = is2 + 16; + for (i = 0; i < 16; i++) { + is = i + 16; + is2 = is + 16; + is3 = is2 + 16; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -299,7 +301,6 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { } - /** * @brief Compute B(x) = A(x)/(x+1) * @@ -336,11 +337,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)]; const __m256i zero = _mm256_setzero_si256(); + int64_t *U1_64; + int64_t *U2_64; + int64_t *V1_64; + int64_t *V2_64; int32_t T2 = T_TM3_3W_64 << 1; + int32_t i, i4, i41, i42; - for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) { - int32_t i4 = i << 2; - int32_t i42 = i4 - 2; + for (i = 0; i < T_TM3_3W_256 - 1; i++) { + i4 = i << 2; + i42 = i4 - 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64])); @@ -349,9 +355,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4])); } - for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { - int32_t i4 = i << 2; - int32_t i41 = i4 + 1; + for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; + i41 = i4 + 1; U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]); V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]); U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]); @@ -364,7 +370,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty) // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } @@ -373,23 +379,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { karat_mult_32( W1, W2, W3); //W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !) 
- int64_t *U1_64 = ((int64_t *) U1); - int64_t *U2_64 = ((int64_t *) U2); - - int64_t *V1_64 = ((int64_t *) V1); - int64_t *V2_64 = ((int64_t *) V2); - - W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); - W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); - U1_64 = ((int64_t *) U1); U2_64 = ((int64_t *) U2); V1_64 = ((int64_t *) V1); V2_64 = ((int64_t *) V2); - for (int32_t i = 1; i < T_TM3_3W_256; i++) { - int i4 = i << 2; + W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); + W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); + + for (i = 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1])); W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2])); @@ -398,13 +398,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { } //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } @@ -412,7 +412,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //W3 = W3 * W2 ; W2 = W0 * W4 karat_mult_32(tmp, W3, W2); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] = tmp[i]; } @@ -424,20 +424,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Interpolation phase // 9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x -> x = X^64 U1_64 = ((int64_t *) W2); U2_64 = ((int64_t *) W0); - for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) { - int32_t i4 = i << 2; + for (i = 0; i < (T_TM3_3W_256 << 1); i++) { + i4 = i << 2; W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1])); W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1])); } @@ -447,7 +447,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i *U1_256 = (__m256i *) (U1_64 + 1); tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0); - for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); } @@ -461,7 +461,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = (int64_t *) W1; __m256i *U2_256 = (__m256i *) (U2_64 + 1); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { + for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]); } @@ -469,19 +469,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W3[2 * (T_TM3_3W_256) - 1] = zero; //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 
2*(T_TM3_3W_256) - for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { ro256[i] = W0[i]; ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i]; ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i]; @@ -497,12 +497,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]); U2_256 = (__m256i *) (U2_64 - 2); - for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) { + for (i = 0; i < T_TM3_3W_256 << 1; i++) { _mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i])); _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i])); } - for (int32_t i = 0; i < 6 * T_TM3_3W_256 - 2; i++) { + for (i = 0; i < 6 * T_TM3_3W_256 - 2; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } @@ -541,9 +541,10 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i ro256[tTM3R / 2]; const __m256i zero = _mm256_setzero_si256(); int32_t T2 = T_TM3R_3W_64 << 1; + int32_t i, i1, i4; - for (int32_t i = 0; i < T_TM3R_3W_256; i++) { - int32_t i4 = i << 2; + for (i = 0; i < T_TM3R_3W_256; i++) { + i4 = i << 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4 + T_TM3R_3W_64])); @@ -552,7 +553,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2])); } - for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { + for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { U0[i] = zero; V0[i] = zero; U1[i] = zero; @@ -566,12 +567,12 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3R_3W_256; i++) { + for (i = 0; i < T_TM3R_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } - for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { + for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { W2[i] = zero; W3[i] = zero; } @@ -584,7 +585,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W0[1] = U1[0]; W4[1] = V1[0]; - for (int32_t i = 1; i < T_TM3R_3W_256 + 1; i++) { + for (i = 1; i < T_TM3R_3W_256 + 1; i++) { W0[i + 1] = U1[i] ^ U2[i - 1]; W4[i + 1] = V1[i] ^ V2[i - 1]; } @@ -593,13 +594,13 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W4[T_TM3R_3W_256 + 1] = V2[T_TM3R_3W_256 - 1]; //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) { + for (i = 0; i < T_TM3R_3W_256 + 2; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) { + for (i = 0; i < T_TM3R_3W_256 + 2; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } @@ -607,7 +608,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //W3 = W3 * W2 ; W2 = W0 * W4 TOOM3Mult(tmp, (uint64_t *) W3, (uint64_t *) W2); - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W3[i] = tmp[i]; } @@ -621,25 +622,25 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) { 
+ for (i = 0; i < 2 * (T_TM3R_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { - int32_t i1 = i + 1; + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { + i1 = i + 1; W2[i] = W2[i1] ^ W0[i1]; } W2[2 * (T_TM3R_3W_256 + 2) - 1] = zero; //W2 =(W2 + W3 + W4*(x^3+1))/(x+1) - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i]; } @@ -647,15 +648,15 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { tmp[2 * (T_TM3R_3W_256 + 2) + 1] = zero; tmp[2 * (T_TM3R_3W_256 + 2) + 2] = zero; - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256); i++) { tmp[i + 3] ^= W4[i]; } divByXplus1_256(W2, tmp, T_TM3R_3W_256); //W3 =(W3 + W1)/(x*(x+1)) - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { - int32_t i1 = i + 1; + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { + i1 = i + 1; tmp[i] = W3[i1] ^ W1[i1]; } @@ -663,18 +664,18 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { divByXplus1_256(W3, tmp, T_TM3R_3W_256); //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256+2) - for (int32_t i = 0; i < T_TM3R_3W_256; i++) { + for (i = 0; i < T_TM3R_3W_256; i++) { ro256[i] = W0[i]; ro256[i + T_TM3R_3W_256] = W0[i + T_TM3R_3W_256] ^ W1[i]; ro256[i + 2 * T_TM3R_3W_256] = W1[i + T_TM3R_3W_256] ^ W2[i]; @@ -696,7 +697,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { ro256[3 + 5 * T_TM3R_3W_256] ^= W3[3 + 2 * T_TM3R_3W_256]; - for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { + for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } diff --git a/crypto_kem/hqc-256/avx2/vector.c b/crypto_kem/hqc-256/avx2/vector.c index 24c264d8..610fe47d 100644 --- a/crypto_kem/hqc-256/avx2/vector.c +++ b/crypto_kem/hqc-256/avx2/vector.c @@ -32,72 +32,63 @@ void PQCLEAN_HQC256_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; __m256i bit256[PARAM_OMEGA_R]; __m256i bloc256[PARAM_OMEGA_R]; - static __m256i posCmp256 = (__m256i) { - 0UL, 1UL, 2UL, 3UL - }; -#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256) - - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + __m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0); + uint64_t bloc, pos, bit64; + uint8_t inc; + size_t i, j; + + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + tmp[i] = ((uint32_t) rand_bytes[j++]) << 16; + tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8; + tmp[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (tmp[i] >= UTILS_REJECTION_THRESHOLD); - 
random_data = random_data % PARAM_N; + tmp[i] = tmp[i] % PARAM_N; + inc = 1; for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; + if (tmp[k] == tmp[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } + i += inc; } - for (uint32_t i = 0; i < weight; i++) { + for (i = 0; i < weight; i++) { // we store the bloc number and bit position of each vb[i] - uint64_t bloc = tmp[i] >> 6; + bloc = tmp[i] >> 6; bloc256[i] = _mm256_set1_epi64x(bloc >> 2); - uint64_t pos = (bloc & 0x3UL); + pos = (bloc & 0x3UL); __m256i pos256 = _mm256_set1_epi64x(pos); __m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256); - uint64_t bit64 = 1ULL << (tmp[i] & 0x3f); + bit64 = 1ULL << (tmp[i] & 0x3f); __m256i bloc256 = _mm256_set1_epi64x(bit64); bit256[i] = bloc256 & mask256; } - for (uint32_t i = 0; i < LOOP_SIZE; i++) { + for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) { __m256i aux = _mm256_loadu_si256(((__m256i *)v) + i); __m256i i256 = _mm256_set1_epi64x(i); - for (uint32_t j = 0; j < weight; j++) { + for (j = 0; j < weight; j++) { __m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256); aux ^= bit256[j] & mask256; } _mm256_storeu_si256(((__m256i *)v) + i, aux); } -#undef LOOP_SIZE } @@ -182,10 +173,9 @@ uint8_t PQCLEAN_HQC256_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v2, u * @param[in] size_v Integer that is the size of the input vector in bits */ void PQCLEAN_HQC256_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) { + uint64_t mask = 0x7FFFFFFFFFFFFFFF; + int8_t val = 0; if (size_o < size_v) { - uint64_t mask = 0x7FFFFFFFFFFFFFFF; - int8_t val = 0; - if (size_o % 64) { val = 64 - (size_o % 64); } diff --git a/crypto_kem/hqc-256/clean/bch.c b/crypto_kem/hqc-256/clean/bch.c index 4c8157fa..3a9aa68d 100644 --- a/crypto_kem/hqc-256/clean/bch.c +++ b/crypto_kem/hqc-256/clean/bch.c @@ -121,52 +121,54 @@ void PQCLEAN_HQC256_CLEAN_bch_code_encode(uint64_t *codeword, const uint64_t *me * @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes */ static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) { - sigma[0] = 1; - size_t deg_sigma = 0; - size_t deg_sigma_p = 0; uint16_t sigma_copy[PARAM_DELTA - 1] = {0}; - size_t deg_sigma_copy = 0; - uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1}; - int32_t pp = -1; // 2*rho - uint16_t d_p = 1; - uint16_t d = syndromes[0]; + uint16_t X_sigma_p[PARAM_DELTA + 1] = {0}; + uint16_t d_p, d, dd; + uint16_t mask; + int32_t pp; // 2*rho + size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p; + d = syndromes[0]; + sigma[0] = 1; + X_sigma_p[1] = 1; + deg_sigma = 0; + deg_sigma_p = 0; + d_p = 1; + pp = -1; for (size_t mu = 0; mu < PARAM_DELTA; ++mu) { // Save sigma in case we need it to update X_sigma_p memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1)); deg_sigma_copy = deg_sigma; - uint16_t dd = PQCLEAN_HQC256_CLEAN_gf_mul(d, PQCLEAN_HQC256_CLEAN_gf_inverse(d_p)); // 0 if(d == 0) + dd = PQCLEAN_HQC256_CLEAN_gf_mul(d, PQCLEAN_HQC256_CLEAN_gf_inverse(d_p)); // 0 if(d == 0) for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) { sigma[i] ^= PQCLEAN_HQC256_CLEAN_gf_mul(dd, X_sigma_p[i]); } - size_t deg_X = 2 * mu - pp; // 2*(mu-rho) - size_t deg_X_sigma_p = deg_X + deg_sigma_p; + deg_X_sigma_p = 2 * mu - pp + deg_sigma_p; - // mask1 = 0xffff if(d != 0) and 0 otherwise - int16_t mask1 = -((uint16_t) - d >> 15); + // mask = 0xffff if(d != 0) and 0 otherwise + mask = -((uint16_t) - d >> 15); - // mask2 = 0xffff if(deg_X_sigma_p > 
deg_sigma) and 0 otherwise - int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); + // mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise + mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15); - // mask12 = 0xffff if the deg_sigma increased and 0 otherwise - int16_t mask12 = mask1 & mask2; - deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma); + deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p); if (mu == PARAM_DELTA - 1) { break; } // Update pp, d_p and X_sigma_p if needed - pp = (mask12 & (2 * mu)) ^ (~mask12 & pp); - d_p = (mask12 & d) ^ (~mask12 & d_p); + pp ^= mask & (pp ^ (2 * mu)); + d_p ^= mask & (d_p ^ d); for (size_t i = PARAM_DELTA - 1; i; --i) { - X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]); + X_sigma_p[i + 1] = X_sigma_p[i - 1]; + X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]); } X_sigma_p[1] = 0; X_sigma_p[0] = 0; - deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p); + deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy); // Compute the next discrepancy d = syndromes[2 * mu + 2]; diff --git a/crypto_kem/hqc-256/clean/vector.c b/crypto_kem/hqc-256/clean/vector.c index 2eb148a3..2c27d01e 100644 --- a/crypto_kem/hqc-256/clean/vector.c +++ b/crypto_kem/hqc-256/clean/vector.c @@ -31,39 +31,33 @@ void PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; - uint8_t exist = 0; - size_t j = 0; + uint8_t inc; + size_t i, j; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + v[i] = ((uint32_t) rand_bytes[j++]) << 16; + v[i] |= ((uint32_t) rand_bytes[j++]) << 8; + v[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (v[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + v[i] = v[i] % PARAM_N; - for (uint32_t k = 0; k < i; k++) { - if (v[k] == random_data) { - exist = 1; + inc = 1; + for (size_t k = 0; k < i; k++) { + if (v[k] == v[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - v[i] = random_data; - } + i += inc; } } @@ -86,46 +80,11 @@ void PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_st * @param[in] ctx Pointer to the context of the seed expander */ void PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { - - size_t random_bytes_size = 3 * weight; - uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; - do { - if (j == random_bytes_size) { - seedexpander(ctx, rand_bytes, random_bytes_size); - j = 0; - } - - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; - - } while (random_data >= UTILS_REJECTION_THRESHOLD); - - random_data = random_data %
PARAM_N; - - for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; - } - } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } - } + PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight); - for (uint16_t i = 0; i < weight; ++i) { + for (size_t i = 0; i < weight; ++i) { int32_t index = tmp[i] / 64; int32_t pos = tmp[i] % 64; v[index] |= ((uint64_t) 1) << pos; diff --git a/crypto_kem/hqc-rmrs-128/avx2/gf2x.c b/crypto_kem/hqc-rmrs-128/avx2/gf2x.c index 0c8c91c9..96ce2c2a 100644 --- a/crypto_kem/hqc-rmrs-128/avx2/gf2x.c +++ b/crypto_kem/hqc-rmrs-128/avx2/gf2x.c @@ -188,22 +188,23 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; + int32_t i, is, is2, is3; karat_mult_4( D0, A, B); karat_mult_4(D2, A + 4, B + 4); - for (int32_t i = 0; i < 4; i++) { - int is = i + 4; + for (i = 0; i < 4; i++) { + is = i + 4; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_4(D1, SAA, SBB); - for (int32_t i = 0; i < 4; i++) { - int32_t is = i + 4; - int32_t is2 = is + 4; - int32_t is3 = is2 + 4; + for (i = 0; i < 4; i++) { + is = i + 4; + is2 = is + 4; + is3 = is2 + 4; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -227,22 +228,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; + int32_t i, is, is2, is3; karat_mult_8( D0, A, B); karat_mult_8(D2, A + 8, B + 8); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; + for (i = 0; i < 8; i++) { + is = i + 8; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_8( D1, SAA, SBB); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; - int32_t is2 = is + 8; - int32_t is3 = is2 + 8; + for (i = 0; i < 8; i++) { + is = i + 8; + is2 = is + 8; + is3 = is2 + 8; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -266,22 +268,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; + int32_t i, is, is2, is3; karat_mult_16( D0, A, B); karat_mult_16(D2, A + 16, B + 16); - for (int32_t i = 0; i < 16; i++) { - int is = i + 16; + for (i = 0; i < 16; i++) { + is = i + 16; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_16( D1, SAA, SBB); - for (int32_t i = 0; i < 16; i++) { - int32_t is = i + 16; - int32_t is2 = is + 16; - int32_t is3 = is2 + 16; + for (i = 0; i < 16; i++) { + is = i + 16; + is2 = is + 16; + is3 = is2 + 16; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -329,11 +332,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)]; const __m256i zero = _mm256_setzero_si256(); + int64_t *U1_64; + int64_t *U2_64; + int64_t *V1_64; + int64_t *V2_64; int32_t T2 = T_TM3_3W_64 << 1; + int32_t i, i4, i41, i42; - for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) { - int32_t i4 = i << 2; - int32_t i42 = i4 - 2; + for (i = 0; i < T_TM3_3W_256 - 1; i++) { + i4 = i << 2; + i42 = i4 - 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64])); @@ -342,9 +350,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const 
uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4])); } - for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { - int32_t i4 = i << 2; - int32_t i41 = i4 + 1; + for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; + i41 = i4 + 1; U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]); V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]); U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]); @@ -357,7 +365,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty) // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } @@ -366,23 +374,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { karat_mult_32( W1, W2, W3); //W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !) - int64_t *U1_64 = ((int64_t *) U1); - int64_t *U2_64 = ((int64_t *) U2); - - int64_t *V1_64 = ((int64_t *) V1); - int64_t *V2_64 = ((int64_t *) V2); - - W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); - W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); - U1_64 = ((int64_t *) U1); U2_64 = ((int64_t *) U2); V1_64 = ((int64_t *) V1); V2_64 = ((int64_t *) V2); - for (int32_t i = 1; i < T_TM3_3W_256; i++) { - int i4 = i << 2; + W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); + W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); + + for (i = 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1])); W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2])); @@ -391,13 +393,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { } //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } @@ -405,7 +407,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //W3 = W3 * W2 ; W2 = W0 * W4 karat_mult_32(tmp, W3, W2); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] = tmp[i]; } @@ -417,20 +419,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Interpolation phase // 9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x -> x = X^64 U1_64 = ((int64_t *) W2); U2_64 = ((int64_t *) W0); - for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) { - int32_t i4 = i << 2; + for (i = 0; i < (T_TM3_3W_256 << 1); i++) { + i4 = i << 2; W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1])); W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1])); } @@ -440,7 +442,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i *U1_256 = (__m256i *) (U1_64 + 1); tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0); - for (int32_t i = 1; i < 
(T_TM3_3W_256 << 1) - 1; i++) { + for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); } @@ -454,7 +456,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = (int64_t *) W1; __m256i *U2_256 = (__m256i *) (U2_64 + 1); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { + for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]); } @@ -462,19 +464,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W3[2 * (T_TM3_3W_256) - 1] = zero; //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256) - for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { ro256[i] = W0[i]; ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i]; ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i]; @@ -490,12 +492,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]); U2_256 = (__m256i *) (U2_64 - 2); - for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) { + for (i = 0; i < T_TM3_3W_256 << 1; i++) { _mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i])); _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i])); } - for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { + for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } diff --git a/crypto_kem/hqc-rmrs-128/avx2/reed_muller.c b/crypto_kem/hqc-rmrs-128/avx2/reed_muller.c index 0c987e1f..bfc6e713 100644 --- a/crypto_kem/hqc-rmrs-128/avx2/reed_muller.c +++ b/crypto_kem/hqc-rmrs-128/avx2/reed_muller.c @@ -231,7 +231,19 @@ inline void hadamard(__m256i *src, __m256i *dst) { inline uint32_t find_peaks(__m256i *transform) { // a whole lot of vector variables __m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows; - __m256i peak_mask; + __m256i tmp = _mm256_setzero_si256(); + __m256i vect_mask; + __m256i res; + int32_t lower; + int32_t width; + uint32_t message; + uint32_t mask; + int8_t index; + int8_t abs_value; + int8_t mask1; + int8_t mask2; + uint16_t result; + // compute absolute value of transform for (size_t i = 0; i < 8; i++) { abs_rows[i] = _mm256_abs_epi16(transform[i]); @@ -245,9 +257,9 @@ inline uint32_t find_peaks(__m256i *transform) { // do binary search for the highest value that is lower than the maximum // loop invariant: lower gives bit map = 0, lower + width gives bit map > 0 - int32_t lower = 1; + lower = 1; // this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6 - int32_t width = 1 << (5 + MULTIPLICITY / 2); + width = 1 << (5 + MULTIPLICITY / 2); // if you don't unroll this loop, it fits in the loop cache // uncomment the line below to speeding up the program by a few percent // #pragma GCC unroll 0 @@ -259,8 +271,9 @@ inline uint32_t find_peaks(__m256i *transform) { bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound); // step up if there are any matches // rely on compiler to use conditional move here - int32_t step_mask = _mm256_testz_si256(bitmap, bitmap) - 1; - lower += step_mask & width; + mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap); + mask 
= ~(uint32_t) ((-(int64_t) mask) >> 63); + lower += mask & width; } // lower+width contains the maximum value of the vector // or less, if the maximum is very high (which is OK) @@ -272,30 +285,26 @@ inline uint32_t find_peaks(__m256i *transform) { // find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message // find lowest value by searching backwards skip first check to save time - size_t message = 0x70; - for (int32_t i = 7; i >= 0; i--) { - bitmap = _mm256_cmpgt_epi16(abs_rows[i], bound); - int message_mask = (-(int16_t)(_mm256_testz_si256(bitmap, bitmap) == 0)) >> 15; - message ^= message_mask & (message ^ (unsigned)i << 4); + message = 0x70; + for (size_t i = 0; i < 8; i++) { + bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound); + mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap); + mask = ~(uint32_t) ((-(int64_t) mask) >> 63); + message ^= mask & (message ^ ((7 - i) << 4)); } // we decided which row of the matrix contains the lowest match // select proper row - int8_t index = message >> 4; - __m256i res; - __m256i tmp = (__m256i) { - 0ULL, 0ULL, 0ULL, 0ULL - }; + index = message >> 4; - for (int8_t i = 0; i < 8; i++) { - int8_t abs_value = (int8_t)(index - i); - int8_t mask1 = abs_value >> 7; + tmp = _mm256_setzero_si256(); + for (size_t i = 0; i < 8; i++) { + abs_value = (int8_t)(index - i); + mask1 = abs_value >> 7; abs_value ^= mask1; abs_value -= mask1; - int8_t mask2 = ((uint8_t) - abs_value >> 7); - int64_t mask3 = (-1ULL) + mask2; - __m256i vect_mask = (__m256i) { - mask3, mask3, mask3, mask3 - }; + mask2 = ((uint8_t) - abs_value >> 7); + mask = (-1ULL) + mask2; + vect_mask = _mm256_set1_epi32(mask); res = _mm256_and_si256(abs_rows[i], vect_mask); tmp = _mm256_or_si256(tmp, res); } @@ -305,34 +314,29 @@ inline uint32_t find_peaks(__m256i *transform) { // get the column number of the vector element // by setting the bits corresponding to the columns // and then adding elements within two groups of 8 - peak_mask = _mm256_cmpgt_epi16(active_row, bound); - peak_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1); - for (int32_t i = 0; i < 3; i++) { - peak_mask = _mm256_hadd_epi16(peak_mask, peak_mask); + vect_mask = _mm256_cmpgt_epi16(active_row, bound); + vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1); + for (size_t i = 0; i < 3; i++) { + vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask); } // add low 4 bits of message - message |= __tzcnt_u16(_mm256_extract_epi16(peak_mask, 0) + _mm256_extract_epi16(peak_mask, 8)); + message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8)); // set bit 7 if sign of biggest value is positive // make sure a jump isn't generated by the compiler - tmp = (__m256i) { - 0ULL, 0ULL, 0ULL, 0ULL - }; - for (uint32_t i = 0; i < 8; i++) { - int64_t message_mask = (-(int64_t)(i == message / 16)) >> 63; - __m256i vect_mask = (__m256i) { - message_mask, message_mask, message_mask, message_mask - }; + tmp = _mm256_setzero_si256(); + for (size_t i = 0; i < 8; i++) { + mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63); + __m256i vect_mask = _mm256_set1_epi32(mask); tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i])); } - uint16_t result = 0; - for (uint32_t i = 0; i < 16; i++) { - uint16_t *ptr = (uint16_t *) &tmp; - int32_t message_mask = (-(int32_t)(i == message % 16)) >> (sizeof(int32_t) * 8 - 1); - result |= message_mask & ptr[i]; + result = 0; + for 
(size_t i = 0; i < 16; i++) { + mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63); + result |= mask & ((uint16_t *)&tmp)[i]; } message |= (0x8000 & ~result) >> 8; - return (uint32_t) message; + return message; } diff --git a/crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c b/crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c index b80b36dc..e45f6010 100644 --- a/crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c +++ b/crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c @@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons uint16_t beta_j[PARAM_DELTA] = {0}; uint16_t e_j[PARAM_DELTA] = {0}; - uint16_t delta_counter = 0; + uint16_t delta_counter; uint16_t delta_real_value; + uint16_t found; + uint16_t mask1; + uint16_t mask2; + uint16_t tmp1; + uint16_t tmp2; + uint16_t inverse; + uint16_t inverse_power_j; // Compute the beta_{j_i} page 31 of the documentation + delta_counter = 0; for (size_t i = 0; i < PARAM_N1; i++) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 - for (uint16_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - beta_j[j] += indexmask & valuemask & gf_exp[i]; - found += indexmask & valuemask & 1; + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + for (size_t j = 0; j < PARAM_DELTA; j++) { + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + beta_j[j] += mask1 & mask2 & gf_exp[i]; + found += mask1 & mask2 & 1; } delta_counter += found; } @@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons // Compute the e_{j_i} page 31 of the documentation for (size_t i = 0; i < PARAM_DELTA; ++i) { - uint16_t tmp1 = 1; - uint16_t tmp2 = 1; - uint16_t inverse = PQCLEAN_HQCRMRS128_AVX2_gf_inverse(beta_j[i]); - uint16_t inverse_power_j = 1; + tmp1 = 1; + tmp2 = 1; + inverse = PQCLEAN_HQCRMRS128_AVX2_gf_inverse(beta_j[i]); + inverse_power_j = 1; for (size_t j = 1; j <= PARAM_DELTA; ++j) { inverse_power_j = PQCLEAN_HQCRMRS128_AVX2_gf_mul(inverse_power_j, inverse); @@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons for (size_t k = 1; k < PARAM_DELTA; ++k) { tmp2 = PQCLEAN_HQCRMRS128_AVX2_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA]))); } - uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value - e_j[i] = mask & PQCLEAN_HQCRMRS128_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS128_AVX2_gf_inverse(tmp2)); + mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value + e_j[i] = mask1 & PQCLEAN_HQCRMRS128_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS128_AVX2_gf_inverse(tmp2)); } // Place the delta e_{j_i} values at the right coordinates of the output vector delta_counter = 0; for (size_t i = 0; i < PARAM_N1; ++i) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 for (size_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - error_values[i] += indexmask & valuemask & e_j[j]; - found += indexmask & valuemask & 1; + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + error_values[i] += mask1 & mask2 & e_j[j]; + found += mask1 & mask2 & 1; } 
delta_counter += found; } diff --git a/crypto_kem/hqc-rmrs-128/avx2/vector.c b/crypto_kem/hqc-rmrs-128/avx2/vector.c index d60431b4..0ac5069e 100644 --- a/crypto_kem/hqc-rmrs-128/avx2/vector.c +++ b/crypto_kem/hqc-rmrs-128/avx2/vector.c @@ -32,72 +32,63 @@ void PQCLEAN_HQCRMRS128_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; __m256i bit256[PARAM_OMEGA_R]; __m256i bloc256[PARAM_OMEGA_R]; - static __m256i posCmp256 = (__m256i) { - 0UL, 1UL, 2UL, 3UL - }; -#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256) - - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + __m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0); + uint64_t bloc, pos, bit64; + uint8_t inc; + size_t i, j; + + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + tmp[i] = ((uint32_t) rand_bytes[j++]) << 16; + tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8; + tmp[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (tmp[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + tmp[i] = tmp[i] % PARAM_N; + inc = 1; for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; + if (tmp[k] == tmp[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } + i += inc; } - for (uint32_t i = 0; i < weight; i++) { + for (i = 0; i < weight; i++) { // we store the bloc number and bit position of each vb[i] - uint64_t bloc = tmp[i] >> 6; + bloc = tmp[i] >> 6; bloc256[i] = _mm256_set1_epi64x(bloc >> 2); - uint64_t pos = (bloc & 0x3UL); + pos = (bloc & 0x3UL); __m256i pos256 = _mm256_set1_epi64x(pos); __m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256); - uint64_t bit64 = 1ULL << (tmp[i] & 0x3f); + bit64 = 1ULL << (tmp[i] & 0x3f); __m256i bloc256 = _mm256_set1_epi64x(bit64); bit256[i] = bloc256 & mask256; } - for (uint32_t i = 0; i < LOOP_SIZE; i++) { + for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) { __m256i aux = _mm256_loadu_si256(((__m256i *)v) + i); __m256i i256 = _mm256_set1_epi64x(i); - for (uint32_t j = 0; j < weight; j++) { + for (j = 0; j < weight; j++) { __m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256); aux ^= bit256[j] & mask256; } _mm256_storeu_si256(((__m256i *)v) + i, aux); } -#undef LOOP_SIZE } @@ -167,10 +158,9 @@ uint8_t PQCLEAN_HQCRMRS128_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v * @param[in] size_v Integer that is the size of the input vector in bits */ void PQCLEAN_HQCRMRS128_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) { + uint64_t mask = 0x7FFFFFFFFFFFFFFF; + int8_t val = 0; if (size_o < size_v) { - uint64_t mask = 0x7FFFFFFFFFFFFFFF; - int8_t val = 0; - if (size_o % 64) { val = 64 - (size_o % 64); } diff --git a/crypto_kem/hqc-rmrs-128/clean/reed_solomon.c b/crypto_kem/hqc-rmrs-128/clean/reed_solomon.c index e0212bdc..0f111b16 100644 --- a/crypto_kem/hqc-rmrs-128/clean/reed_solomon.c +++ b/crypto_kem/hqc-rmrs-128/clean/reed_solomon.c @@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons uint16_t beta_j[PARAM_DELTA] = 
{0}; uint16_t e_j[PARAM_DELTA] = {0}; - uint16_t delta_counter = 0; + uint16_t delta_counter; uint16_t delta_real_value; + uint16_t found; + uint16_t mask1; + uint16_t mask2; + uint16_t tmp1; + uint16_t tmp2; + uint16_t inverse; + uint16_t inverse_power_j; // Compute the beta_{j_i} page 31 of the documentation + delta_counter = 0; for (size_t i = 0; i < PARAM_N1; i++) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 - for (uint16_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - beta_j[j] += indexmask & valuemask & gf_exp[i]; - found += indexmask & valuemask & 1; + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + for (size_t j = 0; j < PARAM_DELTA; j++) { + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + beta_j[j] += mask1 & mask2 & gf_exp[i]; + found += mask1 & mask2 & 1; } delta_counter += found; } @@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons // Compute the e_{j_i} page 31 of the documentation for (size_t i = 0; i < PARAM_DELTA; ++i) { - uint16_t tmp1 = 1; - uint16_t tmp2 = 1; - uint16_t inverse = PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(beta_j[i]); - uint16_t inverse_power_j = 1; + tmp1 = 1; + tmp2 = 1; + inverse = PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(beta_j[i]); + inverse_power_j = 1; for (size_t j = 1; j <= PARAM_DELTA; ++j) { inverse_power_j = PQCLEAN_HQCRMRS128_CLEAN_gf_mul(inverse_power_j, inverse); @@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons for (size_t k = 1; k < PARAM_DELTA; ++k) { tmp2 = PQCLEAN_HQCRMRS128_CLEAN_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS128_CLEAN_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA]))); } - uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value - e_j[i] = mask & PQCLEAN_HQCRMRS128_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(tmp2)); + mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value + e_j[i] = mask1 & PQCLEAN_HQCRMRS128_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(tmp2)); } // Place the delta e_{j_i} values at the right coordinates of the output vector delta_counter = 0; for (size_t i = 0; i < PARAM_N1; ++i) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 for (size_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - error_values[i] += indexmask & valuemask & e_j[j]; - found += indexmask & valuemask & 1; + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + error_values[i] += mask1 & mask2 & e_j[j]; + found += mask1 & mask2 & 1; } delta_counter += found; } diff --git a/crypto_kem/hqc-rmrs-128/clean/vector.c b/crypto_kem/hqc-rmrs-128/clean/vector.c index 35a19120..8d4485c8 100644 --- a/crypto_kem/hqc-rmrs-128/clean/vector.c +++ b/crypto_kem/hqc-rmrs-128/clean/vector.c @@ -31,39 +31,33 @@ void PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; - uint8_t exist = 0; - size_t j = 0; + 
uint8_t inc; + size_t i, j; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + v[i] = ((uint32_t) rand_bytes[j++]) << 16; + v[i] |= ((uint32_t) rand_bytes[j++]) << 8; + v[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (v[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + v[i] = v[i] % PARAM_N; - for (uint32_t k = 0; k < i; k++) { - if (v[k] == random_data) { - exist = 1; + inc = 1; + for (size_t k = 0; k < i; k++) { + if (v[k] == v[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - v[i] = random_data; - } + i += inc; } } @@ -86,46 +80,11 @@ void PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XO * @param[in] ctx Pointer to the context of the seed expander */ void PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { - - size_t random_bytes_size = 3 * weight; - uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; - do { - if (j == random_bytes_size) { - seedexpander(ctx, rand_bytes, random_bytes_size); - j = 0; - } - - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; - - } while (random_data >= UTILS_REJECTION_THRESHOLD); - - random_data = random_data % PARAM_N; - - for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; - } - } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } - } + PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight); - for (uint16_t i = 0; i < weight; ++i) { + for (size_t i = 0; i < weight; ++i) { int32_t index = tmp[i] / 64; int32_t pos = tmp[i] % 64; v[index] |= ((uint64_t) 1) << pos; diff --git a/crypto_kem/hqc-rmrs-192/avx2/gf2x.c b/crypto_kem/hqc-rmrs-192/avx2/gf2x.c index e8c6f85c..cb822ded 100644 --- a/crypto_kem/hqc-rmrs-192/avx2/gf2x.c +++ b/crypto_kem/hqc-rmrs-192/avx2/gf2x.c @@ -188,23 +188,24 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) { * @param[in] B Pointer to the polynomial B(x) */ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { + int32_t i, is, is2, is3; __m256i D0[8], D1[8], D2[8], SAA[4], SBB[4]; karat_mult_4( D0, A, B); karat_mult_4(D2, A + 4, B + 4); - for (int32_t i = 0; i < 4; i++) { - int is = i + 4; + for (i = 0; i < 4; i++) { + is = i + 4; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_4(D1, SAA, SBB); - for (int32_t i = 0; i < 4; i++) { - int32_t is = i + 4; - int32_t is2 = is + 4; - int32_t is3 = is2 + 4; + for (i = 0; i < 4; i++) { + is = i + 4; + is2 = is + 4; + is3 = is2 + 4; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -228,22 +229,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; + int32_t i, is, is2, is3; karat_mult_8( D0, A, B); 
karat_mult_8(D2, A + 8, B + 8); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; + for (i = 0; i < 8; i++) { + is = i + 8; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_8( D1, SAA, SBB); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; - int32_t is2 = is + 8; - int32_t is3 = is2 + 8; + for (i = 0; i < 8; i++) { + is = i + 8; + is2 = is + 8; + is3 = is2 + 8; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -267,22 +269,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; + int32_t i, is, is2, is3; karat_mult_16( D0, A, B); karat_mult_16(D2, A + 16, B + 16); - for (int32_t i = 0; i < 16; i++) { - int is = i + 16; + for (i = 0; i < 16; i++) { + is = i + 16; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_16( D1, SAA, SBB); - for (int32_t i = 0; i < 16; i++) { - int32_t is = i + 16; - int32_t is2 = is + 16; - int32_t is3 = is2 + 16; + for (i = 0; i < 16; i++) { + is = i + 16; + is2 = is + 16; + is3 = is2 + 16; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -306,21 +309,22 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[64], D1[64], D2[64], SAA[32], SBB[32]; + int32_t i, is, is2, is3; karat_mult_32( D0, A, B); karat_mult_32(D2, A + 32, B + 32); - for (int32_t i = 0; i < 32; i++) { - int32_t is = i + 32; + for (i = 0; i < 32; i++) { + is = i + 32; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_32( D1, SAA, SBB); - for (int32_t i = 0; i < 32; i++) { - int32_t is = i + 32; - int32_t is2 = is + 32; - int32_t is3 = is2 + 32; + for (i = 0; i < 32; i++) { + is = i + 32; + is2 = is + 32; + is3 = is2 + 32; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -369,11 +373,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)]; const __m256i zero = _mm256_setzero_si256(); + int64_t *U1_64; + int64_t *U2_64; + int64_t *V1_64; + int64_t *V2_64; int32_t T2 = T_TM3_3W_64 << 1; + int32_t i, i4, i41, i42; - for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) { - int32_t i4 = i << 2; - int32_t i42 = i4 - 2; + for (i = 0; i < T_TM3_3W_256 - 1; i++) { + i4 = i << 2; + i42 = i4 - 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64])); @@ -382,9 +391,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4])); } - for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { - int32_t i4 = i << 2; - int32_t i41 = i4 + 1; + for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; + i41 = i4 + 1; U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]); V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]); U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]); @@ -397,7 +406,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty) // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } @@ -406,23 +415,17 @@ static 
void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { karat_mult_64( W1, W2, W3); //W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !) - int64_t *U1_64 = ((int64_t *) U1); - int64_t *U2_64 = ((int64_t *) U2); - - int64_t *V1_64 = ((int64_t *) V1); - int64_t *V2_64 = ((int64_t *) V2); - - W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); - W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); - U1_64 = ((int64_t *) U1); U2_64 = ((int64_t *) U2); V1_64 = ((int64_t *) V1); V2_64 = ((int64_t *) V2); - for (int32_t i = 1; i < T_TM3_3W_256; i++) { - int i4 = i << 2; + W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); + W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); + + for (i = 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1])); W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2])); @@ -431,21 +434,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { } //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } - karat_mult_64(tmp, W3, W2); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] = tmp[i]; } @@ -457,20 +459,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Interpolation phase // 9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x -> x = X^64 U1_64 = ((int64_t *) W2); U2_64 = ((int64_t *) W0); - for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) { - int32_t i4 = i << 2; + for (i = 0; i < (T_TM3_3W_256 << 1); i++) { + i4 = i << 2; W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1])); W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1])); } @@ -480,7 +482,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i *U1_256 = (__m256i *) (U1_64 + 1); tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0); - for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); } @@ -494,7 +496,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = (int64_t *) W1; __m256i *U2_256 = (__m256i *) (U2_64 + 1); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { + for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]); } @@ -502,19 +504,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W3[2 * (T_TM3_3W_256) - 1] = zero; //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 
2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256) - for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { ro256[i] = W0[i]; ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i]; ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i]; @@ -530,12 +532,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]); U2_256 = (__m256i *) (U2_64 - 2); - for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) { + for (i = 0; i < T_TM3_3W_256 << 1; i++) { _mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i])); _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i])); } - for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { + for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } diff --git a/crypto_kem/hqc-rmrs-192/avx2/reed_muller.c b/crypto_kem/hqc-rmrs-192/avx2/reed_muller.c index 439cc62f..5a40f3d0 100644 --- a/crypto_kem/hqc-rmrs-192/avx2/reed_muller.c +++ b/crypto_kem/hqc-rmrs-192/avx2/reed_muller.c @@ -231,7 +231,19 @@ inline void hadamard(__m256i *src, __m256i *dst) { inline uint32_t find_peaks(__m256i *transform) { // a whole lot of vector variables __m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows; - __m256i peak_mask; + __m256i tmp = _mm256_setzero_si256(); + __m256i vect_mask; + __m256i res; + int32_t lower; + int32_t width; + uint32_t message; + uint32_t mask; + int8_t index; + int8_t abs_value; + int8_t mask1; + int8_t mask2; + uint16_t result; + // compute absolute value of transform for (size_t i = 0; i < 8; i++) { abs_rows[i] = _mm256_abs_epi16(transform[i]); @@ -245,9 +257,9 @@ inline uint32_t find_peaks(__m256i *transform) { // do binary search for the highest value that is lower than the maximum // loop invariant: lower gives bit map = 0, lower + width gives bit map > 0 - int32_t lower = 1; + lower = 1; // this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6 - int32_t width = 1 << (5 + MULTIPLICITY / 2); + width = 1 << (5 + MULTIPLICITY / 2); // if you don't unroll this loop, it fits in the loop cache // uncomment the line below to speeding up the program by a few percent // #pragma GCC unroll 0 @@ -259,8 +271,9 @@ inline uint32_t find_peaks(__m256i *transform) { bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound); // step up if there are any matches // rely on compiler to use conditional move here - int32_t step_mask = _mm256_testz_si256(bitmap, bitmap) - 1; - lower += step_mask & width; + mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap); + mask = ~(uint32_t) ((-(int64_t) mask) >> 63); + lower += mask & width; } // lower+width contains the maximum value of the vector // or less, if the maximum is very high (which is OK) @@ -272,30 +285,26 @@ inline uint32_t find_peaks(__m256i *transform) { // find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message // find lowest value by searching backwards skip first check to save time - size_t message = 0x70; - for (int32_t i = 7; i >= 0; i--) { - bitmap = _mm256_cmpgt_epi16(abs_rows[i], bound); - int message_mask = (-(int16_t)(_mm256_testz_si256(bitmap, bitmap) == 0)) >> 15; - message ^= message_mask & (message ^ (unsigned)i << 4); + message = 0x70; + for (size_t i = 0; i < 8; i++) { + bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound); + mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap); + mask = ~(uint32_t) ((-(int64_t) mask) >> 63); + message ^= mask & (message ^ ((7 - i) << 4)); } // we decided which row of the matrix contains the lowest 
match // select proper row - int8_t index = message >> 4; - __m256i res; - __m256i tmp = (__m256i) { - 0ULL, 0ULL, 0ULL, 0ULL - }; + index = message >> 4; - for (int8_t i = 0; i < 8; i++) { - int8_t abs_value = (int8_t)(index - i); - int8_t mask1 = abs_value >> 7; + tmp = _mm256_setzero_si256(); + for (size_t i = 0; i < 8; i++) { + abs_value = (int8_t)(index - i); + mask1 = abs_value >> 7; abs_value ^= mask1; abs_value -= mask1; - int8_t mask2 = ((uint8_t) - abs_value >> 7); - int64_t mask3 = (-1ULL) + mask2; - __m256i vect_mask = (__m256i) { - mask3, mask3, mask3, mask3 - }; + mask2 = ((uint8_t) - abs_value >> 7); + mask = (-1ULL) + mask2; + vect_mask = _mm256_set1_epi32(mask); res = _mm256_and_si256(abs_rows[i], vect_mask); tmp = _mm256_or_si256(tmp, res); } @@ -305,34 +314,29 @@ inline uint32_t find_peaks(__m256i *transform) { // get the column number of the vector element // by setting the bits corresponding to the columns // and then adding elements within two groups of 8 - peak_mask = _mm256_cmpgt_epi16(active_row, bound); - peak_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1); - for (int32_t i = 0; i < 3; i++) { - peak_mask = _mm256_hadd_epi16(peak_mask, peak_mask); + vect_mask = _mm256_cmpgt_epi16(active_row, bound); + vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1); + for (size_t i = 0; i < 3; i++) { + vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask); } // add low 4 bits of message - message |= __tzcnt_u16(_mm256_extract_epi16(peak_mask, 0) + _mm256_extract_epi16(peak_mask, 8)); + message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8)); // set bit 7 if sign of biggest value is positive // make sure a jump isn't generated by the compiler - tmp = (__m256i) { - 0ULL, 0ULL, 0ULL, 0ULL - }; - for (uint32_t i = 0; i < 8; i++) { - int64_t message_mask = (-(int64_t)(i == message / 16)) >> 63; - __m256i vect_mask = (__m256i) { - message_mask, message_mask, message_mask, message_mask - }; + tmp = _mm256_setzero_si256(); + for (size_t i = 0; i < 8; i++) { + mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63); + __m256i vect_mask = _mm256_set1_epi32(mask); tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i])); } - uint16_t result = 0; - for (uint32_t i = 0; i < 16; i++) { - uint16_t *ptr = (uint16_t *) &tmp; - int32_t message_mask = (-(int32_t)(i == message % 16)) >> (sizeof(int32_t) * 8 - 1); - result |= message_mask & ptr[i]; + result = 0; + for (size_t i = 0; i < 16; i++) { + mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63); + result |= mask & ((uint16_t *)&tmp)[i]; } message |= (0x8000 & ~result) >> 8; - return (uint32_t) message; + return message; } diff --git a/crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c b/crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c index f455f447..6df78622 100644 --- a/crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c +++ b/crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c @@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons uint16_t beta_j[PARAM_DELTA] = {0}; uint16_t e_j[PARAM_DELTA] = {0}; - uint16_t delta_counter = 0; + uint16_t delta_counter; uint16_t delta_real_value; + uint16_t found; + uint16_t mask1; + uint16_t mask2; + uint16_t tmp1; + uint16_t tmp2; + uint16_t inverse; + uint16_t inverse_power_j; // Compute the beta_{j_i} page 31 of the documentation + delta_counter = 0; for (size_t i = 0; i < PARAM_N1; i++) { - uint16_t found = 0; - 
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 - for (uint16_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - beta_j[j] += indexmask & valuemask & gf_exp[i]; - found += indexmask & valuemask & 1; + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + for (size_t j = 0; j < PARAM_DELTA; j++) { + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + beta_j[j] += mask1 & mask2 & gf_exp[i]; + found += mask1 & mask2 & 1; } delta_counter += found; } @@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons // Compute the e_{j_i} page 31 of the documentation for (size_t i = 0; i < PARAM_DELTA; ++i) { - uint16_t tmp1 = 1; - uint16_t tmp2 = 1; - uint16_t inverse = PQCLEAN_HQCRMRS192_AVX2_gf_inverse(beta_j[i]); - uint16_t inverse_power_j = 1; + tmp1 = 1; + tmp2 = 1; + inverse = PQCLEAN_HQCRMRS192_AVX2_gf_inverse(beta_j[i]); + inverse_power_j = 1; for (size_t j = 1; j <= PARAM_DELTA; ++j) { inverse_power_j = PQCLEAN_HQCRMRS192_AVX2_gf_mul(inverse_power_j, inverse); @@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons for (size_t k = 1; k < PARAM_DELTA; ++k) { tmp2 = PQCLEAN_HQCRMRS192_AVX2_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA]))); } - uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value - e_j[i] = mask & PQCLEAN_HQCRMRS192_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS192_AVX2_gf_inverse(tmp2)); + mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value + e_j[i] = mask1 & PQCLEAN_HQCRMRS192_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS192_AVX2_gf_inverse(tmp2)); } // Place the delta e_{j_i} values at the right coordinates of the output vector delta_counter = 0; for (size_t i = 0; i < PARAM_N1; ++i) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 for (size_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - error_values[i] += indexmask & valuemask & e_j[j]; - found += indexmask & valuemask & 1; + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + error_values[i] += mask1 & mask2 & e_j[j]; + found += mask1 & mask2 & 1; } delta_counter += found; } diff --git a/crypto_kem/hqc-rmrs-192/avx2/vector.c b/crypto_kem/hqc-rmrs-192/avx2/vector.c index 69671d47..51d444ae 100644 --- a/crypto_kem/hqc-rmrs-192/avx2/vector.c +++ b/crypto_kem/hqc-rmrs-192/avx2/vector.c @@ -32,72 +32,63 @@ void PQCLEAN_HQCRMRS192_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; __m256i bit256[PARAM_OMEGA_R]; __m256i bloc256[PARAM_OMEGA_R]; - static __m256i posCmp256 = (__m256i) { - 0UL, 1UL, 2UL, 3UL - }; -#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256) - - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + __m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0); + uint64_t bloc, pos, bit64; + uint8_t inc; + size_t i, j; + + i = 0; + j = random_bytes_size; + 
while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + tmp[i] = ((uint32_t) rand_bytes[j++]) << 16; + tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8; + tmp[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (tmp[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + tmp[i] = tmp[i] % PARAM_N; + inc = 1; for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; + if (tmp[k] == tmp[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } + i += inc; } - for (uint32_t i = 0; i < weight; i++) { + for (i = 0; i < weight; i++) { // we store the bloc number and bit position of each vb[i] - uint64_t bloc = tmp[i] >> 6; + bloc = tmp[i] >> 6; bloc256[i] = _mm256_set1_epi64x(bloc >> 2); - uint64_t pos = (bloc & 0x3UL); + pos = (bloc & 0x3UL); __m256i pos256 = _mm256_set1_epi64x(pos); __m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256); - uint64_t bit64 = 1ULL << (tmp[i] & 0x3f); + bit64 = 1ULL << (tmp[i] & 0x3f); __m256i bloc256 = _mm256_set1_epi64x(bit64); bit256[i] = bloc256 & mask256; } - for (uint32_t i = 0; i < LOOP_SIZE; i++) { + for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) { __m256i aux = _mm256_loadu_si256(((__m256i *)v) + i); __m256i i256 = _mm256_set1_epi64x(i); - for (uint32_t j = 0; j < weight; j++) { + for (j = 0; j < weight; j++) { __m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256); aux ^= bit256[j] & mask256; } _mm256_storeu_si256(((__m256i *)v) + i, aux); } -#undef LOOP_SIZE } @@ -167,10 +158,9 @@ uint8_t PQCLEAN_HQCRMRS192_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v * @param[in] size_v Integer that is the size of the input vector in bits */ void PQCLEAN_HQCRMRS192_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) { + uint64_t mask = 0x7FFFFFFFFFFFFFFF; + int8_t val = 0; if (size_o < size_v) { - uint64_t mask = 0x7FFFFFFFFFFFFFFF; - int8_t val = 0; - if (size_o % 64) { val = 64 - (size_o % 64); } diff --git a/crypto_kem/hqc-rmrs-192/clean/reed_solomon.c b/crypto_kem/hqc-rmrs-192/clean/reed_solomon.c index c8365024..8d56467e 100644 --- a/crypto_kem/hqc-rmrs-192/clean/reed_solomon.c +++ b/crypto_kem/hqc-rmrs-192/clean/reed_solomon.c @@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons uint16_t beta_j[PARAM_DELTA] = {0}; uint16_t e_j[PARAM_DELTA] = {0}; - uint16_t delta_counter = 0; + uint16_t delta_counter; uint16_t delta_real_value; + uint16_t found; + uint16_t mask1; + uint16_t mask2; + uint16_t tmp1; + uint16_t tmp2; + uint16_t inverse; + uint16_t inverse_power_j; // Compute the beta_{j_i} page 31 of the documentation + delta_counter = 0; for (size_t i = 0; i < PARAM_N1; i++) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 - for (uint16_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - beta_j[j] += indexmask & valuemask & gf_exp[i]; - found += indexmask & valuemask & 1; + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + for (size_t j = 0; j < PARAM_DELTA; j++) { + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + beta_j[j] += mask1 & mask2 & gf_exp[i]; + found += mask1 & mask2 
& 1; } delta_counter += found; } @@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons // Compute the e_{j_i} page 31 of the documentation for (size_t i = 0; i < PARAM_DELTA; ++i) { - uint16_t tmp1 = 1; - uint16_t tmp2 = 1; - uint16_t inverse = PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(beta_j[i]); - uint16_t inverse_power_j = 1; + tmp1 = 1; + tmp2 = 1; + inverse = PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(beta_j[i]); + inverse_power_j = 1; for (size_t j = 1; j <= PARAM_DELTA; ++j) { inverse_power_j = PQCLEAN_HQCRMRS192_CLEAN_gf_mul(inverse_power_j, inverse); @@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons for (size_t k = 1; k < PARAM_DELTA; ++k) { tmp2 = PQCLEAN_HQCRMRS192_CLEAN_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS192_CLEAN_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA]))); } - uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value - e_j[i] = mask & PQCLEAN_HQCRMRS192_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(tmp2)); + mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value + e_j[i] = mask1 & PQCLEAN_HQCRMRS192_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(tmp2)); } // Place the delta e_{j_i} values at the right coordinates of the output vector delta_counter = 0; for (size_t i = 0; i < PARAM_N1; ++i) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 for (size_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - error_values[i] += indexmask & valuemask & e_j[j]; - found += indexmask & valuemask & 1; + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + error_values[i] += mask1 & mask2 & e_j[j]; + found += mask1 & mask2 & 1; } delta_counter += found; } diff --git a/crypto_kem/hqc-rmrs-192/clean/vector.c b/crypto_kem/hqc-rmrs-192/clean/vector.c index a9888879..6f9949c7 100644 --- a/crypto_kem/hqc-rmrs-192/clean/vector.c +++ b/crypto_kem/hqc-rmrs-192/clean/vector.c @@ -31,39 +31,33 @@ void PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; - uint8_t exist = 0; - size_t j = 0; + uint8_t inc; + size_t i, j; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + v[i] = ((uint32_t) rand_bytes[j++]) << 16; + v[i] |= ((uint32_t) rand_bytes[j++]) << 8; + v[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (v[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + v[i] = v[i] % PARAM_N; - for (uint32_t k = 0; k < i; k++) { - if (v[k] == random_data) { - exist = 1; + inc = 1; + for (size_t k = 0; k < i; k++) { + if (v[k] == v[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - v[i] = random_data; - } + i += inc; } } @@ -86,46 +80,11 @@ void 
PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XO * @param[in] ctx Pointer to the context of the seed expander */ void PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { - - size_t random_bytes_size = 3 * weight; - uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; - do { - if (j == random_bytes_size) { - seedexpander(ctx, rand_bytes, random_bytes_size); - j = 0; - } - - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; - - } while (random_data >= UTILS_REJECTION_THRESHOLD); - - random_data = random_data % PARAM_N; - - for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; - } - } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } - } + PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight); - for (uint16_t i = 0; i < weight; ++i) { + for (size_t i = 0; i < weight; ++i) { int32_t index = tmp[i] / 64; int32_t pos = tmp[i] % 64; v[index] |= ((uint64_t) 1) << pos; diff --git a/crypto_kem/hqc-rmrs-256/avx2/gf2x.c b/crypto_kem/hqc-rmrs-256/avx2/gf2x.c index f2dd26f8..294561c8 100644 --- a/crypto_kem/hqc-rmrs-256/avx2/gf2x.c +++ b/crypto_kem/hqc-rmrs-256/avx2/gf2x.c @@ -232,23 +232,24 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) { * @param[in] B Pointer to the polynomial B(x) */ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { + int32_t i, is, is2, is3; __m256i D0[16], D1[16], D2[16], SAA[8], SBB[8]; karat_mult_8( D0, A, B); karat_mult_8(D2, A + 8, B + 8); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; + for (i = 0; i < 8; i++) { + is = i + 8; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_8( D1, SAA, SBB); - for (int32_t i = 0; i < 8; i++) { - int32_t is = i + 8; - int32_t is2 = is + 8; - int32_t is3 = is2 + 8; + for (i = 0; i < 8; i++) { + is = i + 8; + is2 = is + 8; + is3 = is2 + 8; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -272,22 +273,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) { */ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { __m256i D0[32], D1[32], D2[32], SAA[16], SBB[16]; + int32_t i, is, is2, is3; karat_mult_16( D0, A, B); karat_mult_16(D2, A + 16, B + 16); - for (int32_t i = 0; i < 16; i++) { - int is = i + 16; + for (i = 0; i < 16; i++) { + is = i + 16; SAA[i] = A[i] ^ A[is]; SBB[i] = B[i] ^ B[is]; } karat_mult_16( D1, SAA, SBB); - for (int32_t i = 0; i < 16; i++) { - int32_t is = i + 16; - int32_t is2 = is + 16; - int32_t is3 = is2 + 16; + for (i = 0; i < 16; i++) { + is = i + 16; + is2 = is + 16; + is3 = is2 + 16; __m256i middle = _mm256_xor_si256(D0[is], D2[i]); @@ -299,7 +301,6 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) { } - /** * @brief Compute B(x) = A(x)/(x+1) * @@ -336,11 +337,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { static __m256i tmp[2 * (T_TM3_3W_256)]; static __m256i ro256[6 * (T_TM3_3W_256)]; const __m256i zero = _mm256_setzero_si256(); + int64_t *U1_64; + int64_t *U2_64; + int64_t *V1_64; + int64_t *V2_64; int32_t T2 = T_TM3_3W_64 << 1; + int32_t i, i4, i41, i42; - for (int32_t i = 0; 
i < T_TM3_3W_256 - 1; i++) { - int32_t i4 = i << 2; - int32_t i42 = i4 - 2; + for (i = 0; i < T_TM3_3W_256 - 1; i++) { + i4 = i << 2; + i42 = i4 - 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64])); @@ -349,9 +355,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4])); } - for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { - int32_t i4 = i << 2; - int32_t i41 = i4 + 1; + for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; + i41 = i4 + 1; U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]); V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]); U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]); @@ -364,7 +370,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty) // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } @@ -373,23 +379,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { karat_mult_32( W1, W2, W3); //W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !) - int64_t *U1_64 = ((int64_t *) U1); - int64_t *U2_64 = ((int64_t *) U2); - - int64_t *V1_64 = ((int64_t *) V1); - int64_t *V2_64 = ((int64_t *) V2); - - W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); - W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); - U1_64 = ((int64_t *) U1); U2_64 = ((int64_t *) U2); V1_64 = ((int64_t *) V1); V2_64 = ((int64_t *) V2); - for (int32_t i = 1; i < T_TM3_3W_256; i++) { - int i4 = i << 2; + W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0); + W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0); + + for (i = 1; i < T_TM3_3W_256; i++) { + i4 = i << 2; W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1])); W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2])); @@ -398,13 +398,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { } //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3_3W_256; i++) { + for (i = 0; i < T_TM3_3W_256; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } @@ -412,7 +412,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //W3 = W3 * W2 ; W2 = W0 * W4 karat_mult_32(tmp, W3, W2); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] = tmp[i]; } @@ -424,20 +424,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Interpolation phase // 9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x -> x = X^64 U1_64 = ((int64_t *) W2); U2_64 = ((int64_t *) W0); - for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) { - int32_t i4 = i << 2; + for (i = 0; 
i < (T_TM3_3W_256 << 1); i++) { + i4 = i << 2; W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1])); W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1])); } @@ -447,7 +447,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i *U1_256 = (__m256i *) (U1_64 + 1); tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0); - for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]); } @@ -461,7 +461,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = (int64_t *) W1; __m256i *U2_256 = (__m256i *) (U2_64 + 1); - for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { + for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) { tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]); } @@ -469,19 +469,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W3[2 * (T_TM3_3W_256) - 1] = zero; //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3_3W_256); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256) - for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { + for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) { ro256[i] = W0[i]; ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i]; ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i]; @@ -497,12 +497,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) { U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]); U2_256 = (__m256i *) (U2_64 - 2); - for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) { + for (i = 0; i < T_TM3_3W_256 << 1; i++) { _mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i])); _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i])); } - for (int32_t i = 0; i < 6 * T_TM3_3W_256 - 2; i++) { + for (i = 0; i < 6 * T_TM3_3W_256 - 2; i++) { _mm256_storeu_si256(&Out[i], ro256[i]); } } @@ -541,9 +541,10 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { __m256i ro256[tTM3R / 2]; const __m256i zero = _mm256_setzero_si256(); int32_t T2 = T_TM3R_3W_64 << 1; + int32_t i, i1, i4; - for (int32_t i = 0; i < T_TM3R_3W_256; i++) { - int32_t i4 = i << 2; + for (i = 0; i < T_TM3R_3W_256; i++) { + i4 = i << 2; U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4])); V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4])); U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4 + T_TM3R_3W_64])); @@ -552,7 +553,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2])); } - for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { + for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { U0[i] = zero; V0[i] = zero; U1[i] = zero; @@ -566,12 +567,12 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { // Evaluation: 5*2 add, 2*2 shift; 5 mul (n) //W3 = U2 + U1 + U0; W2 = V2 + V1 + V0 - for (int32_t i = 0; i < T_TM3R_3W_256; i++) { + for (i = 0; i < T_TM3R_3W_256; i++) { W3[i] = U0[i] ^ U1[i] ^ U2[i]; W2[i] = V0[i] ^ V1[i] ^ V2[i]; } - for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { + for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) { W2[i] = 
zero; W3[i] = zero; } @@ -584,7 +585,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W0[1] = U1[0]; W4[1] = V1[0]; - for (int32_t i = 1; i < T_TM3R_3W_256 + 1; i++) { + for (i = 1; i < T_TM3R_3W_256 + 1; i++) { W0[i + 1] = U1[i] ^ U2[i - 1]; W4[i + 1] = V1[i] ^ V2[i - 1]; } @@ -593,13 +594,13 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { W4[T_TM3R_3W_256 + 1] = V2[T_TM3R_3W_256 - 1]; //W3 = W3 + W0 ; W2 = W2 + W4 - for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) { + for (i = 0; i < T_TM3R_3W_256 + 2; i++) { W3[i] ^= W0[i]; W2[i] ^= W4[i]; } //W0 = W0 + U0 ; W4 = W4 + V0 - for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) { + for (i = 0; i < T_TM3R_3W_256 + 2; i++) { W0[i] ^= U0[i]; W4[i] ^= V0[i]; } @@ -607,7 +608,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //W3 = W3 * W2 ; W2 = W0 * W4 TOOM3Mult(tmp, (uint64_t *) W3, (uint64_t *) W2); - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W3[i] = tmp[i]; } @@ -621,25 +622,25 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { //9 add, 1 shift, 1 Smul, 2 Sdiv (2n) //W3 = W3 + W2 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W3[i] ^= W2[i]; } //W1 = W1 + W0 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256); i++) { W1[i] ^= W0[i]; } //W2 =(W2 + W0)/x - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { - int32_t i1 = i + 1; + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { + i1 = i + 1; W2[i] = W2[i1] ^ W0[i1]; } W2[2 * (T_TM3R_3W_256 + 2) - 1] = zero; //W2 =(W2 + W3 + W4*(x^3+1))/(x+1) - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { tmp[i] = W2[i] ^ W3[i] ^ W4[i]; } @@ -647,15 +648,15 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { tmp[2 * (T_TM3R_3W_256 + 2) + 1] = zero; tmp[2 * (T_TM3R_3W_256 + 2) + 2] = zero; - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256); i++) { tmp[i + 3] ^= W4[i]; } divByXplus1_256(W2, tmp, T_TM3R_3W_256); //W3 =(W3 + W1)/(x*(x+1)) - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { - int32_t i1 = i + 1; + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) { + i1 = i + 1; tmp[i] = W3[i1] ^ W1[i1]; } @@ -663,18 +664,18 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { divByXplus1_256(W3, tmp, T_TM3R_3W_256); //W1 = W1 + W4 + W2 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W1[i] ^= W2[i] ^ W4[i]; } //W2 = W2 + W3 - for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { + for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) { W2[i] ^= W3[i]; } // Recomposition //W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4 //W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256+2) - for (int32_t i = 0; i < T_TM3R_3W_256; i++) { + for (i = 0; i < T_TM3R_3W_256; i++) { ro256[i] = W0[i]; ro256[i + T_TM3R_3W_256] = W0[i + T_TM3R_3W_256] ^ W1[i]; ro256[i + 2 * T_TM3R_3W_256] = W1[i + T_TM3R_3W_256] ^ W2[i]; @@ -696,7 +697,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) { ro256[3 + 5 * T_TM3R_3W_256] ^= W3[3 + 2 * T_TM3R_3W_256]; - for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { + for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) { _mm256_storeu_si256(&Out[i], 
ro256[i]); } } diff --git a/crypto_kem/hqc-rmrs-256/avx2/reed_muller.c b/crypto_kem/hqc-rmrs-256/avx2/reed_muller.c index abc2c6ab..995cf45f 100644 --- a/crypto_kem/hqc-rmrs-256/avx2/reed_muller.c +++ b/crypto_kem/hqc-rmrs-256/avx2/reed_muller.c @@ -231,7 +231,19 @@ inline void hadamard(__m256i *src, __m256i *dst) { inline uint32_t find_peaks(__m256i *transform) { // a whole lot of vector variables __m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows; - __m256i peak_mask; + __m256i tmp = _mm256_setzero_si256(); + __m256i vect_mask; + __m256i res; + int32_t lower; + int32_t width; + uint32_t message; + uint32_t mask; + int8_t index; + int8_t abs_value; + int8_t mask1; + int8_t mask2; + uint16_t result; + // compute absolute value of transform for (size_t i = 0; i < 8; i++) { abs_rows[i] = _mm256_abs_epi16(transform[i]); @@ -245,9 +257,9 @@ inline uint32_t find_peaks(__m256i *transform) { // do binary search for the highest value that is lower than the maximum // loop invariant: lower gives bit map = 0, lower + width gives bit map > 0 - int32_t lower = 1; + lower = 1; // this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6 - int32_t width = 1 << (5 + MULTIPLICITY / 2); + width = 1 << (5 + MULTIPLICITY / 2); // if you don't unroll this loop, it fits in the loop cache // uncomment the line below to speeding up the program by a few percent // #pragma GCC unroll 0 @@ -259,8 +271,9 @@ inline uint32_t find_peaks(__m256i *transform) { bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound); // step up if there are any matches // rely on compiler to use conditional move here - int32_t step_mask = _mm256_testz_si256(bitmap, bitmap) - 1; - lower += step_mask & width; + mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap); + mask = ~(uint32_t) ((-(int64_t) mask) >> 63); + lower += mask & width; } // lower+width contains the maximum value of the vector // or less, if the maximum is very high (which is OK) @@ -272,30 +285,26 @@ inline uint32_t find_peaks(__m256i *transform) { // find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message // find lowest value by searching backwards skip first check to save time - size_t message = 0x70; - for (int32_t i = 7; i >= 0; i--) { - bitmap = _mm256_cmpgt_epi16(abs_rows[i], bound); - int message_mask = (-(int16_t)(_mm256_testz_si256(bitmap, bitmap) == 0)) >> 15; - message ^= message_mask & (message ^ (unsigned)i << 4); + message = 0x70; + for (size_t i = 0; i < 8; i++) { + bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound); + mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap); + mask = ~(uint32_t) ((-(int64_t) mask) >> 63); + message ^= mask & (message ^ ((7 - i) << 4)); } // we decided which row of the matrix contains the lowest match // select proper row - int8_t index = message >> 4; - __m256i res; - __m256i tmp = (__m256i) { - 0ULL, 0ULL, 0ULL, 0ULL - }; + index = message >> 4; - for (int8_t i = 0; i < 8; i++) { - int8_t abs_value = (int8_t)(index - i); - int8_t mask1 = abs_value >> 7; + tmp = _mm256_setzero_si256(); + for (size_t i = 0; i < 8; i++) { + abs_value = (int8_t)(index - i); + mask1 = abs_value >> 7; abs_value ^= mask1; abs_value -= mask1; - int8_t mask2 = ((uint8_t) - abs_value >> 7); - int64_t mask3 = (-1ULL) + mask2; - __m256i vect_mask = (__m256i) { - mask3, mask3, mask3, mask3 - }; + mask2 = ((uint8_t) - abs_value >> 7); + mask = (-1ULL) + mask2; + vect_mask = _mm256_set1_epi32(mask); res = _mm256_and_si256(abs_rows[i], vect_mask); tmp = _mm256_or_si256(tmp, res); } @@ -305,34 +314,29 @@ inline uint32_t 
find_peaks(__m256i *transform) { // get the column number of the vector element // by setting the bits corresponding to the columns // and then adding elements within two groups of 8 - peak_mask = _mm256_cmpgt_epi16(active_row, bound); - peak_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1); - for (int32_t i = 0; i < 3; i++) { - peak_mask = _mm256_hadd_epi16(peak_mask, peak_mask); + vect_mask = _mm256_cmpgt_epi16(active_row, bound); + vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1); + for (size_t i = 0; i < 3; i++) { + vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask); } // add low 4 bits of message - message |= __tzcnt_u16(_mm256_extract_epi16(peak_mask, 0) + _mm256_extract_epi16(peak_mask, 8)); + message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8)); // set bit 7 if sign of biggest value is positive // make sure a jump isn't generated by the compiler - tmp = (__m256i) { - 0ULL, 0ULL, 0ULL, 0ULL - }; - for (uint32_t i = 0; i < 8; i++) { - int64_t message_mask = (-(int64_t)(i == message / 16)) >> 63; - __m256i vect_mask = (__m256i) { - message_mask, message_mask, message_mask, message_mask - }; + tmp = _mm256_setzero_si256(); + for (size_t i = 0; i < 8; i++) { + mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63); + __m256i vect_mask = _mm256_set1_epi32(mask); tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i])); } - uint16_t result = 0; - for (uint32_t i = 0; i < 16; i++) { - uint16_t *ptr = (uint16_t *) &tmp; - int32_t message_mask = (-(int32_t)(i == message % 16)) >> (sizeof(int32_t) * 8 - 1); - result |= message_mask & ptr[i]; + result = 0; + for (size_t i = 0; i < 16; i++) { + mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63); + result |= mask & ((uint16_t *)&tmp)[i]; } message |= (0x8000 & ~result) >> 8; - return (uint32_t) message; + return message; } diff --git a/crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c b/crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c index 11b52c2b..89ba812c 100644 --- a/crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c +++ b/crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c @@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons uint16_t beta_j[PARAM_DELTA] = {0}; uint16_t e_j[PARAM_DELTA] = {0}; - uint16_t delta_counter = 0; + uint16_t delta_counter; uint16_t delta_real_value; + uint16_t found; + uint16_t mask1; + uint16_t mask2; + uint16_t tmp1; + uint16_t tmp2; + uint16_t inverse; + uint16_t inverse_power_j; // Compute the beta_{j_i} page 31 of the documentation + delta_counter = 0; for (size_t i = 0; i < PARAM_N1; i++) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 - for (uint16_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - beta_j[j] += indexmask & valuemask & gf_exp[i]; - found += indexmask & valuemask & 1; + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + for (size_t j = 0; j < PARAM_DELTA; j++) { + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + beta_j[j] += mask1 & mask2 & gf_exp[i]; + found += mask1 & mask2 & 1; } delta_counter += found; } @@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons // Compute the e_{j_i} page 31 of the documentation for (size_t i = 0; i < 
PARAM_DELTA; ++i) { - uint16_t tmp1 = 1; - uint16_t tmp2 = 1; - uint16_t inverse = PQCLEAN_HQCRMRS256_AVX2_gf_inverse(beta_j[i]); - uint16_t inverse_power_j = 1; + tmp1 = 1; + tmp2 = 1; + inverse = PQCLEAN_HQCRMRS256_AVX2_gf_inverse(beta_j[i]); + inverse_power_j = 1; for (size_t j = 1; j <= PARAM_DELTA; ++j) { inverse_power_j = PQCLEAN_HQCRMRS256_AVX2_gf_mul(inverse_power_j, inverse); @@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons for (size_t k = 1; k < PARAM_DELTA; ++k) { tmp2 = PQCLEAN_HQCRMRS256_AVX2_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA]))); } - uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value - e_j[i] = mask & PQCLEAN_HQCRMRS256_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS256_AVX2_gf_inverse(tmp2)); + mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value + e_j[i] = mask1 & PQCLEAN_HQCRMRS256_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS256_AVX2_gf_inverse(tmp2)); } // Place the delta e_{j_i} values at the right coordinates of the output vector delta_counter = 0; for (size_t i = 0; i < PARAM_N1; ++i) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 for (size_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - error_values[i] += indexmask & valuemask & e_j[j]; - found += indexmask & valuemask & 1; + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + error_values[i] += mask1 & mask2 & e_j[j]; + found += mask1 & mask2 & 1; } delta_counter += found; } diff --git a/crypto_kem/hqc-rmrs-256/avx2/vector.c b/crypto_kem/hqc-rmrs-256/avx2/vector.c index f6aa1d5e..1d4546ca 100644 --- a/crypto_kem/hqc-rmrs-256/avx2/vector.c +++ b/crypto_kem/hqc-rmrs-256/avx2/vector.c @@ -32,72 +32,63 @@ void PQCLEAN_HQCRMRS256_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; __m256i bit256[PARAM_OMEGA_R]; __m256i bloc256[PARAM_OMEGA_R]; - static __m256i posCmp256 = (__m256i) { - 0UL, 1UL, 2UL, 3UL - }; -#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256) - - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + __m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0); + uint64_t bloc, pos, bit64; + uint8_t inc; + size_t i, j; + + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + tmp[i] = ((uint32_t) rand_bytes[j++]) << 16; + tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8; + tmp[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (tmp[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + tmp[i] = tmp[i] % PARAM_N; + inc = 1; for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; + if (tmp[k] == tmp[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } + i += inc; } - for (uint32_t i = 0; i < weight; i++) { + for 
(i = 0; i < weight; i++) { // we store the bloc number and bit position of each vb[i] - uint64_t bloc = tmp[i] >> 6; + bloc = tmp[i] >> 6; bloc256[i] = _mm256_set1_epi64x(bloc >> 2); - uint64_t pos = (bloc & 0x3UL); + pos = (bloc & 0x3UL); __m256i pos256 = _mm256_set1_epi64x(pos); __m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256); - uint64_t bit64 = 1ULL << (tmp[i] & 0x3f); + bit64 = 1ULL << (tmp[i] & 0x3f); __m256i bloc256 = _mm256_set1_epi64x(bit64); bit256[i] = bloc256 & mask256; } - for (uint32_t i = 0; i < LOOP_SIZE; i++) { + for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) { __m256i aux = _mm256_loadu_si256(((__m256i *)v) + i); __m256i i256 = _mm256_set1_epi64x(i); - for (uint32_t j = 0; j < weight; j++) { + for (j = 0; j < weight; j++) { __m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256); aux ^= bit256[j] & mask256; } _mm256_storeu_si256(((__m256i *)v) + i, aux); } -#undef LOOP_SIZE } @@ -167,10 +158,9 @@ uint8_t PQCLEAN_HQCRMRS256_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v * @param[in] size_v Integer that is the size of the input vector in bits */ void PQCLEAN_HQCRMRS256_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) { + uint64_t mask = 0x7FFFFFFFFFFFFFFF; + int8_t val = 0; if (size_o < size_v) { - uint64_t mask = 0x7FFFFFFFFFFFFFFF; - int8_t val = 0; - if (size_o % 64) { val = 64 - (size_o % 64); } diff --git a/crypto_kem/hqc-rmrs-256/clean/reed_solomon.c b/crypto_kem/hqc-rmrs-256/clean/reed_solomon.c index df6b3997..2d28554d 100644 --- a/crypto_kem/hqc-rmrs-256/clean/reed_solomon.c +++ b/crypto_kem/hqc-rmrs-256/clean/reed_solomon.c @@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons uint16_t beta_j[PARAM_DELTA] = {0}; uint16_t e_j[PARAM_DELTA] = {0}; - uint16_t delta_counter = 0; + uint16_t delta_counter; uint16_t delta_real_value; + uint16_t found; + uint16_t mask1; + uint16_t mask2; + uint16_t tmp1; + uint16_t tmp2; + uint16_t inverse; + uint16_t inverse_power_j; // Compute the beta_{j_i} page 31 of the documentation + delta_counter = 0; for (size_t i = 0; i < PARAM_N1; i++) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 - for (uint16_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - beta_j[j] += indexmask & valuemask & gf_exp[i]; - found += indexmask & valuemask & 1; + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + for (size_t j = 0; j < PARAM_DELTA; j++) { + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + beta_j[j] += mask1 & mask2 & gf_exp[i]; + found += mask1 & mask2 & 1; } delta_counter += found; } @@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons // Compute the e_{j_i} page 31 of the documentation for (size_t i = 0; i < PARAM_DELTA; ++i) { - uint16_t tmp1 = 1; - uint16_t tmp2 = 1; - uint16_t inverse = PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(beta_j[i]); - uint16_t inverse_power_j = 1; + tmp1 = 1; + tmp2 = 1; + inverse = PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(beta_j[i]); + inverse_power_j = 1; for (size_t j = 1; j <= PARAM_DELTA; ++j) { inverse_power_j = PQCLEAN_HQCRMRS256_CLEAN_gf_mul(inverse_power_j, inverse); @@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons for (size_t k = 1; k < PARAM_DELTA; ++k) { tmp2 = PQCLEAN_HQCRMRS256_CLEAN_gf_mul(tmp2, (1 ^ 
PQCLEAN_HQCRMRS256_CLEAN_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA]))); } - uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value - e_j[i] = mask & PQCLEAN_HQCRMRS256_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(tmp2)); + mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value + e_j[i] = mask1 & PQCLEAN_HQCRMRS256_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(tmp2)); } // Place the delta e_{j_i} values at the right coordinates of the output vector delta_counter = 0; for (size_t i = 0; i < PARAM_N1; ++i) { - uint16_t found = 0; - uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 + found = 0; + mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0 for (size_t j = 0; j < PARAM_DELTA; j++) { - uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter - error_values[i] += indexmask & valuemask & e_j[j]; - found += indexmask & valuemask & 1; + mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter + error_values[i] += mask1 & mask2 & e_j[j]; + found += mask1 & mask2 & 1; } delta_counter += found; } diff --git a/crypto_kem/hqc-rmrs-256/clean/vector.c b/crypto_kem/hqc-rmrs-256/clean/vector.c index 58486178..139e5bc3 100644 --- a/crypto_kem/hqc-rmrs-256/clean/vector.c +++ b/crypto_kem/hqc-rmrs-256/clean/vector.c @@ -31,39 +31,33 @@ void PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) { size_t random_bytes_size = 3 * weight; uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; - uint8_t exist = 0; - size_t j = 0; + uint8_t inc; + size_t i, j; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; + i = 0; + j = random_bytes_size; + while (i < weight) { do { if (j == random_bytes_size) { seedexpander(ctx, rand_bytes, random_bytes_size); j = 0; } - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - random_data |= rand_bytes[j++]; + v[i] = ((uint32_t) rand_bytes[j++]) << 16; + v[i] |= ((uint32_t) rand_bytes[j++]) << 8; + v[i] |= rand_bytes[j++]; - } while (random_data >= UTILS_REJECTION_THRESHOLD); + } while (v[i] >= UTILS_REJECTION_THRESHOLD); - random_data = random_data % PARAM_N; + v[i] = v[i] % PARAM_N; - for (uint32_t k = 0; k < i; k++) { - if (v[k] == random_data) { - exist = 1; + inc = 1; + for (size_t k = 0; k < i; k++) { + if (v[k] == v[i]) { + inc = 0; } } - - if (exist == 1) { - i--; - } else { - v[i] = random_data; - } + i += inc; } } @@ -86,46 +80,11 @@ void PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XO * @param[in] ctx Pointer to the context of the seed expander */ void PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) { - - size_t random_bytes_size = 3 * weight; - uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R - uint32_t random_data = 0; uint32_t tmp[PARAM_OMEGA_R] = {0}; - uint8_t exist = 0; - size_t j = 0; - seedexpander(ctx, rand_bytes, random_bytes_size); - - for (uint32_t i = 0; i < weight; ++i) { - exist = 0; - do { - if (j == random_bytes_size) { - seedexpander(ctx, rand_bytes, random_bytes_size); - j = 0; - } - - random_data = ((uint32_t) rand_bytes[j++]) << 16; - random_data |= ((uint32_t) rand_bytes[j++]) << 8; - 
random_data |= rand_bytes[j++]; - - } while (random_data >= UTILS_REJECTION_THRESHOLD); - - random_data = random_data % PARAM_N; - - for (uint32_t k = 0; k < i; k++) { - if (tmp[k] == random_data) { - exist = 1; - } - } - - if (exist == 1) { - i--; - } else { - tmp[i] = random_data; - } - } + PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight); - for (uint16_t i = 0; i < weight; ++i) { + for (size_t i = 0; i < weight; ++i) { int32_t index = tmp[i] / 64; int32_t pos = tmp[i] % 64; v[index] |= ((uint64_t) 1) << pos; diff --git a/test/duplicate_consistency/hqc-128_clean.yml b/test/duplicate_consistency/hqc-128_clean.yml index a20fa459..40027f65 100644 --- a/test/duplicate_consistency/hqc-128_clean.yml +++ b/test/duplicate_consistency/hqc-128_clean.yml @@ -19,6 +19,7 @@ consistency_checks: - parsing.h - repetition.h - vector.h + - bch.c - code.c - fft.c - gf2x.c @@ -46,6 +47,7 @@ consistency_checks: - parsing.h - repetition.h - vector.h + - bch.c - code.c - fft.c - gf2x.c
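
Note on the recurring idiom: the rewrites above repeatedly swap branching comparisons for arithmetic masks, e.g. the mask1/mask2 expressions in reed_solomon.c (error[i] != 0 and j == delta_counter) and the ~((-(int64_t) x) >> 63) expressions in reed_muller.c, combined with an XOR-blend such as message ^= mask & (message ^ ((7 - i) << 4)). The stand-alone sketch below illustrates that idiom in isolation. It is not part of the patch: the helper names mask_nonzero_u16 and mask_eq_u16 are hypothetical, and the code assumes the arithmetic right shift of negative signed values that these files already rely on.

    #include <stdint.h>
    #include <stdio.h>

    /* 0xFFFF if x != 0, else 0x0000, branch-free.
     * Same construction as "(uint16_t) (-((int32_t) error[i]) >> 31)". */
    static uint16_t mask_nonzero_u16(uint16_t x) {
        return (uint16_t) (-((int32_t) x) >> 31);
    }

    /* 0xFFFF if a == b, else 0x0000, branch-free.
     * Same construction as "~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31))". */
    static uint16_t mask_eq_u16(uint16_t a, uint16_t b) {
        return (uint16_t) ~(-((int32_t) (a ^ b)) >> 31);
    }

    int main(void) {
        uint16_t r = 5;          /* current value                            */
        uint16_t candidate = 9;  /* value to take when both conditions hold  */

        /* Combine conditions by AND-ing their masks, as the loops above do. */
        uint16_t mask = mask_eq_u16(3, 3) & mask_nonzero_u16(7);

        /* XOR-blend select: r becomes candidate when mask is all-ones,
         * and is left unchanged when mask is zero; no data-dependent branch. */
        r ^= mask & (r ^ candidate);

        printf("mask=0x%04x r=%u\n", (unsigned) mask, (unsigned) r); /* mask=0xffff r=9 */
        return 0;
    }

The accumulation form used in compute_error_values is the same trick with the blend replaced by an addition: beta_j[j] += mask1 & mask2 & gf_exp[i] adds gf_exp[i] only when both masks are all-ones, without revealing through control flow which iteration matched.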