Browse source

readability changes

tags/v0.0.1
John M. Schanck 4 years ago
committed by Kris Kwiatkowski
parent
commit
629b89ba73
37 changed files with 1134 additions and 1348 deletions
  1. +27 -23  crypto_kem/hqc-128/avx2/bch.c
  2. +21 -20  crypto_kem/hqc-128/avx2/code.c
  3. +52 -50  crypto_kem/hqc-128/avx2/gf2x.c
  4. +25 -35  crypto_kem/hqc-128/avx2/vector.c
  5. +26 -24  crypto_kem/hqc-128/clean/bch.c
  6. +17 -58  crypto_kem/hqc-128/clean/vector.c
  7. +27 -23  crypto_kem/hqc-192/avx2/bch.c
  8. +21 -20  crypto_kem/hqc-192/avx2/code.c
  9. +59 -57  crypto_kem/hqc-192/avx2/gf2x.c
  10. +25 -35  crypto_kem/hqc-192/avx2/vector.c
  11. +24 -22  crypto_kem/hqc-192/clean/bch.c
  12. +17 -58  crypto_kem/hqc-192/clean/vector.c
  13. +27 -23  crypto_kem/hqc-256/avx2/bch.c
  14. +33 -33  crypto_kem/hqc-256/avx2/code.c
  15. +67 -66  crypto_kem/hqc-256/avx2/gf2x.c
  16. +25 -35  crypto_kem/hqc-256/avx2/vector.c
  17. +24 -22  crypto_kem/hqc-256/clean/bch.c
  18. +17 -58  crypto_kem/hqc-256/clean/vector.c
  19. +52 -50  crypto_kem/hqc-rmrs-128/avx2/gf2x.c
  20. +46 -42  crypto_kem/hqc-rmrs-128/avx2/reed_muller.c
  21. +26 -18  crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c
  22. +25 -35  crypto_kem/hqc-rmrs-128/avx2/vector.c
  23. +26 -18  crypto_kem/hqc-rmrs-128/clean/reed_solomon.c
  24. +17 -58  crypto_kem/hqc-rmrs-128/clean/vector.c
  25. +59 -57  crypto_kem/hqc-rmrs-192/avx2/gf2x.c
  26. +46 -42  crypto_kem/hqc-rmrs-192/avx2/reed_muller.c
  27. +26 -18  crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c
  28. +25 -35  crypto_kem/hqc-rmrs-192/avx2/vector.c
  29. +26 -18  crypto_kem/hqc-rmrs-192/clean/reed_solomon.c
  30. +17 -58  crypto_kem/hqc-rmrs-192/clean/vector.c
  31. +67 -66  crypto_kem/hqc-rmrs-256/avx2/gf2x.c
  32. +46 -42  crypto_kem/hqc-rmrs-256/avx2/reed_muller.c
  33. +26 -18  crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c
  34. +25 -35  crypto_kem/hqc-rmrs-256/avx2/vector.c
  35. +26 -18  crypto_kem/hqc-rmrs-256/clean/reed_solomon.c
  36. +17 -58  crypto_kem/hqc-rmrs-256/clean/vector.c
  37. +2 -0    test/duplicate_consistency/hqc-128_clean.yml

+27 -23  crypto_kem/hqc-128/avx2/bch.c

@@ -35,52 +35,54 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma);
* @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes
*/
static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) {
sigma[0] = 1;
size_t deg_sigma = 0;
size_t deg_sigma_p = 0;
uint16_t sigma_copy[PARAM_DELTA - 1] = {0};
size_t deg_sigma_copy = 0;
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1};
int32_t pp = -1; // 2*rho
uint16_t d_p = 1;
uint16_t d = syndromes[0];
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0};
uint16_t d_p, d, dd;
uint16_t mask;
int32_t pp; // 2*rho
size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p;

d = syndromes[0];
sigma[0] = 1;
X_sigma_p[1] = 1;
deg_sigma = 0;
deg_sigma_p = 0;
d_p = 1;
pp = -1;
for (size_t mu = 0; mu < PARAM_DELTA; ++mu) {
// Save sigma in case we need it to update X_sigma_p
memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1));
deg_sigma_copy = deg_sigma;

uint16_t dd = PQCLEAN_HQC128_AVX2_gf_mul(d, PQCLEAN_HQC128_AVX2_gf_inverse(d_p)); // 0 if(d == 0)
dd = PQCLEAN_HQC128_AVX2_gf_mul(d, PQCLEAN_HQC128_AVX2_gf_inverse(d_p)); // 0 if(d == 0)
for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) {
sigma[i] ^= PQCLEAN_HQC128_AVX2_gf_mul(dd, X_sigma_p[i]);
}

size_t deg_X = 2 * mu - pp; // 2*(mu-rho)
size_t deg_X_sigma_p = deg_X + deg_sigma_p;
deg_X_sigma_p = 2 * mu - pp + deg_sigma_p;

// mask1 = 0xffff if(d != 0) and 0 otherwise
int16_t mask1 = -((uint16_t) - d >> 15);
// mask = 0xffff if(d != 0) and 0 otherwise
mask = -((uint16_t) - d >> 15);

// mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);
// mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);

// mask12 = 0xffff if the deg_sigma increased and 0 otherwise
int16_t mask12 = mask1 & mask2;
deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma);
deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p);

if (mu == PARAM_DELTA - 1) {
break;
}

// Update pp, d_p and X_sigma_p if needed
pp = (mask12 & (2 * mu)) ^ (~mask12 & pp);
d_p = (mask12 & d) ^ (~mask12 & d_p);
pp ^= mask & (pp ^ (2 * mu));
d_p ^= mask & (d_p ^ d);
for (size_t i = PARAM_DELTA - 1; i; --i) {
X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]);
X_sigma_p[i + 1] = X_sigma_p[i - 1];
X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]);
}
X_sigma_p[1] = 0;
X_sigma_p[0] = 0;
deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p);
deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy);

// Compute the next discrepancy
d = syndromes[2 * mu + 2];
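
The compute_elp rewrite above shows the two patterns this commit applies throughout the BCH decoders: declarations are hoisted to the top of the function with initialization split out, and the mask1/mask2 pair is folded into a single running mask (built with &=) that drives XOR-form conditional updates instead of the two-operand (mask12 & x) ^ (~mask12 & y) selects. Both select forms are branchless and compute the same value; a minimal sketch of the adopted idiom, assuming mask is all-ones or all-zero as produced by the sign-smearing expressions above:

    #include <stdint.h>

    /* Constant-time select: returns x when mask == 0xffff, y when mask == 0.
     * Used in update form above, e.g. d_p ^= mask & (d_p ^ d). */
    static uint16_t ct_select(uint16_t y, uint16_t x, uint16_t mask) {
        return y ^ (uint16_t)(mask & (y ^ x));
    }

The same hoisting also folds the intermediate deg_X into a single expression, deg_X_sigma_p = 2 * mu - pp + deg_sigma_p, without changing the arithmetic.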
@@ -145,6 +147,7 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) {
uint32_t *aux;
int16_t *alpha_tmp;
uint32_t i;
uint32_t nzflag;
// static variable so that it is stored in the DATA segment
// not in the STACK segment
static uint8_t tmp_array[PARAM_N1 + 4]; // +4 to control overflow due to management of 256 bits
@@ -169,7 +172,8 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) {
alpha_tmp = table_alpha_ij + (j << 4);

for (size_t i = 0; i < PARAM_N1; ++i) {
tmp_repeat = _mm256_set1_epi64x((long long)(tmp_array[i] != 0));
nzflag = ((-(int32_t) tmp_array[i]) >> 31) & 1;
tmp_repeat = _mm256_set1_epi64x(nzflag);
L = _mm256_cmpeq_epi64(tmp_repeat, un_256);
tmp_repeat = _mm256_lddqu_si256((__m256i *)(alpha_tmp + i * (PARAM_DELTA << 1)));
L = _mm256_and_si256(L, tmp_repeat);
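
Here the predicate "tmp_array[i] is non-zero" is rebuilt arithmetically instead of with the C comparison (tmp_array[i] != 0), whose lowering (branch vs. setcc) is left to the compiler. A sketch of the nzflag expression, assuming arithmetic right shift of negative values (implementation-defined in C, but what gcc and clang do on the platforms PQClean targets):

    #include <stdint.h>

    /* 1 if x != 0, else 0: for any x in 1..255, -(int32_t)x is negative,
     * so the shift by 31 smears the sign bit across the word and the
     * final AND keeps a single 0/1 bit. */
    static uint32_t nonzero_flag(uint8_t x) {
        return (uint32_t)(((-(int32_t)x) >> 31) & 1);
    }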


+21 -20  crypto_kem/hqc-128/avx2/code.c

@@ -34,15 +34,18 @@ static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) {
* @param[in] m Pointer to an array that is the message
*/
void PQCLEAN_HQC128_AVX2_code_encode(uint64_t *em, const uint64_t *m) {
uint64_t res;
uint32_t i;
static const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFUL, 0x3FFFFFFFUL}};
const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFUL, 0x3FFFFFFFUL}};
size_t i, pos_r;
uint64_t bit;
uint64_t idx_r;
uint64_t select;


__m256i *colonne, y, aux0;
__m256i msg = _mm256_lddqu_si256((const __m256i *) m);
colonne = ((__m256i *) gen_matrix);

pos_r = 0;
for (i = 0; i < PARAM_N1 - PARAM_K; i++) {
// y is the and operation between m and ith column of G
y = _mm256_and_si256(colonne[i], msg);
@@ -54,34 +57,32 @@ void PQCLEAN_HQC128_AVX2_code_encode(uint64_t *em, const uint64_t *m) {
aux0 = _mm256_shuffle_epi32(y, 0x4e);
// y = (y0^y1^y2^y3 repeated 4 times)
y = _mm256_xor_si256(aux0, y);
res = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1;
bit = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1;


uint16_t pos_r = PARAM_N2 * i;
uint16_t idx_r = (pos_r & 0x3f);
uint64_t *p64 = em;
p64 += pos_r >> 6;
uint64_t select = mux(mask[0][0], mask[1][0], res);
*p64 ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], res);
*(p64 + 1) ^= select >> ((63 - idx_r));
idx_r = (pos_r & 0x3f);
select = mux(mask[0][0], mask[1][0], bit);
em[(pos_r >> 6) + 0] ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], bit);
em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r));
pos_r += PARAM_N2;
}

/* now we add the message m */
/* systematic encoding */
pos_r = PARAM_N2 * (PARAM_N1 - PARAM_K);
for (int32_t i = 0; i < 4; i++) {
for (int32_t j = 0; j < 64; j++) {
uint8_t bit = (m[i] >> j) & 0x1;
uint32_t pos_r = PARAM_N2 * ((PARAM_N1 - PARAM_K) + ((i << 6) + j));
uint16_t idx_r = (pos_r & 0x3f);
uint64_t *p64 = em;
bit = (m[i] >> j) & 0x1;


p64 += pos_r >> 6;
uint64_t select = mux(mask[0][0], mask[1][0], bit);
*p64 ^= select << idx_r;
idx_r = (pos_r & 0x3f);
select = mux(mask[0][0], mask[1][0], bit);
em[(pos_r >> 6) + 0] ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], bit);
*(p64 + 1) ^= select >> ((63 - idx_r));
em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r));

pos_r += PARAM_N2;
}
}
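
In code_encode, each parity bit of the outer code is replicated over a PARAM_N2-bit run of em, and since PARAM_N2 is not a multiple of 64 the run straddles a word boundary: select << idx_r lands the low part in word pos_r >> 6 and select >> (63 - idx_r) spills the rest into the next word. The run itself is chosen branchlessly between the all-zero and non-zero rows of mask via the mux helper; its signature is visible in the hunk header above, and a plausible body (an assumption for illustration, not the committed code) is:

    #include <stdint.h>

    /* Constant-time two-way multiplexer: a when bit == 0, b when bit == 1. */
    static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) {
        uint64_t m = (uint64_t)(-bit);   /* bit in {0,1} -> m in {0, all-ones} */
        return a ^ (m & (a ^ b));
    }

The rewrite also trades the roving pointer (p64 += pos_r >> 6; *p64 ^= ...) for direct indexing em[(pos_r >> 6) + 0] and em[(pos_r >> 6) + 1], and maintains pos_r incrementally with pos_r += PARAM_N2 rather than recomputing PARAM_N2 * i in every iteration.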



+52 -50  crypto_kem/hqc-128/avx2/gf2x.c

@@ -188,22 +188,23 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
int32_t i, is, is2, is3;

karat_mult_4( D0, A, B);
karat_mult_4(D2, A + 4, B + 4);

for (int32_t i = 0; i < 4; i++) {
int is = i + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_4(D1, SAA, SBB);

for (int32_t i = 0; i < 4; i++) {
int32_t is = i + 4;
int32_t is2 = is + 4;
int32_t is3 = is2 + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
is2 = is + 4;
is3 = is2 + 4;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -227,22 +228,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
int32_t i, is, is2, is3;

karat_mult_8( D0, A, B);
karat_mult_8(D2, A + 8, B + 8);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_8( D1, SAA, SBB);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
int32_t is2 = is + 8;
int32_t is3 = is2 + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
is2 = is + 8;
is3 = is2 + 8;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -266,22 +268,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
int32_t i, is, is2, is3;

karat_mult_16( D0, A, B);
karat_mult_16(D2, A + 16, B + 16);

for (int32_t i = 0; i < 16; i++) {
int is = i + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_16( D1, SAA, SBB);

for (int32_t i = 0; i < 16; i++) {
int32_t is = i + 16;
int32_t is2 = is + 16;
int32_t is3 = is2 + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
is2 = is + 16;
is3 = is2 + 16;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);
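
karat_mult_8, karat_mult_16 and karat_mult_32 all share the Karatsuba shape visible above: split each operand into halves, compute D0 = A_lo*B_lo, D2 = A_hi*B_hi and D1 = (A_lo ^ A_hi)*(B_lo ^ B_hi) with the half-size routine, then recombine; over GF(2)[x] the subtraction of classic Karatsuba becomes XOR, so the middle stream is D0 ^ D1 ^ D2. A toy scalar version of the same identity (the 8-bit base case is hypothetical, not repository code):

    #include <stdint.h>

    /* Schoolbook carry-less multiply of two degree-<8 GF(2) polynomials. */
    static uint16_t clmul8(uint8_t a, uint8_t b) {
        uint16_t r = 0;
        for (int i = 0; i < 8; i++) {
            r ^= (uint16_t)(((b >> i) & 1) * ((uint16_t)a << i));
        }
        return r;
    }

    /* One Karatsuba level: A*B = D0 ^ ((D0^D1^D2) << 8) ^ (D2 << 16),
     * three half-size products instead of four. */
    static uint32_t karat16(uint16_t a, uint16_t b) {
        uint8_t a0 = (uint8_t)a, a1 = (uint8_t)(a >> 8);
        uint8_t b0 = (uint8_t)b, b1 = (uint8_t)(b >> 8);
        uint16_t d0 = clmul8(a0, b0);
        uint16_t d2 = clmul8(a1, b1);
        uint16_t d1 = clmul8(a0 ^ a1, b0 ^ b1);
        uint32_t mid = (uint32_t)(d0 ^ d1 ^ d2);
        return (uint32_t)d0 ^ (mid << 8) ^ ((uint32_t)d2 << 16);
    }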

@@ -329,11 +332,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = _mm256_setzero_si256();
int64_t *U1_64;
int64_t *U2_64;
int64_t *V1_64;
int64_t *V2_64;
int32_t T2 = T_TM3_3W_64 << 1;
int32_t i, i4, i41, i42;

for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) {
int32_t i4 = i << 2;
int32_t i42 = i4 - 2;
for (i = 0; i < T_TM3_3W_256 - 1; i++) {
i4 = i << 2;
i42 = i4 - 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64]));
@@ -342,9 +350,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4]));
}

for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
int32_t i4 = i << 2;
int32_t i41 = i4 + 1;
for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
i41 = i4 + 1;
U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
@@ -357,7 +365,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty)
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}
@@ -366,23 +374,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
karat_mult_32( W1, W2, W3);

//W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
int64_t *U1_64 = ((int64_t *) U1);
int64_t *U2_64 = ((int64_t *) U2);

int64_t *V1_64 = ((int64_t *) V1);
int64_t *V2_64 = ((int64_t *) V2);

W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

U1_64 = ((int64_t *) U1);
U2_64 = ((int64_t *) U2);

V1_64 = ((int64_t *) V1);
V2_64 = ((int64_t *) V2);

for (int32_t i = 1; i < T_TM3_3W_256; i++) {
int i4 = i << 2;
W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

for (i = 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1]));
W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2]));

@@ -391,13 +393,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
}

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}
@@ -405,7 +407,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W3 = W3 * W2 ; W2 = W0 * W4
karat_mult_32(tmp, W3, W2);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] = tmp[i];
}

@@ -417,20 +419,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Interpolation phase
// 9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x -> x = X^64
U1_64 = ((int64_t *) W2);
U2_64 = ((int64_t *) W0);
for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) {
int32_t i4 = i << 2;
for (i = 0; i < (T_TM3_3W_256 << 1); i++) {
i4 = i << 2;
W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1]));
W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1]));
}
@@ -440,7 +442,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);

for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
}

@@ -454,7 +456,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = (int64_t *) W1;
__m256i *U2_256 = (__m256i *) (U2_64 + 1);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]);
}

@@ -462,19 +464,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W3[2 * (T_TM3_3W_256) - 1] = zero;

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W2[i] ^ W4[i];
}

//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256)
for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
ro256[i] = W0[i];
ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i];
ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i];
@@ -490,12 +492,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]);
U2_256 = (__m256i *) (U2_64 - 2);

for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) {
for (i = 0; i < T_TM3_3W_256 << 1; i++) {
_mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i]));
_mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
}

for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}
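
TOOM3Mult is Toom-Cook-3 over GF(2)[X]: each input is split into three limbs and read as a degree-2 polynomial in Y, and the product's five Y-coefficients are recovered from five evaluations, which are the five karat_mult_32 calls of one-third size. Per the comments in the hunk, the points are

    U(Y) = U0 + U1*Y + U2*Y^2,   V(Y) likewise,   x = X^64
    W(Y) = U(Y)*V(Y) = W0 + W1*Y + W2*Y^2 + W3*Y^3 + W4*Y^4
    evaluation points: 0, 1, x, 1+x, infinity

so every evaluation and interpolation step is built from XORs, 64-bit-word shifts (the "/x" steps that re-read the arrays through int64_t pointers at offset +1), and exact division by x+1 (the A(x)/(x+1) helper whose doc-comment appears in the hqc-256 copy of this file below). The loop-counter changes in this hunk are the same declaration hoisting applied everywhere else in the commit and do not alter the algorithm.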


+25 -35  crypto_kem/hqc-128/avx2/vector.c

@@ -32,72 +32,63 @@
void PQCLEAN_HQC128_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0};
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;
__m256i bit256[PARAM_OMEGA_R];
__m256i bloc256[PARAM_OMEGA_R];
static __m256i posCmp256 = (__m256i) {
0UL, 1UL, 2UL, 3UL
};
#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256)

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
__m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0);
uint64_t bloc, pos, bit64;
uint8_t inc;
size_t i, j;

i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
tmp[i] = ((uint32_t) rand_bytes[j++]) << 16;
tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8;
tmp[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (tmp[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
tmp[i] = tmp[i] % PARAM_N;

inc = 1;
for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
if (tmp[k] == tmp[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
i += inc;
}

for (uint32_t i = 0; i < weight; i++) {
for (i = 0; i < weight; i++) {
// we store the bloc number and bit position of each vb[i]
uint64_t bloc = tmp[i] >> 6;
bloc = tmp[i] >> 6;
bloc256[i] = _mm256_set1_epi64x(bloc >> 2);
uint64_t pos = (bloc & 0x3UL);
pos = (bloc & 0x3UL);
__m256i pos256 = _mm256_set1_epi64x(pos);
__m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256);
uint64_t bit64 = 1ULL << (tmp[i] & 0x3f);
bit64 = 1ULL << (tmp[i] & 0x3f);
__m256i bloc256 = _mm256_set1_epi64x(bit64);
bit256[i] = bloc256 & mask256;
}

for (uint32_t i = 0; i < LOOP_SIZE; i++) {
for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) {
__m256i aux = _mm256_loadu_si256(((__m256i *)v) + i);
__m256i i256 = _mm256_set1_epi64x(i);

for (uint32_t j = 0; j < weight; j++) {
for (j = 0; j < weight; j++) {
__m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256);
aux ^= bit256[j] & mask256;
}
_mm256_storeu_si256(((__m256i *)v) + i, aux);
}

#undef LOOP_SIZE
}
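
The fixed-weight sampler is restructured rather than just reindented: the for loop that stepped backwards on a duplicate (exist = 1; ... i--) becomes a while loop that advances only past fresh values, candidates are written straight into tmp[i] instead of through the random_data temporary, and starting j at random_bytes_size makes the byte pool refill itself lazily through the existing seedexpander call. This avx2 version also replaces the static GNU-C compound literal (__m256i){0,1,2,3} with the portable _mm256_set_epi64x(3, 2, 1, 0) and drops the LOOP_SIZE macro for a direct CEIL_DIVIDE(PARAM_N, 256). The new control flow, sketched (next_24bit_candidate() is a hypothetical stand-in for the three rand_bytes reads plus the lazy refill):

    size_t i = 0;
    while (i < weight) {
        uint32_t cand;
        do {
            cand = next_24bit_candidate();            /* hypothetical helper */
        } while (cand >= UTILS_REJECTION_THRESHOLD);  /* rejection sampling  */
        tmp[i] = cand % PARAM_N;

        uint8_t inc = 1;
        for (size_t k = 0; k < i; k++) {  /* always scans all i previous slots */
            if (tmp[k] == tmp[i]) {
                inc = 0;                  /* duplicate: stay and redraw */
            }
        }
        i += inc;
    }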


@@ -182,10 +173,9 @@ uint8_t PQCLEAN_HQC128_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v2, u
* @param[in] size_v Integer that is the size of the input vector in bits
*/
void PQCLEAN_HQC128_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;
if (size_o < size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;

if (size_o % 64) {
val = 64 - (size_o % 64);
}


+26 -24  crypto_kem/hqc-128/clean/bch.c

@@ -28,12 +28,12 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma);
static void unpack_message(uint8_t *message_unpacked, const uint64_t *message) {
for (size_t i = 0; i < (VEC_K_SIZE_64 - (PARAM_K % 64 != 0)); ++i) {
for (size_t j = 0; j < 64; ++j) {
message_unpacked[j + 64 * i] = (message[i] >> j) & 1;
message_unpacked[j + 64 * i] = (message[i] >> j) & 0x0000000000000001;
}
}

for (int8_t j = 0; j < PARAM_K % 64; ++j) {
message_unpacked[j + 64 * (VEC_K_SIZE_64 - 1)] = (message[VEC_K_SIZE_64 - 1] >> j) & 1;
message_unpacked[j + 64 * (VEC_K_SIZE_64 - 1)] = (message[VEC_K_SIZE_64 - 1] >> j) & 0x0000000000000001;
}
}
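
unpack_message expands each packed 64-bit word of the message into one byte per bit. The two mask spellings in this hunk are interchangeable: 0x0000000000000001 has the same value as 1 (and, since the value fits in int, even the same type), so the longer literal only spells out the intended operand width visually:

    uint8_t bit = (uint8_t)((message[i] >> j) & 1);  /* identical to & 0x0000000000000001 */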

@@ -121,52 +121,54 @@ void PQCLEAN_HQC128_CLEAN_bch_code_encode(uint64_t *codeword, const uint64_t *me
* @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes
*/
static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) {
sigma[0] = 1;
size_t deg_sigma = 0;
size_t deg_sigma_p = 0;
uint16_t sigma_copy[PARAM_DELTA - 1] = {0};
size_t deg_sigma_copy = 0;
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1};
int32_t pp = -1; // 2*rho
uint16_t d_p = 1;
uint16_t d = syndromes[0];
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0};
uint16_t d_p, d, dd;
uint16_t mask;
int32_t pp; // 2*rho
size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p;

d = syndromes[0];
sigma[0] = 1;
X_sigma_p[1] = 1;
deg_sigma = 0;
deg_sigma_p = 0;
d_p = 1;
pp = -1;
for (size_t mu = 0; mu < PARAM_DELTA; ++mu) {
// Save sigma in case we need it to update X_sigma_p
memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1));
deg_sigma_copy = deg_sigma;

uint16_t dd = PQCLEAN_HQC128_CLEAN_gf_mul(d, PQCLEAN_HQC128_CLEAN_gf_inverse(d_p)); // 0 if(d == 0)
dd = PQCLEAN_HQC128_CLEAN_gf_mul(d, PQCLEAN_HQC128_CLEAN_gf_inverse(d_p)); // 0 if(d == 0)
for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) {
sigma[i] ^= PQCLEAN_HQC128_CLEAN_gf_mul(dd, X_sigma_p[i]);
}

size_t deg_X = 2 * mu - pp; // 2*(mu-rho)
size_t deg_X_sigma_p = deg_X + deg_sigma_p;
deg_X_sigma_p = 2 * mu - pp + deg_sigma_p;

// mask1 = 0xffff if(d != 0) and 0 otherwise
int16_t mask1 = -((uint16_t) - d >> 15);
// mask = 0xffff if(d != 0) and 0 otherwise
mask = -((uint16_t) - d >> 15);

// mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);
// mask2 &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);

// mask12 = 0xffff if the deg_sigma increased and 0 otherwise
int16_t mask12 = mask1 & mask2;
deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma);
deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p);

if (mu == PARAM_DELTA - 1) {
break;
}

// Update pp, d_p and X_sigma_p if needed
pp = (mask12 & (2 * mu)) ^ (~mask12 & pp);
d_p = (mask12 & d) ^ (~mask12 & d_p);
pp ^= mask & (pp ^ (2 * mu));
d_p ^= mask & (d_p ^ d);
for (size_t i = PARAM_DELTA - 1; i; --i) {
X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]);
X_sigma_p[i + 1] = X_sigma_p[i - 1];
X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]);
}
X_sigma_p[1] = 0;
X_sigma_p[0] = 0;
deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p);
deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy);

// Compute the next discrepancy
d = syndromes[2 * mu + 2];


+17 -58  crypto_kem/hqc-128/clean/vector.c

@@ -31,39 +31,33 @@
void PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint8_t exist = 0;
size_t j = 0;
uint8_t inc;
size_t i, j;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
v[i] = ((uint32_t) rand_bytes[j++]) << 16;
v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
v[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (v[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
v[i] = v[i] % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (v[k] == random_data) {
exist = 1;
inc = 1;
for (size_t k = 0; k < i; k++) {
if (v[k] == v[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
v[i] = random_data;
}
i += inc;
}
}

@@ -86,46 +80,11 @@ void PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_st
* @param[in] ctx Pointer to the context of the seed expander
*/
void PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {

size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
}
PQCLEAN_HQC128_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight);

for (uint16_t i = 0; i < weight; ++i) {
for (size_t i = 0; i < weight; ++i) {
int32_t index = tmp[i] / 64;
int32_t pos = tmp[i] % 64;
v[index] |= ((uint64_t) 1) << pos;
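
In the clean variants, vect_set_random_fixed_weight no longer carries its own copy of the sampling loop: it now delegates to vect_set_random_fixed_weight_by_coordinates and keeps only the coordinate-to-bitmap expansion shown above, which is what the +17/-58 counts on all six clean/vector.c files amount to, and presumably why test/duplicate_consistency/hqc-128_clean.yml (file 37 in the list) gains two entries. The surviving step, compactly:

    /* Set bit tmp[i] of the word array v. */
    for (size_t i = 0; i < weight; ++i) {
        v[tmp[i] / 64] |= (uint64_t)1 << (tmp[i] % 64);
    }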


+27 -23  crypto_kem/hqc-192/avx2/bch.c

@@ -35,52 +35,54 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma);
* @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes
*/
static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) {
sigma[0] = 1;
size_t deg_sigma = 0;
size_t deg_sigma_p = 0;
uint16_t sigma_copy[PARAM_DELTA - 1] = {0};
size_t deg_sigma_copy = 0;
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1};
int32_t pp = -1; // 2*rho
uint16_t d_p = 1;
uint16_t d = syndromes[0];
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0};
uint16_t d_p, d, dd;
uint16_t mask;
int32_t pp; // 2*rho
size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p;

d = syndromes[0];
sigma[0] = 1;
X_sigma_p[1] = 1;
deg_sigma = 0;
deg_sigma_p = 0;
d_p = 1;
pp = -1;
for (size_t mu = 0; mu < PARAM_DELTA; ++mu) {
// Save sigma in case we need it to update X_sigma_p
memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1));
deg_sigma_copy = deg_sigma;

uint16_t dd = PQCLEAN_HQC192_AVX2_gf_mul(d, PQCLEAN_HQC192_AVX2_gf_inverse(d_p)); // 0 if(d == 0)
dd = PQCLEAN_HQC192_AVX2_gf_mul(d, PQCLEAN_HQC192_AVX2_gf_inverse(d_p)); // 0 if(d == 0)
for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) {
sigma[i] ^= PQCLEAN_HQC192_AVX2_gf_mul(dd, X_sigma_p[i]);
}

size_t deg_X = 2 * mu - pp; // 2*(mu-rho)
size_t deg_X_sigma_p = deg_X + deg_sigma_p;
deg_X_sigma_p = 2 * mu - pp + deg_sigma_p;

// mask1 = 0xffff if(d != 0) and 0 otherwise
int16_t mask1 = -((uint16_t) - d >> 15);
// mask = 0xffff if(d != 0) and 0 otherwise
mask = -((uint16_t) - d >> 15);

// mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);
// mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);

// mask12 = 0xffff if the deg_sigma increased and 0 otherwise
int16_t mask12 = mask1 & mask2;
deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma);
deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p);

if (mu == PARAM_DELTA - 1) {
break;
}

// Update pp, d_p and X_sigma_p if needed
pp = (mask12 & (2 * mu)) ^ (~mask12 & pp);
d_p = (mask12 & d) ^ (~mask12 & d_p);
pp ^= mask & (pp ^ (2 * mu));
d_p ^= mask & (d_p ^ d);
for (size_t i = PARAM_DELTA - 1; i; --i) {
X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]);
X_sigma_p[i + 1] = X_sigma_p[i - 1];
X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]);
}
X_sigma_p[1] = 0;
X_sigma_p[0] = 0;
deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p);
deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy);

// Compute the next discrepancy
d = syndromes[2 * mu + 2];
@@ -145,6 +147,7 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) {
uint32_t *aux;
int16_t *alpha_tmp;
uint32_t i;
uint32_t nzflag;
// static variable so that it is stored in the DATA segment
// not in the STACK segment
static uint8_t tmp_array[PARAM_N1 + 4]; // +4 to control overflow due to management of 256 bits
@@ -169,7 +172,8 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) {
alpha_tmp = table_alpha_ij + (j << 4);

for (size_t i = 0; i < PARAM_N1; ++i) {
tmp_repeat = _mm256_set1_epi64x((long long)(tmp_array[i] != 0));
nzflag = ((-(int32_t) tmp_array[i]) >> 31) & 1;
tmp_repeat = _mm256_set1_epi64x(nzflag);
L = _mm256_cmpeq_epi64(tmp_repeat, un_256);
tmp_repeat = _mm256_lddqu_si256((__m256i *)(alpha_tmp + i * (PARAM_DELTA << 1)));
L = _mm256_and_si256(L, tmp_repeat);


+21 -20  crypto_kem/hqc-192/avx2/code.c

@@ -34,15 +34,18 @@ static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) {
* @param[in] m Pointer to an array that is the message
*/
void PQCLEAN_HQC192_AVX2_code_encode(uint64_t *em, const uint64_t *m) {
uint64_t res;
uint32_t i;
static const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFFFFFFFFUL, 0x3FFFFFFFFFFFFFFUL}};
const uint64_t mask[2][2] = {{0x0UL, 0x0UL}, {0x7FFFFFFFFFFFFFFUL, 0x3FFFFFFFFFFFFFFUL}};
size_t i, pos_r;
uint64_t bit;
uint16_t idx_r;
uint64_t select;


__m256i *colonne, y, aux0;
__m256i msg = _mm256_lddqu_si256((const __m256i *) m);
colonne = ((__m256i *) gen_matrix);

pos_r = 0;
for (i = 0; i < PARAM_N1 - PARAM_K; i++) {
// y is the and operation between m and ith column of G
y = _mm256_and_si256(colonne[i], msg);
@@ -54,34 +57,32 @@ void PQCLEAN_HQC192_AVX2_code_encode(uint64_t *em, const uint64_t *m) {
aux0 = _mm256_shuffle_epi32(y, 0x4e);
// y = (y0^y1^y2^y3 repeated 4 times)
y = _mm256_xor_si256(aux0, y);
res = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1;
bit = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1;


uint16_t pos_r = PARAM_N2 * i;
uint16_t idx_r = (pos_r & 0x3f);
uint64_t *p64 = em;
p64 += pos_r >> 6;
uint64_t select = mux(mask[0][0], mask[1][0], res);
*p64 ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], res);
*(p64 + 1) ^= select >> ((63 - idx_r));
idx_r = (pos_r & 0x3f);
select = mux(mask[0][0], mask[1][0], bit);
em[(pos_r >> 6) + 0] ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], bit);
em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r));
pos_r += PARAM_N2;
}

/* now we add the message m */
/* systematic encoding */
pos_r = PARAM_N2 * (PARAM_N1 - PARAM_K);
for (int32_t i = 0; i < 4; i++) {
for (int32_t j = 0; j < 64; j++) {
uint8_t bit = (m[i] >> j) & 0x1;
uint32_t pos_r = PARAM_N2 * ((PARAM_N1 - PARAM_K) + ((i << 6) + j));
uint16_t idx_r = (pos_r & 0x3f);
uint64_t *p64 = em;
bit = (m[i] >> j) & 0x1;


p64 += pos_r >> 6;
uint64_t select = mux(mask[0][0], mask[1][0], bit);
*p64 ^= select << idx_r;
idx_r = (pos_r & 0x3f);
select = mux(mask[0][0], mask[1][0], bit);
em[(pos_r >> 6) + 0] ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], bit);
*(p64 + 1) ^= select >> ((63 - idx_r));
em[(pos_r >> 6) + 1] ^= select >> ((63 - idx_r));

pos_r += PARAM_N2;
}
}



+59 -57  crypto_kem/hqc-192/avx2/gf2x.c

@@ -188,23 +188,24 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] B Pointer to the polynomial B(x)
*/
static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
int32_t i, is, is2, is3;
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];

karat_mult_4( D0, A, B);
karat_mult_4(D2, A + 4, B + 4);

for (int32_t i = 0; i < 4; i++) {
int is = i + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_4(D1, SAA, SBB);

for (int32_t i = 0; i < 4; i++) {
int32_t is = i + 4;
int32_t is2 = is + 4;
int32_t is3 = is2 + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
is2 = is + 4;
is3 = is2 + 4;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -228,22 +229,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
int32_t i, is, is2, is3;

karat_mult_8( D0, A, B);
karat_mult_8(D2, A + 8, B + 8);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_8( D1, SAA, SBB);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
int32_t is2 = is + 8;
int32_t is3 = is2 + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
is2 = is + 8;
is3 = is2 + 8;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -267,22 +269,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
int32_t i, is, is2, is3;

karat_mult_16( D0, A, B);
karat_mult_16(D2, A + 16, B + 16);

for (int32_t i = 0; i < 16; i++) {
int is = i + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_16( D1, SAA, SBB);

for (int32_t i = 0; i < 16; i++) {
int32_t is = i + 16;
int32_t is2 = is + 16;
int32_t is3 = is2 + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
is2 = is + 16;
is3 = is2 + 16;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -306,21 +309,22 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[64], D1[64], D2[64], SAA[32], SBB[32];
int32_t i, is, is2, is3;

karat_mult_32( D0, A, B);
karat_mult_32(D2, A + 32, B + 32);
for (int32_t i = 0; i < 32; i++) {
int32_t is = i + 32;
for (i = 0; i < 32; i++) {
is = i + 32;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_32( D1, SAA, SBB);

for (int32_t i = 0; i < 32; i++) {
int32_t is = i + 32;
int32_t is2 = is + 32;
int32_t is3 = is2 + 32;
for (i = 0; i < 32; i++) {
is = i + 32;
is2 = is + 32;
is3 = is2 + 32;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -369,11 +373,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = _mm256_setzero_si256();
int64_t *U1_64;
int64_t *U2_64;
int64_t *V1_64;
int64_t *V2_64;
int32_t T2 = T_TM3_3W_64 << 1;
int32_t i, i4, i41, i42;

for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) {
int32_t i4 = i << 2;
int32_t i42 = i4 - 2;
for (i = 0; i < T_TM3_3W_256 - 1; i++) {
i4 = i << 2;
i42 = i4 - 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64]));
@@ -382,9 +391,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4]));
}

for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
int32_t i4 = i << 2;
int32_t i41 = i4 + 1;
for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
i41 = i4 + 1;
U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
@@ -397,7 +406,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty)
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}
@@ -406,23 +415,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
karat_mult_64( W1, W2, W3);

//W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
int64_t *U1_64 = ((int64_t *) U1);
int64_t *U2_64 = ((int64_t *) U2);

int64_t *V1_64 = ((int64_t *) V1);
int64_t *V2_64 = ((int64_t *) V2);

W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

U1_64 = ((int64_t *) U1);
U2_64 = ((int64_t *) U2);

V1_64 = ((int64_t *) V1);
V2_64 = ((int64_t *) V2);

for (int32_t i = 1; i < T_TM3_3W_256; i++) {
int i4 = i << 2;
W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

for (i = 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1]));
W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2]));

@@ -431,21 +434,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
}

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}


karat_mult_64(tmp, W3, W2);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] = tmp[i];
}

@@ -457,20 +459,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Interpolation phase
// 9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x -> x = X^64
U1_64 = ((int64_t *) W2);
U2_64 = ((int64_t *) W0);
for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) {
int32_t i4 = i << 2;
for (i = 0; i < (T_TM3_3W_256 << 1); i++) {
i4 = i << 2;
W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1]));
W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1]));
}
@@ -480,7 +482,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);

for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
}

@@ -494,7 +496,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = (int64_t *) W1;
__m256i *U2_256 = (__m256i *) (U2_64 + 1);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]);
}

@@ -502,19 +504,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W3[2 * (T_TM3_3W_256) - 1] = zero;

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W2[i] ^ W4[i];
}

//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256)
for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
ro256[i] = W0[i];
ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i];
ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i];
@@ -530,12 +532,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]);
U2_256 = (__m256i *) (U2_64 - 2);

for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) {
for (i = 0; i < T_TM3_3W_256 << 1; i++) {
_mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i]));
_mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
}

for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}


+25 -35  crypto_kem/hqc-192/avx2/vector.c

@@ -32,72 +32,63 @@
void PQCLEAN_HQC192_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0};
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;
__m256i bit256[PARAM_OMEGA_R];
__m256i bloc256[PARAM_OMEGA_R];
static __m256i posCmp256 = (__m256i) {
0UL, 1UL, 2UL, 3UL
};
#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256)

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
__m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0);
uint64_t bloc, pos, bit64;
uint8_t inc;
size_t i, j;

i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
tmp[i] = ((uint32_t) rand_bytes[j++]) << 16;
tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8;
tmp[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (tmp[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
tmp[i] = tmp[i] % PARAM_N;

inc = 1;
for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
if (tmp[k] == tmp[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
i += inc;
}

for (uint32_t i = 0; i < weight; i++) {
for (i = 0; i < weight; i++) {
// we store the bloc number and bit position of each vb[i]
uint64_t bloc = tmp[i] >> 6;
bloc = tmp[i] >> 6;
bloc256[i] = _mm256_set1_epi64x(bloc >> 2);
uint64_t pos = (bloc & 0x3UL);
pos = (bloc & 0x3UL);
__m256i pos256 = _mm256_set1_epi64x(pos);
__m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256);
uint64_t bit64 = 1ULL << (tmp[i] & 0x3f);
bit64 = 1ULL << (tmp[i] & 0x3f);
__m256i bloc256 = _mm256_set1_epi64x(bit64);
bit256[i] = bloc256 & mask256;
}

for (uint32_t i = 0; i < LOOP_SIZE; i++) {
for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) {
__m256i aux = _mm256_loadu_si256(((__m256i *)v) + i);
__m256i i256 = _mm256_set1_epi64x(i);

for (uint32_t j = 0; j < weight; j++) {
for (j = 0; j < weight; j++) {
__m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256);
aux ^= bit256[j] & mask256;
}
_mm256_storeu_si256(((__m256i *)v) + i, aux);
}

#undef LOOP_SIZE
}


@@ -182,10 +173,9 @@ uint8_t PQCLEAN_HQC192_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v2, u
* @param[in] size_v Integer that is the size of the input vector in bits
*/
void PQCLEAN_HQC192_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;
if (size_o < size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;

if (size_o % 64) {
val = 64 - (size_o % 64);
}


+24 -22  crypto_kem/hqc-192/clean/bch.c

@@ -121,52 +121,54 @@ void PQCLEAN_HQC192_CLEAN_bch_code_encode(uint64_t *codeword, const uint64_t *me
* @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes
*/
static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) {
sigma[0] = 1;
size_t deg_sigma = 0;
size_t deg_sigma_p = 0;
uint16_t sigma_copy[PARAM_DELTA - 1] = {0};
size_t deg_sigma_copy = 0;
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1};
int32_t pp = -1; // 2*rho
uint16_t d_p = 1;
uint16_t d = syndromes[0];
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0};
uint16_t d_p, d, dd;
uint16_t mask;
int32_t pp; // 2*rho
size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p;

d = syndromes[0];
sigma[0] = 1;
X_sigma_p[1] = 1;
deg_sigma = 0;
deg_sigma_p = 0;
d_p = 1;
pp = -1;
for (size_t mu = 0; mu < PARAM_DELTA; ++mu) {
// Save sigma in case we need it to update X_sigma_p
memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1));
deg_sigma_copy = deg_sigma;

uint16_t dd = PQCLEAN_HQC192_CLEAN_gf_mul(d, PQCLEAN_HQC192_CLEAN_gf_inverse(d_p)); // 0 if(d == 0)
dd = PQCLEAN_HQC192_CLEAN_gf_mul(d, PQCLEAN_HQC192_CLEAN_gf_inverse(d_p)); // 0 if(d == 0)
for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) {
sigma[i] ^= PQCLEAN_HQC192_CLEAN_gf_mul(dd, X_sigma_p[i]);
}

size_t deg_X = 2 * mu - pp; // 2*(mu-rho)
size_t deg_X_sigma_p = deg_X + deg_sigma_p;
deg_X_sigma_p = 2 * mu - pp + deg_sigma_p;

// mask1 = 0xffff if(d != 0) and 0 otherwise
int16_t mask1 = -((uint16_t) - d >> 15);
// mask = 0xffff if(d != 0) and 0 otherwise
mask = -((uint16_t) - d >> 15);

// mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);
// mask2 &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);

// mask12 = 0xffff if the deg_sigma increased and 0 otherwise
int16_t mask12 = mask1 & mask2;
deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma);
deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p);

if (mu == PARAM_DELTA - 1) {
break;
}

// Update pp, d_p and X_sigma_p if needed
pp = (mask12 & (2 * mu)) ^ (~mask12 & pp);
d_p = (mask12 & d) ^ (~mask12 & d_p);
pp ^= mask & (pp ^ (2 * mu));
d_p ^= mask & (d_p ^ d);
for (size_t i = PARAM_DELTA - 1; i; --i) {
X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]);
X_sigma_p[i + 1] = X_sigma_p[i - 1];
X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]);
}
X_sigma_p[1] = 0;
X_sigma_p[0] = 0;
deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p);
deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy);

// Compute the next discrepancy
d = syndromes[2 * mu + 2];


+17 -58  crypto_kem/hqc-192/clean/vector.c

@@ -31,39 +31,33 @@
void PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint8_t exist = 0;
size_t j = 0;
uint8_t inc;
size_t i, j;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
v[i] = ((uint32_t) rand_bytes[j++]) << 16;
v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
v[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (v[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
v[i] = v[i] % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (v[k] == random_data) {
exist = 1;
inc = 1;
for (size_t k = 0; k < i; k++) {
if (v[k] == v[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
v[i] = random_data;
}
i += inc;
}
}

@@ -86,46 +80,11 @@ void PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_st
* @param[in] ctx Pointer to the context of the seed expander
*/
void PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {

size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
}
PQCLEAN_HQC192_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight);

for (uint16_t i = 0; i < weight; ++i) {
for (size_t i = 0; i < weight; ++i) {
int32_t index = tmp[i] / 64;
int32_t pos = tmp[i] % 64;
v[index] |= ((uint64_t) 1) << pos;


+27 -23  crypto_kem/hqc-256/avx2/bch.c

@@ -35,52 +35,54 @@ static void compute_roots(uint64_t *error, const uint16_t *sigma);
* @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes
*/
static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) {
sigma[0] = 1;
size_t deg_sigma = 0;
size_t deg_sigma_p = 0;
uint16_t sigma_copy[PARAM_DELTA - 1] = {0};
size_t deg_sigma_copy = 0;
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1};
int32_t pp = -1; // 2*rho
uint16_t d_p = 1;
uint16_t d = syndromes[0];
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0};
uint16_t d_p, d, dd;
uint16_t mask;
int32_t pp; // 2*rho
size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p;

d = syndromes[0];
sigma[0] = 1;
X_sigma_p[1] = 1;
deg_sigma = 0;
deg_sigma_p = 0;
d_p = 1;
pp = -1;
for (size_t mu = 0; mu < PARAM_DELTA; ++mu) {
// Save sigma in case we need it to update X_sigma_p
memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1));
deg_sigma_copy = deg_sigma;

uint16_t dd = PQCLEAN_HQC256_AVX2_gf_mul(d, PQCLEAN_HQC256_AVX2_gf_inverse(d_p)); // 0 if(d == 0)
dd = PQCLEAN_HQC256_AVX2_gf_mul(d, PQCLEAN_HQC256_AVX2_gf_inverse(d_p)); // 0 if(d == 0)
for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) {
sigma[i] ^= PQCLEAN_HQC256_AVX2_gf_mul(dd, X_sigma_p[i]);
}

size_t deg_X = 2 * mu - pp; // 2*(mu-rho)
size_t deg_X_sigma_p = deg_X + deg_sigma_p;
deg_X_sigma_p = 2 * mu - pp + deg_sigma_p;

// mask1 = 0xffff if(d != 0) and 0 otherwise
int16_t mask1 = -((uint16_t) - d >> 15);
// mask = 0xffff if(d != 0) and 0 otherwise
mask = -((uint16_t) - d >> 15);

// mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);
// mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);

// mask12 = 0xffff if the deg_sigma increased and 0 otherwise
int16_t mask12 = mask1 & mask2;
deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma);
deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p);

if (mu == PARAM_DELTA - 1) {
break;
}

// Update pp, d_p and X_sigma_p if needed
pp = (mask12 & (2 * mu)) ^ (~mask12 & pp);
d_p = (mask12 & d) ^ (~mask12 & d_p);
pp ^= mask & (pp ^ (2 * mu));
d_p ^= mask & (d_p ^ d);
for (size_t i = PARAM_DELTA - 1; i; --i) {
X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]);
X_sigma_p[i + 1] = X_sigma_p[i - 1];
X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]);
}
X_sigma_p[1] = 0;
X_sigma_p[0] = 0;
deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p);
deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy);

// Compute the next discrepancy
d = syndromes[2 * mu + 2];
@@ -145,6 +147,7 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) {
uint32_t *aux;
int16_t *alpha_tmp;
uint32_t i;
uint32_t nzflag;
// static variable so that it is stored in the DATA segment
// not in the STACK segment
static uint8_t tmp_array[PARAM_N1 + 4]; // +4 to control overflow due to management of 256 bits
@@ -169,7 +172,8 @@ void compute_syndromes(__m256i *syndromes, const uint64_t *rcv) {
alpha_tmp = table_alpha_ij + (j << 4);

for (size_t i = 0; i < PARAM_N1; ++i) {
tmp_repeat = _mm256_set1_epi64x((long long)(tmp_array[i] != 0));
nzflag = ((-(int32_t) tmp_array[i]) >> 31) & 1;
tmp_repeat = _mm256_set1_epi64x(nzflag);
L = _mm256_cmpeq_epi64(tmp_repeat, un_256);
tmp_repeat = _mm256_lddqu_si256((__m256i *)(alpha_tmp + i * (PARAM_DELTA << 1)));
L = _mm256_and_si256(L, tmp_repeat);


+33 -33  crypto_kem/hqc-256/avx2/code.c

@@ -34,15 +34,19 @@ static inline uint64_t mux(uint64_t a, uint64_t b, int64_t bit) {
* @param[in] m Pointer to an array that is the message
*/
void PQCLEAN_HQC256_AVX2_code_encode(uint64_t *em, const uint64_t *m) {
uint64_t res;
uint32_t i;
static const uint64_t mask[2][3] = {{0x0UL, 0x0UL, 0x0UL}, {0xFFFFFFFFFFFFFFFFUL, 0xFFFFFFFFFFFFFFFFUL, 0x3FFFFFUL}};
const uint64_t mask[2][3] = {{0x0UL, 0x0UL, 0x0UL}, {0xFFFFFFFFFFFFFFFFUL, 0xFFFFFFFFFFFFFFFFUL, 0x3FFFFFUL}};
size_t i, pos_r;
uint64_t bit;
uint64_t idx_r;
uint64_t idx_2;
uint64_t select;


__m256i *colonne, y, aux0;
__m256i msg = _mm256_lddqu_si256((const __m256i *) m);
colonne = ((__m256i *) gen_matrix);

pos_r = 0;
for (i = 0; i < PARAM_N1 - PARAM_K; i++) {
// y is the and operation between m and ith column of G
y = _mm256_and_si256(colonne[i], msg);
@@ -54,44 +58,40 @@ void PQCLEAN_HQC256_AVX2_code_encode(uint64_t *em, const uint64_t *m) {
aux0 = _mm256_shuffle_epi32(y, 0x4e);
// y = (y0^y1^y2^y3 repeated 4 times)
y = _mm256_xor_si256(aux0, y);
res = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1;


uint16_t pos_r = PARAM_N2 * i;
uint16_t idx_r = (pos_r & 0x3f);
uint64_t *p64 = em;
p64 += pos_r >> 6;
uint64_t select = mux(mask[0][0], mask[1][0], res);
*p64 ^= select << idx_r;
int64_t aux = (41 - idx_r);
uint64_t aux2 = (aux > 0);
uint64_t idx2 = aux * aux2;
select = mux(mask[0][1], mask[1][1], res);
*(p64 + 1) ^= select >> idx2;
select = mux(mask[0][2], mask[1][2], res);
*(p64 + 2) ^= select >> ((63 - idx_r));
bit = _mm_popcnt_u64(_mm256_extract_epi64(y, 0)) & 1;


idx_r = (pos_r & 0x3f);
idx_2 = 41 - idx_r;
idx_2 &= (uint64_t) (-((int64_t)idx_2) >> 63);
select = mux(mask[0][0], mask[1][0], bit);
em[(pos_r >> 6) + 0] ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], bit);
em[(pos_r >> 6) + 1] ^= select >> idx_2;
select = mux(mask[0][2], mask[1][2], bit);
em[(pos_r >> 6) + 2] ^= select >> ((63 - idx_r));
pos_r += PARAM_N2;
}

/* now we add the message m */
/* systematic encoding */
pos_r = PARAM_N2 * (PARAM_N1 - PARAM_K);
for (int32_t i = 0; i < 4; i++) {
for (int32_t j = 0; j < 64; j++) {
uint8_t bit = (m[i] >> j) & 0x1;
uint32_t pos_r = PARAM_N2 * ((PARAM_N1 - PARAM_K) + ((i << 6) + j));
uint16_t idx_r = (pos_r & 0x3f);
uint64_t *p64 = em;


p64 += pos_r >> 6;
uint64_t select = mux(mask[0][0], mask[1][0], bit);
*p64 ^= select << idx_r;
int64_t aux = (41 - idx_r);
uint64_t aux2 = (aux > 0);
uint64_t idx2 = aux * aux2;
bit = (m[i] >> j) & 0x1;


idx_r = (pos_r & 0x3f);
idx_2 = 41 - idx_r;
idx_2 &= (uint64_t) (-((int64_t)idx_2) >> 63);
select = mux(mask[0][0], mask[1][0], bit);
em[(pos_r >> 6) + 0] ^= select << idx_r;
select = mux(mask[0][1], mask[1][1], bit);
*(p64 + 1) ^= select >> idx2;
em[(pos_r >> 6) + 1] ^= select >> idx_2;
select = mux(mask[0][2], mask[1][2], bit);
*(p64 + 2) ^= select >> ((63 - idx_r));
em[(pos_r >> 6) + 2] ^= select >> ((63 - idx_r));

pos_r += PARAM_N2;
}
}
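
hqc-256 is the one parameter set here whose repetition run can span three 64-bit words (mask[1] has three entries), so the middle word needs a shift of max(41 - idx_r, 0). The old code built that with a comparison and a multiplication (aux2 = (aux > 0); idx2 = aux * aux2); the new code clamps with a sign mask, assuming two's-complement narrowing and arithmetic right shift (both hold on the compilers PQClean targets):

    #include <stdint.h>

    /* max(41 - idx_r, 0) without a branch: if 41 - idx_r wrapped around,
     * (int64_t)idx_2 is negative, its negation is positive, the shift
     * yields 0 and the AND clears idx_2; otherwise the shift yields
     * all-ones and idx_2 passes through unchanged. */
    static uint64_t middle_shift(uint64_t idx_r) {
        uint64_t idx_2 = 41 - idx_r;                    /* may wrap mod 2^64 */
        idx_2 &= (uint64_t)(-((int64_t)idx_2) >> 63);
        return idx_2;
    }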



+67 -66  crypto_kem/hqc-256/avx2/gf2x.c

@@ -232,23 +232,24 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] B Pointer to the polynomial B(x)
*/
static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
int32_t i, is, is2, is3;
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];

karat_mult_8( D0, A, B);
karat_mult_8(D2, A + 8, B + 8);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_8( D1, SAA, SBB);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
int32_t is2 = is + 8;
int32_t is3 = is2 + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
is2 = is + 8;
is3 = is2 + 8;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -272,22 +273,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
int32_t i, is, is2, is3;

karat_mult_16( D0, A, B);
karat_mult_16(D2, A + 16, B + 16);

for (int32_t i = 0; i < 16; i++) {
int is = i + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_16( D1, SAA, SBB);

for (int32_t i = 0; i < 16; i++) {
int32_t is = i + 16;
int32_t is2 = is + 16;
int32_t is3 = is2 + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
is2 = is + 16;
is3 = is2 + 16;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -299,7 +301,6 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
}



/**
* @brief Compute B(x) = A(x)/(x+1)
*
@@ -336,11 +337,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = _mm256_setzero_si256();
int64_t *U1_64;
int64_t *U2_64;
int64_t *V1_64;
int64_t *V2_64;
int32_t T2 = T_TM3_3W_64 << 1;
int32_t i, i4, i41, i42;

for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) {
int32_t i4 = i << 2;
int32_t i42 = i4 - 2;
for (i = 0; i < T_TM3_3W_256 - 1; i++) {
i4 = i << 2;
i42 = i4 - 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64]));
@@ -349,9 +355,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4]));
}

for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
int32_t i4 = i << 2;
int32_t i41 = i4 + 1;
for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
i41 = i4 + 1;
U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
@@ -364,7 +370,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty)
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}
@@ -373,23 +379,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
karat_mult_32( W1, W2, W3);

//W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
int64_t *U1_64 = ((int64_t *) U1);
int64_t *U2_64 = ((int64_t *) U2);

int64_t *V1_64 = ((int64_t *) V1);
int64_t *V2_64 = ((int64_t *) V2);

W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

U1_64 = ((int64_t *) U1);
U2_64 = ((int64_t *) U2);

V1_64 = ((int64_t *) V1);
V2_64 = ((int64_t *) V2);

for (int32_t i = 1; i < T_TM3_3W_256; i++) {
int i4 = i << 2;
W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

for (i = 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1]));
W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2]));

@@ -398,13 +398,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
}

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}
@@ -412,7 +412,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W3 = W3 * W2 ; W2 = W0 * W4
karat_mult_32(tmp, W3, W2);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] = tmp[i];
}

@@ -424,20 +424,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Interpolation phase
// 9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x -> x = X^64
U1_64 = ((int64_t *) W2);
U2_64 = ((int64_t *) W0);
for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) {
int32_t i4 = i << 2;
for (i = 0; i < (T_TM3_3W_256 << 1); i++) {
i4 = i << 2;
W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1]));
W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1]));
}
@@ -447,7 +447,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);

for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
}

@@ -461,7 +461,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = (int64_t *) W1;
__m256i *U2_256 = (__m256i *) (U2_64 + 1);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]);
}

@@ -469,19 +469,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W3[2 * (T_TM3_3W_256) - 1] = zero;

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W2[i] ^ W4[i];
}

//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256)
for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
ro256[i] = W0[i];
ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i];
ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i];
@@ -497,12 +497,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]);
U2_256 = (__m256i *) (U2_64 - 2);

for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) {
for (i = 0; i < T_TM3_3W_256 << 1; i++) {
_mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i]));
_mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
}

for (int32_t i = 0; i < 6 * T_TM3_3W_256 - 2; i++) {
for (i = 0; i < 6 * T_TM3_3W_256 - 2; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}
@@ -541,9 +541,10 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i ro256[tTM3R / 2];
const __m256i zero = _mm256_setzero_si256();
int32_t T2 = T_TM3R_3W_64 << 1;
int32_t i, i1, i4;

for (int32_t i = 0; i < T_TM3R_3W_256; i++) {
int32_t i4 = i << 2;
for (i = 0; i < T_TM3R_3W_256; i++) {
i4 = i << 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4 + T_TM3R_3W_64]));
@@ -552,7 +553,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2]));
}

for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
U0[i] = zero;
V0[i] = zero;
U1[i] = zero;
@@ -566,12 +567,12 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0

for (int32_t i = 0; i < T_TM3R_3W_256; i++) {
for (i = 0; i < T_TM3R_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}

for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
W2[i] = zero;
W3[i] = zero;
}
@@ -584,7 +585,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W0[1] = U1[0];
W4[1] = V1[0];

for (int32_t i = 1; i < T_TM3R_3W_256 + 1; i++) {
for (i = 1; i < T_TM3R_3W_256 + 1; i++) {
W0[i + 1] = U1[i] ^ U2[i - 1];
W4[i + 1] = V1[i] ^ V2[i - 1];
}
@@ -593,13 +594,13 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W4[T_TM3R_3W_256 + 1] = V2[T_TM3R_3W_256 - 1];

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) {
for (i = 0; i < T_TM3R_3W_256 + 2; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) {
for (i = 0; i < T_TM3R_3W_256 + 2; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}
@@ -607,7 +608,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W3 = W3 * W2 ; W2 = W0 * W4
TOOM3Mult(tmp, (uint64_t *) W3, (uint64_t *) W2);

for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W3[i] = tmp[i];
}

@@ -621,25 +622,25 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//9 add, 1 shift, 1 Smul, 2 Sdiv (2n)

//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
int32_t i1 = i + 1;
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
i1 = i + 1;
W2[i] = W2[i1] ^ W0[i1];
}

W2[2 * (T_TM3R_3W_256 + 2) - 1] = zero;

//W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i];
}

@@ -647,15 +648,15 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
tmp[2 * (T_TM3R_3W_256 + 2) + 1] = zero;
tmp[2 * (T_TM3R_3W_256 + 2) + 2] = zero;

for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256); i++) {
tmp[i + 3] ^= W4[i];
}

divByXplus1_256(W2, tmp, T_TM3R_3W_256);

//W3 =(W3 + W1)/(x*(x+1))
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
int32_t i1 = i + 1;
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
i1 = i + 1;
tmp[i] = W3[i1] ^ W1[i1];
}

@@ -663,18 +664,18 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
divByXplus1_256(W3, tmp, T_TM3R_3W_256);

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W1[i] ^= W2[i] ^ W4[i];
}
//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256+2)
for (int32_t i = 0; i < T_TM3R_3W_256; i++) {
for (i = 0; i < T_TM3R_3W_256; i++) {
ro256[i] = W0[i];
ro256[i + T_TM3R_3W_256] = W0[i + T_TM3R_3W_256] ^ W1[i];
ro256[i + 2 * T_TM3R_3W_256] = W1[i + T_TM3R_3W_256] ^ W2[i];
@@ -696,7 +697,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
ro256[3 + 5 * T_TM3R_3W_256] ^= W3[3 + 2 * T_TM3R_3W_256];


for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}
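
TOOM3RecMult interpolates at x = X^64, and two of its steps divide by x + 1; divByXplus1_256 implements, presumably four limbs per __m256i, the limb recurrence that exact division by X^64 + 1 satisfies over GF(2). A scalar sketch of that recurrence (the function name is hypothetical):

#include <stdint.h>
#include <stddef.h>

/* Exact division by X^64 + 1 over GF(2)[X], one 64-bit limb at a time.
   From A = B * (X^64 + 1) we get A_i = B_i ^ B_(i-1), so B is the
   running XOR of A's limbs. */
static void div_by_x64_plus_1(uint64_t *b, const uint64_t *a, size_t limbs) {
    uint64_t prev = 0; /* B_(-1) = 0 */
    for (size_t i = 0; i < limbs; i++) {
        prev ^= a[i]; /* prev is now B_i = A_i ^ B_(i-1) */
        b[i] = prev;
    }
}

The division is exact by construction of the interpolation identities, so no remainder ever needs checking.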


+ 25
- 35
crypto_kem/hqc-256/avx2/vector.c Show file

@@ -32,72 +32,63 @@
void PQCLEAN_HQC256_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0};
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;
__m256i bit256[PARAM_OMEGA_R];
__m256i bloc256[PARAM_OMEGA_R];
static __m256i posCmp256 = (__m256i) {
0UL, 1UL, 2UL, 3UL
};
#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256)

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
__m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0);
uint64_t bloc, pos, bit64;
uint8_t inc;
size_t i, j;

i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
tmp[i] = ((uint32_t) rand_bytes[j++]) << 16;
tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8;
tmp[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (tmp[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
tmp[i] = tmp[i] % PARAM_N;

inc = 1;
for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
if (tmp[k] == tmp[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
i += inc;
}

for (uint32_t i = 0; i < weight; i++) {
for (i = 0; i < weight; i++) {
// we store the bloc number and bit position of each vb[i]
uint64_t bloc = tmp[i] >> 6;
bloc = tmp[i] >> 6;
bloc256[i] = _mm256_set1_epi64x(bloc >> 2);
uint64_t pos = (bloc & 0x3UL);
pos = (bloc & 0x3UL);
__m256i pos256 = _mm256_set1_epi64x(pos);
__m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256);
uint64_t bit64 = 1ULL << (tmp[i] & 0x3f);
bit64 = 1ULL << (tmp[i] & 0x3f);
__m256i bloc256 = _mm256_set1_epi64x(bit64);
bit256[i] = bloc256 & mask256;
}

for (uint32_t i = 0; i < LOOP_SIZE; i++) {
for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) {
__m256i aux = _mm256_loadu_si256(((__m256i *)v) + i);
__m256i i256 = _mm256_set1_epi64x(i);

for (uint32_t j = 0; j < weight; j++) {
for (j = 0; j < weight; j++) {
__m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256);
aux ^= bit256[j] & mask256;
}
_mm256_storeu_si256(((__m256i *)v) + i, aux);
}

#undef LOOP_SIZE
}


@@ -182,10 +173,9 @@ uint8_t PQCLEAN_HQC256_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v2, u
* @param[in] size_v Integer that is the size of the input vector in bits
*/
void PQCLEAN_HQC256_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;
if (size_o < size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;

if (size_o % 64) {
val = 64 - (size_o % 64);
}
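
The rewritten sampler draws 24-bit values and rejects any at or above UTILS_REJECTION_THRESHOLD, presumably the largest multiple of PARAM_N that fits in 24 bits, so the final % PARAM_N is unbiased; it also swaps the old i-- backtracking for an inc flag that redraws the current slot when a duplicate shows up. A sketch of that control flow, with a hypothetical next24() in place of the buffered seedexpander reads:

#include <stdint.h>
#include <stddef.h>

/* Sample `weight` distinct positions in [0, n), rejection-sampled so
   the modular reduction introduces no bias. */
static void sample_distinct(uint32_t *out, size_t weight, uint32_t n,
                            uint32_t (*next24)(void)) {
    uint32_t thresh = (0x1000000u / n) * n; /* reject above the largest multiple of n */
    size_t i = 0;
    while (i < weight) {
        uint32_t x;
        do {
            x = next24(); /* fresh 24-bit value */
        } while (x >= thresh);
        out[i] = x % n;
        uint8_t inc = 1;
        for (size_t k = 0; k < i; k++) { /* duplicate scan, fixed shape */
            if (out[k] == out[i]) {
                inc = 0; /* stay on this slot and redraw */
            }
        }
        i += inc;
    }
}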


+ 24
- 22
crypto_kem/hqc-256/clean/bch.c Show file

@@ -121,52 +121,54 @@ void PQCLEAN_HQC256_CLEAN_bch_code_encode(uint64_t *codeword, const uint64_t *me
* @param[in] syndromes Array of size (at least) 2*PARAM_DELTA storing the syndromes
*/
static size_t compute_elp(uint16_t *sigma, const uint16_t *syndromes) {
sigma[0] = 1;
size_t deg_sigma = 0;
size_t deg_sigma_p = 0;
uint16_t sigma_copy[PARAM_DELTA - 1] = {0};
size_t deg_sigma_copy = 0;
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0, 1};
int32_t pp = -1; // 2*rho
uint16_t d_p = 1;
uint16_t d = syndromes[0];
uint16_t X_sigma_p[PARAM_DELTA + 1] = {0};
uint16_t d_p, d, dd;
uint16_t mask;
int32_t pp; // 2*rho
size_t deg_sigma, deg_sigma_p, deg_sigma_copy, deg_X_sigma_p;

d = syndromes[0];
sigma[0] = 1;
X_sigma_p[1] = 1;
deg_sigma = 0;
deg_sigma_p = 0;
d_p = 1;
pp = -1;
for (size_t mu = 0; mu < PARAM_DELTA; ++mu) {
// Save sigma in case we need it to update X_sigma_p
memcpy(sigma_copy, sigma, 2 * (PARAM_DELTA - 1));
deg_sigma_copy = deg_sigma;

uint16_t dd = PQCLEAN_HQC256_CLEAN_gf_mul(d, PQCLEAN_HQC256_CLEAN_gf_inverse(d_p)); // 0 if(d == 0)
dd = PQCLEAN_HQC256_CLEAN_gf_mul(d, PQCLEAN_HQC256_CLEAN_gf_inverse(d_p)); // 0 if(d == 0)
for (size_t i = 1; (i <= 2 * mu + 1) && (i <= PARAM_DELTA); ++i) {
sigma[i] ^= PQCLEAN_HQC256_CLEAN_gf_mul(dd, X_sigma_p[i]);
}

size_t deg_X = 2 * mu - pp; // 2*(mu-rho)
size_t deg_X_sigma_p = deg_X + deg_sigma_p;
deg_X_sigma_p = 2 * mu - pp + deg_sigma_p;

// mask1 = 0xffff if(d != 0) and 0 otherwise
int16_t mask1 = -((uint16_t) - d >> 15);
// mask = 0xffff if(d != 0) and 0 otherwise
mask = -((uint16_t) - d >> 15);

// mask2 = 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
int16_t mask2 = -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);
// mask &= 0xffff if(deg_X_sigma_p > deg_sigma) and 0 otherwise
mask &= -((uint16_t) (deg_sigma - deg_X_sigma_p) >> 15);

// mask12 = 0xffff if the deg_sigma increased and 0 otherwise
int16_t mask12 = mask1 & mask2;
deg_sigma = (mask12 & deg_X_sigma_p) ^ (~mask12 & deg_sigma);
deg_sigma ^= mask & (deg_sigma ^ deg_X_sigma_p);

if (mu == PARAM_DELTA - 1) {
break;
}

// Update pp, d_p and X_sigma_p if needed
pp = (mask12 & (2 * mu)) ^ (~mask12 & pp);
d_p = (mask12 & d) ^ (~mask12 & d_p);
pp ^= mask & (pp ^ (2 * mu));
d_p ^= mask & (d_p ^ d);
for (size_t i = PARAM_DELTA - 1; i; --i) {
X_sigma_p[i + 1] = (mask12 & sigma_copy[i - 1]) ^ (~mask12 & X_sigma_p[i - 1]);
X_sigma_p[i + 1] = X_sigma_p[i - 1];
X_sigma_p[i + 1] ^= mask & (X_sigma_p[i + 1] ^ sigma_copy[i - 1]);
}
X_sigma_p[1] = 0;
X_sigma_p[0] = 0;
deg_sigma_p = (mask12 & deg_sigma_copy) ^ (~mask12 & deg_sigma_p);
deg_sigma_p ^= mask & (deg_sigma_p ^ deg_sigma_copy);

// Compute the next discrepancy
d = syndromes[2 * mu + 2];
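
The substance of this bch.c change is the select idiom: the old code built both halves, (mask12 & a) ^ (~mask12 & b), while the new code uses the one-mask form b ^ (mask & (a ^ b)). Both are constant time; a side-by-side sketch of the equivalence:

#include <stdint.h>

/* Old form: pick a when mask == 0xffff, b when mask == 0. */
static uint16_t select_old(uint16_t a, uint16_t b, uint16_t mask) {
    return (uint16_t) ((mask & a) ^ ((uint16_t) ~mask & b));
}

/* New form: identical result, one fewer masked operand. */
static uint16_t select_new(uint16_t a, uint16_t b, uint16_t mask) {
    return (uint16_t) (b ^ (mask & (a ^ b)));
}

With mask all-ones, b ^ (a ^ b) collapses to a; with mask zero, the XOR term vanishes and b passes through, exactly matching the old two-sided form.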


+ 17
- 58
crypto_kem/hqc-256/clean/vector.c Show file

@@ -31,39 +31,33 @@
void PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint8_t exist = 0;
size_t j = 0;
uint8_t inc;
size_t i, j;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
v[i] = ((uint32_t) rand_bytes[j++]) << 16;
v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
v[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (v[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
v[i] = v[i] % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (v[k] == random_data) {
exist = 1;
inc = 1;
for (size_t k = 0; k < i; k++) {
if (v[k] == v[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
v[i] = random_data;
}
i += inc;
}
}

@@ -86,46 +80,11 @@ void PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_st
* @param[in] ctx Pointer to the context of the seed expander
*/
void PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {

size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
}
PQCLEAN_HQC256_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight);

for (uint16_t i = 0; i < weight; ++i) {
for (size_t i = 0; i < weight; ++i) {
int32_t index = tmp[i] / 64;
int32_t pos = tmp[i] % 64;
v[index] |= ((uint64_t) 1) << pos;
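
vect_set_random_fixed_weight in the clean variant now just delegates the sampling to the coordinate version and scatters the result into the packed representation, as in this sketch (coords_to_bits is an illustrative name):

#include <stdint.h>
#include <stddef.h>

/* Scatter distinct bit positions into a zero-initialized packed array. */
static void coords_to_bits(uint64_t *v, const uint32_t *coords, size_t weight) {
    for (size_t i = 0; i < weight; i++) {
        v[coords[i] / 64] |= (uint64_t) 1 << (coords[i] % 64);
    }
}

The store address here depends on the secret coordinate, which can leak through cache timing; the AVX2 variant earlier in this commit avoids that by sweeping every 256-bit word and masking the bit in.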


+ 52
- 50
crypto_kem/hqc-rmrs-128/avx2/gf2x.c Show file

@@ -188,22 +188,23 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];
int32_t i, is, is2, is3;

karat_mult_4( D0, A, B);
karat_mult_4(D2, A + 4, B + 4);

for (int32_t i = 0; i < 4; i++) {
int is = i + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_4(D1, SAA, SBB);

for (int32_t i = 0; i < 4; i++) {
int32_t is = i + 4;
int32_t is2 = is + 4;
int32_t is3 = is2 + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
is2 = is + 4;
is3 = is2 + 4;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -227,22 +228,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
int32_t i, is, is2, is3;

karat_mult_8( D0, A, B);
karat_mult_8(D2, A + 8, B + 8);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_8( D1, SAA, SBB);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
int32_t is2 = is + 8;
int32_t is3 = is2 + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
is2 = is + 8;
is3 = is2 + 8;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -266,22 +268,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
int32_t i, is, is2, is3;

karat_mult_16( D0, A, B);
karat_mult_16(D2, A + 16, B + 16);

for (int32_t i = 0; i < 16; i++) {
int is = i + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_16( D1, SAA, SBB);

for (int32_t i = 0; i < 16; i++) {
int32_t is = i + 16;
int32_t is2 = is + 16;
int32_t is3 = is2 + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
is2 = is + 16;
is3 = is2 + 16;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -329,11 +332,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = _mm256_setzero_si256();
int64_t *U1_64;
int64_t *U2_64;
int64_t *V1_64;
int64_t *V2_64;
int32_t T2 = T_TM3_3W_64 << 1;
int32_t i, i4, i41, i42;

for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) {
int32_t i4 = i << 2;
int32_t i42 = i4 - 2;
for (i = 0; i < T_TM3_3W_256 - 1; i++) {
i4 = i << 2;
i42 = i4 - 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64]));
@@ -342,9 +350,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4]));
}

for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
int32_t i4 = i << 2;
int32_t i41 = i4 + 1;
for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
i41 = i4 + 1;
U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
@@ -357,7 +365,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty)
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}
@@ -366,23 +374,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
karat_mult_32( W1, W2, W3);

//W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
int64_t *U1_64 = ((int64_t *) U1);
int64_t *U2_64 = ((int64_t *) U2);

int64_t *V1_64 = ((int64_t *) V1);
int64_t *V2_64 = ((int64_t *) V2);

W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

U1_64 = ((int64_t *) U1);
U2_64 = ((int64_t *) U2);

V1_64 = ((int64_t *) V1);
V2_64 = ((int64_t *) V2);

for (int32_t i = 1; i < T_TM3_3W_256; i++) {
int i4 = i << 2;
W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

for (i = 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1]));
W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2]));

@@ -391,13 +393,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
}

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}
@@ -405,7 +407,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W3 = W3 * W2 ; W2 = W0 * W4
karat_mult_32(tmp, W3, W2);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] = tmp[i];
}

@@ -417,20 +419,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Interpolation phase
// 9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x -> x = X^64
U1_64 = ((int64_t *) W2);
U2_64 = ((int64_t *) W0);
for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) {
int32_t i4 = i << 2;
for (i = 0; i < (T_TM3_3W_256 << 1); i++) {
i4 = i << 2;
W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1]));
W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1]));
}
@@ -440,7 +442,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);

for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
}

@@ -454,7 +456,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = (int64_t *) W1;
__m256i *U2_256 = (__m256i *) (U2_64 + 1);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]);
}

@@ -462,19 +464,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W3[2 * (T_TM3_3W_256) - 1] = zero;

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W2[i] ^ W4[i];
}

//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256)
for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
ro256[i] = W0[i];
ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i];
ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i];
@@ -490,12 +492,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]);
U2_256 = (__m256i *) (U2_64 - 2);

for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) {
for (i = 0; i < T_TM3_3W_256 << 1; i++) {
_mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i]));
_mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
}

for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}
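
Every karat_mult_N level in these gf2x.c files computes three half-size products, D0 = A_lo*B_lo, D2 = A_hi*B_hi and D1 = (A_lo ^ A_hi)*(B_lo ^ B_hi), then folds the middle term D0 ^ D1 ^ D2 into the overlap between D0 and D2. The smallest scalar instance of the same scheme, with a slow reference clmul64 standing in for the PCLMUL/AVX2 kernels:

#include <stdint.h>

/* Reference carry-less 64x64 -> 128 multiply over GF(2)[X].
   Bit-serial and not constant time; for illustration only. */
static void clmul64(uint64_t a, uint64_t b, uint64_t r[2]) {
    r[0] = 0;
    r[1] = 0;
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            r[0] ^= a << i;
            r[1] ^= i ? a >> (64 - i) : 0;
        }
    }
}

/* Karatsuba for 2-limb operands: C (4 limbs) = A * B. */
static void karat_mult_2(uint64_t C[4], const uint64_t A[2], const uint64_t B[2]) {
    uint64_t D0[2], D1[2], D2[2];
    clmul64(A[0], B[0], D0);               /* low product    */
    clmul64(A[1], B[1], D2);               /* high product   */
    clmul64(A[0] ^ A[1], B[0] ^ B[1], D1); /* middle product */
    C[0] = D0[0];
    C[1] = D0[1] ^ D0[0] ^ D1[0] ^ D2[0];  /* fold in middle term */
    C[2] = D2[0] ^ D0[1] ^ D1[1] ^ D2[1];
    C[3] = D2[1];
}

Three multiplications instead of four is the whole payoff; the extra XORs are cheap at every recursion level shown above.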


+ 46
- 42
crypto_kem/hqc-rmrs-128/avx2/reed_muller.c Show file

@@ -231,7 +231,19 @@ inline void hadamard(__m256i *src, __m256i *dst) {
inline uint32_t find_peaks(__m256i *transform) {
// a whole lot of vector variables
__m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows;
__m256i peak_mask;
__m256i tmp = _mm256_setzero_si256();
__m256i vect_mask;
__m256i res;
int32_t lower;
int32_t width;
uint32_t message;
uint32_t mask;
int8_t index;
int8_t abs_value;
int8_t mask1;
int8_t mask2;
uint16_t result;

// compute absolute value of transform
for (size_t i = 0; i < 8; i++) {
abs_rows[i] = _mm256_abs_epi16(transform[i]);
@@ -245,9 +257,9 @@ inline uint32_t find_peaks(__m256i *transform) {

// do binary search for the highest value that is lower than the maximum
// loop invariant: lower gives bit map = 0, lower + width gives bit map > 0
int32_t lower = 1;
lower = 1;
// this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6
int32_t width = 1 << (5 + MULTIPLICITY / 2);
width = 1 << (5 + MULTIPLICITY / 2);
// if you don't unroll this loop, it fits in the loop cache
// uncomment the line below to speed up the program by a few percent
// #pragma GCC unroll 0
@@ -259,8 +271,9 @@ inline uint32_t find_peaks(__m256i *transform) {
bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound);
// step up if there are any matches
// rely on compiler to use conditional move here
int32_t step_mask = _mm256_testz_si256(bitmap, bitmap) - 1;
lower += step_mask & width;
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
lower += mask & width;
}
// lower+width contains the maximum value of the vector
// or less, if the maximum is very high (which is OK)
@@ -272,30 +285,26 @@ inline uint32_t find_peaks(__m256i *transform) {

// find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message
// find lowest value by searching backwards; skip first check to save time
size_t message = 0x70;
for (int32_t i = 7; i >= 0; i--) {
bitmap = _mm256_cmpgt_epi16(abs_rows[i], bound);
int message_mask = (-(int16_t)(_mm256_testz_si256(bitmap, bitmap) == 0)) >> 15;
message ^= message_mask & (message ^ (unsigned)i << 4);
message = 0x70;
for (size_t i = 0; i < 8; i++) {
bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound);
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
message ^= mask & (message ^ ((7 - i) << 4));
}
// we decided which row of the matrix contains the lowest match
// select proper row
int8_t index = message >> 4;
__m256i res;
__m256i tmp = (__m256i) {
0ULL, 0ULL, 0ULL, 0ULL
};
index = message >> 4;

for (int8_t i = 0; i < 8; i++) {
int8_t abs_value = (int8_t)(index - i);
int8_t mask1 = abs_value >> 7;
tmp = _mm256_setzero_si256();
for (size_t i = 0; i < 8; i++) {
abs_value = (int8_t)(index - i);
mask1 = abs_value >> 7;
abs_value ^= mask1;
abs_value -= mask1;
int8_t mask2 = ((uint8_t) - abs_value >> 7);
int64_t mask3 = (-1ULL) + mask2;
__m256i vect_mask = (__m256i) {
mask3, mask3, mask3, mask3
};
mask2 = ((uint8_t) - abs_value >> 7);
mask = (-1ULL) + mask2;
vect_mask = _mm256_set1_epi32(mask);
res = _mm256_and_si256(abs_rows[i], vect_mask);
tmp = _mm256_or_si256(tmp, res);
}
@@ -305,34 +314,29 @@ inline uint32_t find_peaks(__m256i *transform) {
// get the column number of the vector element
// by setting the bits corresponding to the columns
// and then adding elements within two groups of 8
peak_mask = _mm256_cmpgt_epi16(active_row, bound);
peak_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
for (int32_t i = 0; i < 3; i++) {
peak_mask = _mm256_hadd_epi16(peak_mask, peak_mask);
vect_mask = _mm256_cmpgt_epi16(active_row, bound);
vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
for (size_t i = 0; i < 3; i++) {
vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask);
}
// add low 4 bits of message
message |= __tzcnt_u16(_mm256_extract_epi16(peak_mask, 0) + _mm256_extract_epi16(peak_mask, 8));
message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8));

// set bit 7 if sign of biggest value is positive
// make sure a jump isn't generated by the compiler
tmp = (__m256i) {
0ULL, 0ULL, 0ULL, 0ULL
};
for (uint32_t i = 0; i < 8; i++) {
int64_t message_mask = (-(int64_t)(i == message / 16)) >> 63;
__m256i vect_mask = (__m256i) {
message_mask, message_mask, message_mask, message_mask
};
tmp = _mm256_setzero_si256();
for (size_t i = 0; i < 8; i++) {
mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63);
__m256i vect_mask = _mm256_set1_epi32(mask);
tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i]));
}
uint16_t result = 0;
for (uint32_t i = 0; i < 16; i++) {
uint16_t *ptr = (uint16_t *) &tmp;
int32_t message_mask = (-(int32_t)(i == message % 16)) >> (sizeof(int32_t) * 8 - 1);
result |= message_mask & ptr[i];
result = 0;
for (size_t i = 0; i < 16; i++) {
mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63);
result |= mask & ((uint16_t *)&tmp)[i];
}
message |= (0x8000 & ~result) >> 8;
return (uint32_t) message;
return message;
}
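
find_peaks now derives every selection from one mask shape, ~(uint32_t)((-(int64_t) x) >> 63), which is all-ones exactly when x == 0; feeding it i ^ target turns it into a branch-free equality test, so rows and lanes are picked by masked OR rather than by secret-dependent indexing. A scalar sketch of the idiom (names illustrative):

#include <stdint.h>
#include <stddef.h>

/* All-ones iff i == target: i ^ target is zero only then, and the
   negate-and-arithmetic-shift smears any nonzero case into -1. */
static uint32_t ct_eq_mask(uint32_t i, uint32_t target) {
    return ~(uint32_t) ((-(int64_t) (i ^ target)) >> 63);
}

/* Constant-time table read: touch every slot, keep only the match. */
static uint16_t ct_lookup(const uint16_t *tbl, size_t len, uint32_t target) {
    uint16_t r = 0;
    for (size_t i = 0; i < len; i++) {
        r |= (uint16_t) (ct_eq_mask((uint32_t) i, target) & tbl[i]);
    }
    return r;
}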




+ 26
- 18
crypto_kem/hqc-rmrs-128/avx2/reed_solomon.c Show file

@@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
uint16_t beta_j[PARAM_DELTA] = {0};
uint16_t e_j[PARAM_DELTA] = {0};

uint16_t delta_counter = 0;
uint16_t delta_counter;
uint16_t delta_real_value;
uint16_t found;
uint16_t mask1;
uint16_t mask2;
uint16_t tmp1;
uint16_t tmp2;
uint16_t inverse;
uint16_t inverse_power_j;

// Compute the beta_{j_i} page 31 of the documentation
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; i++) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (uint16_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += indexmask & valuemask & gf_exp[i];
found += indexmask & valuemask & 1;
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += mask1 & mask2 & gf_exp[i];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
@@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons

// Compute the e_{j_i} page 31 of the documentation
for (size_t i = 0; i < PARAM_DELTA; ++i) {
uint16_t tmp1 = 1;
uint16_t tmp2 = 1;
uint16_t inverse = PQCLEAN_HQCRMRS128_AVX2_gf_inverse(beta_j[i]);
uint16_t inverse_power_j = 1;
tmp1 = 1;
tmp2 = 1;
inverse = PQCLEAN_HQCRMRS128_AVX2_gf_inverse(beta_j[i]);
inverse_power_j = 1;

for (size_t j = 1; j <= PARAM_DELTA; ++j) {
inverse_power_j = PQCLEAN_HQCRMRS128_AVX2_gf_mul(inverse_power_j, inverse);
@@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
for (size_t k = 1; k < PARAM_DELTA; ++k) {
tmp2 = PQCLEAN_HQCRMRS128_AVX2_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS128_AVX2_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA])));
}
uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask & PQCLEAN_HQCRMRS128_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS128_AVX2_gf_inverse(tmp2));
mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask1 & PQCLEAN_HQCRMRS128_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS128_AVX2_gf_inverse(tmp2));
}

// Place the delta e_{j_i} values at the right coordinates of the output vector
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; ++i) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += indexmask & valuemask & e_j[j];
found += indexmask & valuemask & 1;
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += mask1 & mask2 & e_j[j];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
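
compute_error_values visits every (position, error-index) pair and lets two masks decide which contributions survive: one is all-ones when error[i] != 0, the other when j equals the running counter, so the scan's memory trace is independent of where the errors sit. A reduced sketch of that masked scatter (names illustrative, with an 8-bit flag array standing in for error):

#include <stdint.h>
#include <stddef.h>

/* All-ones iff v != 0: negating a positive value sets the sign bit. */
static uint16_t ct_nonzero16(uint8_t v) {
    return (uint16_t) (-((int32_t) v) >> 31);
}

/* Add values[j] into out[i] where flag[i] != 0, pairing the j-th value
   with the j-th flagged position; every (i, j) pair is touched. */
static void ct_scatter(uint16_t *out, const uint8_t *flag, size_t n,
                       const uint16_t *values, size_t delta) {
    uint16_t counter = 0;
    for (size_t i = 0; i < n; i++) {
        uint16_t found = 0;
        uint16_t m1 = ct_nonzero16(flag[i]);
        for (size_t j = 0; j < delta; j++) {
            uint16_t x = (uint16_t) (j ^ counter);
            uint16_t m2 = (uint16_t) ~((uint16_t) ((-(int32_t) x) >> 31)); /* j == counter */
            out[i] += (uint16_t) (m1 & m2 & values[j]);
            found += (uint16_t) (m1 & m2 & 1);
        }
        counter += found;
    }
}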


+ 25
- 35
crypto_kem/hqc-rmrs-128/avx2/vector.c Show file

@@ -32,72 +32,63 @@
void PQCLEAN_HQCRMRS128_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0};
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;
__m256i bit256[PARAM_OMEGA_R];
__m256i bloc256[PARAM_OMEGA_R];
static __m256i posCmp256 = (__m256i) {
0UL, 1UL, 2UL, 3UL
};
#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256)

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
__m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0);
uint64_t bloc, pos, bit64;
uint8_t inc;
size_t i, j;

i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
tmp[i] = ((uint32_t) rand_bytes[j++]) << 16;
tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8;
tmp[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (tmp[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
tmp[i] = tmp[i] % PARAM_N;

inc = 1;
for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
if (tmp[k] == tmp[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
i += inc;
}

for (uint32_t i = 0; i < weight; i++) {
for (i = 0; i < weight; i++) {
// we store the bloc number and bit position of each vb[i]
uint64_t bloc = tmp[i] >> 6;
bloc = tmp[i] >> 6;
bloc256[i] = _mm256_set1_epi64x(bloc >> 2);
uint64_t pos = (bloc & 0x3UL);
pos = (bloc & 0x3UL);
__m256i pos256 = _mm256_set1_epi64x(pos);
__m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256);
uint64_t bit64 = 1ULL << (tmp[i] & 0x3f);
bit64 = 1ULL << (tmp[i] & 0x3f);
__m256i bloc256 = _mm256_set1_epi64x(bit64);
bit256[i] = bloc256 & mask256;
}

for (uint32_t i = 0; i < LOOP_SIZE; i++) {
for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) {
__m256i aux = _mm256_loadu_si256(((__m256i *)v) + i);
__m256i i256 = _mm256_set1_epi64x(i);

for (uint32_t j = 0; j < weight; j++) {
for (j = 0; j < weight; j++) {
__m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256);
aux ^= bit256[j] & mask256;
}
_mm256_storeu_si256(((__m256i *)v) + i, aux);
}

#undef LOOP_SIZE
}


@@ -167,10 +158,9 @@ uint8_t PQCLEAN_HQCRMRS128_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v
* @param[in] size_v Integer that is the size of the input vector in bits
*/
void PQCLEAN_HQCRMRS128_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;
if (size_o < size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;

if (size_o % 64) {
val = 64 - (size_o % 64);
}
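
The AVX2 sampler never stores through a secret-dependent address: per coordinate it precomputes the 256-bit block number (bloc256) and an in-block bit mask (bit256), then sweeps the whole vector and lets _mm256_cmpeq_epi64 decide where the bit actually lands. A scalar model of that sweep (ct_set_bit is an illustrative name):

#include <stdint.h>
#include <stddef.h>

/* Set bit `pos` by visiting every word; only the matching word takes it. */
static void ct_set_bit(uint64_t *v, size_t words, uint32_t pos) {
    uint64_t target = pos >> 6;          /* which word holds the bit */
    uint64_t bit = 1ULL << (pos & 0x3f); /* the bit within that word */
    for (size_t w = 0; w < words; w++) {
        /* all-ones iff w == target; the vector code uses cmpeq for this */
        uint64_t eq = 0 - (uint64_t) (w == target);
        v[w] ^= bit & eq;
    }
}

Linear in the vector length instead of O(1), which is the price of keeping the access pattern independent of the sampled support.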


+ 26
- 18
crypto_kem/hqc-rmrs-128/clean/reed_solomon.c Show file

@@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
uint16_t beta_j[PARAM_DELTA] = {0};
uint16_t e_j[PARAM_DELTA] = {0};

uint16_t delta_counter = 0;
uint16_t delta_counter;
uint16_t delta_real_value;
uint16_t found;
uint16_t mask1;
uint16_t mask2;
uint16_t tmp1;
uint16_t tmp2;
uint16_t inverse;
uint16_t inverse_power_j;

// Compute the beta_{j_i} page 31 of the documentation
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; i++) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (uint16_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += indexmask & valuemask & gf_exp[i];
found += indexmask & valuemask & 1;
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += mask1 & mask2 & gf_exp[i];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
@@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons

// Compute the e_{j_i} page 31 of the documentation
for (size_t i = 0; i < PARAM_DELTA; ++i) {
uint16_t tmp1 = 1;
uint16_t tmp2 = 1;
uint16_t inverse = PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(beta_j[i]);
uint16_t inverse_power_j = 1;
tmp1 = 1;
tmp2 = 1;
inverse = PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(beta_j[i]);
inverse_power_j = 1;

for (size_t j = 1; j <= PARAM_DELTA; ++j) {
inverse_power_j = PQCLEAN_HQCRMRS128_CLEAN_gf_mul(inverse_power_j, inverse);
@@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
for (size_t k = 1; k < PARAM_DELTA; ++k) {
tmp2 = PQCLEAN_HQCRMRS128_CLEAN_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS128_CLEAN_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA])));
}
uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask & PQCLEAN_HQCRMRS128_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(tmp2));
mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask1 & PQCLEAN_HQCRMRS128_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS128_CLEAN_gf_inverse(tmp2));
}

// Place the delta e_{j_i} values at the right coordinates of the output vector
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; ++i) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += indexmask & valuemask & e_j[j];
found += indexmask & valuemask & 1;
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += mask1 & mask2 & e_j[j];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}


+ 17
- 58
crypto_kem/hqc-rmrs-128/clean/vector.c Show file

@@ -31,39 +31,33 @@
void PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint8_t exist = 0;
size_t j = 0;
uint8_t inc;
size_t i, j;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
v[i] = ((uint32_t) rand_bytes[j++]) << 16;
v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
v[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (v[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
v[i] = v[i] % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (v[k] == random_data) {
exist = 1;
inc = 1;
for (size_t k = 0; k < i; k++) {
if (v[k] == v[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
v[i] = random_data;
}
i += inc;
}
}

@@ -86,46 +80,11 @@ void PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XO
* @param[in] ctx Pointer to the context of the seed expander
*/
void PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {

size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
}
PQCLEAN_HQCRMRS128_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight);

for (uint16_t i = 0; i < weight; ++i) {
for (size_t i = 0; i < weight; ++i) {
int32_t index = tmp[i] / 64;
int32_t pos = tmp[i] % 64;
v[index] |= ((uint64_t) 1) << pos;


+ 59
- 57
crypto_kem/hqc-rmrs-192/avx2/gf2x.c Show file

@@ -188,23 +188,24 @@ static inline void karat_mult_4(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] B Pointer to the polynomial B(x)
*/
static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
int32_t i, is, is2, is3;
__m256i D0[8], D1[8], D2[8], SAA[4], SBB[4];

karat_mult_4( D0, A, B);
karat_mult_4(D2, A + 4, B + 4);

for (int32_t i = 0; i < 4; i++) {
int is = i + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_4(D1, SAA, SBB);

for (int32_t i = 0; i < 4; i++) {
int32_t is = i + 4;
int32_t is2 = is + 4;
int32_t is3 = is2 + 4;
for (i = 0; i < 4; i++) {
is = i + 4;
is2 = is + 4;
is3 = is2 + 4;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -228,22 +229,23 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];
int32_t i, is, is2, is3;

karat_mult_8( D0, A, B);
karat_mult_8(D2, A + 8, B + 8);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_8( D1, SAA, SBB);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
int32_t is2 = is + 8;
int32_t is3 = is2 + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
is2 = is + 8;
is3 = is2 + 8;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -267,22 +269,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
int32_t i, is, is2, is3;

karat_mult_16( D0, A, B);
karat_mult_16(D2, A + 16, B + 16);

for (int32_t i = 0; i < 16; i++) {
int is = i + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_16( D1, SAA, SBB);

for (int32_t i = 0; i < 16; i++) {
int32_t is = i + 16;
int32_t is2 = is + 16;
int32_t is3 = is2 + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
is2 = is + 16;
is3 = is2 + 16;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -306,21 +309,22 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_64(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[64], D1[64], D2[64], SAA[32], SBB[32];
int32_t i, is, is2, is3;

karat_mult_32( D0, A, B);
karat_mult_32(D2, A + 32, B + 32);
for (int32_t i = 0; i < 32; i++) {
int32_t is = i + 32;
for (i = 0; i < 32; i++) {
is = i + 32;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_32( D1, SAA, SBB);

for (int32_t i = 0; i < 32; i++) {
int32_t is = i + 32;
int32_t is2 = is + 32;
int32_t is3 = is2 + 32;
for (i = 0; i < 32; i++) {
is = i + 32;
is2 = is + 32;
is3 = is2 + 32;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -369,11 +373,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = _mm256_setzero_si256();
int64_t *U1_64;
int64_t *U2_64;
int64_t *V1_64;
int64_t *V2_64;
int32_t T2 = T_TM3_3W_64 << 1;
int32_t i, i4, i41, i42;

for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) {
int32_t i4 = i << 2;
int32_t i42 = i4 - 2;
for (i = 0; i < T_TM3_3W_256 - 1; i++) {
i4 = i << 2;
i42 = i4 - 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64]));
@@ -382,9 +391,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4]));
}

for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
int32_t i4 = i << 2;
int32_t i41 = i4 + 1;
for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
i41 = i4 + 1;
U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
@@ -397,7 +406,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty)
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}
@@ -406,23 +415,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
karat_mult_64( W1, W2, W3);

//W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
int64_t *U1_64 = ((int64_t *) U1);
int64_t *U2_64 = ((int64_t *) U2);

int64_t *V1_64 = ((int64_t *) V1);
int64_t *V2_64 = ((int64_t *) V2);

W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

U1_64 = ((int64_t *) U1);
U2_64 = ((int64_t *) U2);

V1_64 = ((int64_t *) V1);
V2_64 = ((int64_t *) V2);

for (int32_t i = 1; i < T_TM3_3W_256; i++) {
int i4 = i << 2;
W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

for (i = 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1]));
W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2]));

@@ -431,21 +434,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
}

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}


karat_mult_64(tmp, W3, W2);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] = tmp[i];
}

@@ -457,20 +459,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Interpolation phase
// 9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x -> x = X^64
U1_64 = ((int64_t *) W2);
U2_64 = ((int64_t *) W0);
for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) {
int32_t i4 = i << 2;
for (i = 0; i < (T_TM3_3W_256 << 1); i++) {
i4 = i << 2;
W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1]));
W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1]));
}
@@ -480,7 +482,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);

for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
}

@@ -494,7 +496,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = (int64_t *) W1;
__m256i *U2_256 = (__m256i *) (U2_64 + 1);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]);
}

@@ -502,19 +504,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W3[2 * (T_TM3_3W_256) - 1] = zero;

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W2[i] ^ W4[i];
}

//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256)
for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
ro256[i] = W0[i];
ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i];
ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i];
@@ -530,12 +532,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]);
U2_256 = (__m256i *) (U2_64 - 2);

for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) {
for (i = 0; i < T_TM3_3W_256 << 1; i++) {
_mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i]));
_mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
}

for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}


+ 46
- 42
crypto_kem/hqc-rmrs-192/avx2/reed_muller.c Show file

@@ -231,7 +231,19 @@ inline void hadamard(__m256i *src, __m256i *dst) {
inline uint32_t find_peaks(__m256i *transform) {
// a whole lot of vector variables
__m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows;
__m256i peak_mask;
__m256i tmp = _mm256_setzero_si256();
__m256i vect_mask;
__m256i res;
int32_t lower;
int32_t width;
uint32_t message;
uint32_t mask;
int8_t index;
int8_t abs_value;
int8_t mask1;
int8_t mask2;
uint16_t result;

// compute absolute value of transform
for (size_t i = 0; i < 8; i++) {
abs_rows[i] = _mm256_abs_epi16(transform[i]);
@@ -245,9 +257,9 @@ inline uint32_t find_peaks(__m256i *transform) {

// do binary search for the highest value that is lower than the maximum
// loop invariant: lower gives bit map = 0, lower + width gives bit map > 0
int32_t lower = 1;
lower = 1;
// this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6
int32_t width = 1 << (5 + MULTIPLICITY / 2);
width = 1 << (5 + MULTIPLICITY / 2);
// if you don't unroll this loop, it fits in the loop cache
// uncomment the line below to speeding up the program by a few percent
// #pragma GCC unroll 0
@@ -259,8 +271,9 @@ inline uint32_t find_peaks(__m256i *transform) {
bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound);
// step up if there are any matches
// rely on compiler to use conditional move here
int32_t step_mask = _mm256_testz_si256(bitmap, bitmap) - 1;
lower += step_mask & width;
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
lower += mask & width;
}
// lower+width contains the maximum value of the vector
// or less, if the maximum is very high (which is OK)
@@ -272,30 +285,26 @@ inline uint32_t find_peaks(__m256i *transform) {

// find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message
// find lowest value by searching backwards; skip first check to save time
size_t message = 0x70;
for (int32_t i = 7; i >= 0; i--) {
bitmap = _mm256_cmpgt_epi16(abs_rows[i], bound);
int message_mask = (-(int16_t)(_mm256_testz_si256(bitmap, bitmap) == 0)) >> 15;
message ^= message_mask & (message ^ (unsigned)i << 4);
message = 0x70;
for (size_t i = 0; i < 8; i++) {
bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound);
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
message ^= mask & (message ^ ((7 - i) << 4));
}
// we decided which row of the matrix contains the lowest match
// select proper row
int8_t index = message >> 4;
__m256i res;
__m256i tmp = (__m256i) {
0ULL, 0ULL, 0ULL, 0ULL
};
index = message >> 4;

for (int8_t i = 0; i < 8; i++) {
int8_t abs_value = (int8_t)(index - i);
int8_t mask1 = abs_value >> 7;
tmp = _mm256_setzero_si256();
for (size_t i = 0; i < 8; i++) {
abs_value = (int8_t)(index - i);
mask1 = abs_value >> 7;
abs_value ^= mask1;
abs_value -= mask1;
int8_t mask2 = ((uint8_t) - abs_value >> 7);
int64_t mask3 = (-1ULL) + mask2;
__m256i vect_mask = (__m256i) {
mask3, mask3, mask3, mask3
};
mask2 = ((uint8_t) - abs_value >> 7);
mask = (-1ULL) + mask2;
vect_mask = _mm256_set1_epi32(mask);
res = _mm256_and_si256(abs_rows[i], vect_mask);
tmp = _mm256_or_si256(tmp, res);
}
@@ -305,34 +314,29 @@ inline uint32_t find_peaks(__m256i *transform) {
// get the column number of the vector element
// by setting the bits corresponding to the columns
// and then adding elements within two groups of 8
peak_mask = _mm256_cmpgt_epi16(active_row, bound);
peak_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
for (int32_t i = 0; i < 3; i++) {
peak_mask = _mm256_hadd_epi16(peak_mask, peak_mask);
vect_mask = _mm256_cmpgt_epi16(active_row, bound);
vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
for (size_t i = 0; i < 3; i++) {
vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask);
}
// add low 4 bits of message
message |= __tzcnt_u16(_mm256_extract_epi16(peak_mask, 0) + _mm256_extract_epi16(peak_mask, 8));
message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8));

// set bit 7 if sign of biggest value is positive
// make sure a jump isn't generated by the compiler
tmp = (__m256i) {
0ULL, 0ULL, 0ULL, 0ULL
};
for (uint32_t i = 0; i < 8; i++) {
int64_t message_mask = (-(int64_t)(i == message / 16)) >> 63;
__m256i vect_mask = (__m256i) {
message_mask, message_mask, message_mask, message_mask
};
tmp = _mm256_setzero_si256();
for (size_t i = 0; i < 8; i++) {
mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63);
__m256i vect_mask = _mm256_set1_epi32(mask);
tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i]));
}
uint16_t result = 0;
for (uint32_t i = 0; i < 16; i++) {
uint16_t *ptr = (uint16_t *) &tmp;
int32_t message_mask = (-(int32_t)(i == message % 16)) >> (sizeof(int32_t) * 8 - 1);
result |= message_mask & ptr[i];
result = 0;
for (size_t i = 0; i < 16; i++) {
mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63);
result |= mask & ((uint16_t *)&tmp)[i];
}
message |= (0x8000 & ~result) >> 8;
return (uint32_t) message;
return message;
}
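
The two-step construction above turns the 0/1 result of _mm256_testz_si256 into an all-zero or all-ones word, so the binary search can step by width without a branch; the old step_mask one-liner computed the same mask less explicitly. A minimal standalone sketch of the idiom (the function name and test values are ours, not the library's):

#include <stdint.h>
#include <stdio.h>

/* all-ones when matches exist (testz == 0), zero when the bitmap was empty */
static uint32_t step_mask_from_testz(int testz_result) {
    uint32_t mask = (uint32_t) testz_result;
    return ~(uint32_t) ((-(int64_t) mask) >> 63);
}

int main(void) {
    printf("%08x\n", step_mask_from_testz(1)); /* 00000000: bitmap empty, stay */
    printf("%08x\n", step_mask_from_testz(0)); /* ffffffff: matches, step up   */
    return 0;
}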




+ 26
- 18
crypto_kem/hqc-rmrs-192/avx2/reed_solomon.c

@@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
uint16_t beta_j[PARAM_DELTA] = {0};
uint16_t e_j[PARAM_DELTA] = {0};

uint16_t delta_counter = 0;
uint16_t delta_counter;
uint16_t delta_real_value;
uint16_t found;
uint16_t mask1;
uint16_t mask2;
uint16_t tmp1;
uint16_t tmp2;
uint16_t inverse;
uint16_t inverse_power_j;

// Compute the beta_{j_i} (page 31 of the documentation)
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; i++) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (uint16_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += indexmask & valuemask & gf_exp[i];
found += indexmask & valuemask & 1;
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += mask1 & mask2 & gf_exp[i];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
@@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons

// Compute the e_{j_i} (page 31 of the documentation)
for (size_t i = 0; i < PARAM_DELTA; ++i) {
uint16_t tmp1 = 1;
uint16_t tmp2 = 1;
uint16_t inverse = PQCLEAN_HQCRMRS192_AVX2_gf_inverse(beta_j[i]);
uint16_t inverse_power_j = 1;
tmp1 = 1;
tmp2 = 1;
inverse = PQCLEAN_HQCRMRS192_AVX2_gf_inverse(beta_j[i]);
inverse_power_j = 1;

for (size_t j = 1; j <= PARAM_DELTA; ++j) {
inverse_power_j = PQCLEAN_HQCRMRS192_AVX2_gf_mul(inverse_power_j, inverse);
@@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
for (size_t k = 1; k < PARAM_DELTA; ++k) {
tmp2 = PQCLEAN_HQCRMRS192_AVX2_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS192_AVX2_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA])));
}
uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask & PQCLEAN_HQCRMRS192_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS192_AVX2_gf_inverse(tmp2));
mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask1 & PQCLEAN_HQCRMRS192_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS192_AVX2_gf_inverse(tmp2));
}

// Place the delta e_{j_i} values at the right coordinates of the output vector
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; ++i) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += indexmask & valuemask & e_j[j];
found += indexmask & valuemask & 1;
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += mask1 & mask2 & e_j[j];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
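
The masked accumulation above scatters gf_exp[i] into beta_j at a secret position (delta_counter) without any secret-dependent branch: mask1 selects nonzero error positions, mask2 selects the current slot. A scalar sketch of the same compaction pattern, storing the index instead of gf_exp[i] (DELTA and the inputs are ours, not the library's):

#include <stdint.h>
#include <stdio.h>

#define DELTA 4

/* copy the indices of nonzero entries of in[] into out[0..DELTA-1],
 * branch-free: out[j] is only touched when j == counter and in[i] != 0 */
static void collect_nonzero(uint16_t *out, const uint16_t *in, size_t n) {
    uint16_t counter = 0;
    for (size_t i = 0; i < n; i++) {
        uint16_t found = 0;
        uint16_t mask1 = (uint16_t) (-((int32_t) in[i]) >> 31);  /* all-ones iff in[i] != 0 */
        for (size_t j = 0; j < DELTA; j++) {
            uint16_t mask2 = ~((uint16_t) (-((int32_t) (j ^ counter)) >> 31)); /* all-ones iff j == counter */
            out[j] += mask1 & mask2 & (uint16_t) i;
            found += mask1 & mask2 & 1;
        }
        counter += found;
    }
}

int main(void) {
    uint16_t in[8] = {0, 5, 0, 7, 0, 0, 9, 0};
    uint16_t out[DELTA] = {0};
    collect_nonzero(out, in, 8);
    for (size_t j = 0; j < DELTA; j++) {
        printf("%u ", out[j]); /* prints: 1 3 6 0 */
    }
    printf("\n");
    return 0;
}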


+ 25
- 35
crypto_kem/hqc-rmrs-192/avx2/vector.c

@@ -32,72 +32,63 @@
void PQCLEAN_HQCRMRS192_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0};
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;
__m256i bit256[PARAM_OMEGA_R];
__m256i bloc256[PARAM_OMEGA_R];
static __m256i posCmp256 = (__m256i) {
0UL, 1UL, 2UL, 3UL
};
#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256)

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
__m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0);
uint64_t bloc, pos, bit64;
uint8_t inc;
size_t i, j;

i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
tmp[i] = ((uint32_t) rand_bytes[j++]) << 16;
tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8;
tmp[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (tmp[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
tmp[i] = tmp[i] % PARAM_N;

inc = 1;
for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
if (tmp[k] == tmp[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
i += inc;
}

for (uint32_t i = 0; i < weight; i++) {
for (i = 0; i < weight; i++) {
// we store the bloc number and bit position of each tmp[i]
uint64_t bloc = tmp[i] >> 6;
bloc = tmp[i] >> 6;
bloc256[i] = _mm256_set1_epi64x(bloc >> 2);
uint64_t pos = (bloc & 0x3UL);
pos = (bloc & 0x3UL);
__m256i pos256 = _mm256_set1_epi64x(pos);
__m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256);
uint64_t bit64 = 1ULL << (tmp[i] & 0x3f);
bit64 = 1ULL << (tmp[i] & 0x3f);
__m256i bloc256 = _mm256_set1_epi64x(bit64);
bit256[i] = bloc256 & mask256;
}

for (uint32_t i = 0; i < LOOP_SIZE; i++) {
for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) {
__m256i aux = _mm256_loadu_si256(((__m256i *)v) + i);
__m256i i256 = _mm256_set1_epi64x(i);

for (uint32_t j = 0; j < weight; j++) {
for (j = 0; j < weight; j++) {
__m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256);
aux ^= bit256[j] & mask256;
}
_mm256_storeu_si256(((__m256i *)v) + i, aux);
}

#undef LOOP_SIZE
}
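
The rewritten sampling loop advances i by an inc flag instead of decrementing it when a duplicate is drawn, and refills the random buffer lazily inside the loop. A self-contained toy version of that control flow; demo_expand, N_DEMO and THRESHOLD_DEMO are stand-ins for seedexpander, PARAM_N and UTILS_REJECTION_THRESHOLD, not the real values:

#include <stdint.h>
#include <stdio.h>

#define WEIGHT 3
#define N_DEMO 20               /* toy modulus */
#define THRESHOLD_DEMO 16777200 /* toy rejection bound, just below 2^24 */

/* toy stand-in for seedexpander(): any deterministic byte stream will do */
static void demo_expand(uint8_t *buf, size_t len) {
    static uint32_t s = 0x12345678;
    for (size_t k = 0; k < len; k++) {
        s = s * 1664525u + 1013904223u;
        buf[k] = (uint8_t) (s >> 24);
    }
}

int main(void) {
    uint8_t rand_bytes[3 * WEIGHT];
    uint32_t v[WEIGHT];
    size_t i = 0, j = sizeof rand_bytes; /* j at the end forces an initial refill */

    while (i < WEIGHT) {
        do {
            if (j == sizeof rand_bytes) {
                demo_expand(rand_bytes, sizeof rand_bytes);
                j = 0;
            }
            v[i]  = ((uint32_t) rand_bytes[j++]) << 16;
            v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
            v[i] |= rand_bytes[j++];
        } while (v[i] >= THRESHOLD_DEMO);
        v[i] %= N_DEMO;

        uint8_t inc = 1; /* keep this draw unless it duplicates an earlier one */
        for (size_t k = 0; k < i; k++) {
            if (v[k] == v[i]) {
                inc = 0;
            }
        }
        i += inc; /* no i-- backtracking */
    }
    for (i = 0; i < WEIGHT; i++) {
        printf("%u ", v[i]);
    }
    printf("\n");
    return 0;
}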


@@ -167,10 +158,9 @@ uint8_t PQCLEAN_HQCRMRS192_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v
* @param[in] size_v Integer that is the size of the input vector in bits
*/
void PQCLEAN_HQCRMRS192_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;
if (size_o < size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;

if (size_o % 64) {
val = 64 - (size_o % 64);
}


+ 26
- 18
crypto_kem/hqc-rmrs-192/clean/reed_solomon.c

@@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
uint16_t beta_j[PARAM_DELTA] = {0};
uint16_t e_j[PARAM_DELTA] = {0};

uint16_t delta_counter = 0;
uint16_t delta_counter;
uint16_t delta_real_value;
uint16_t found;
uint16_t mask1;
uint16_t mask2;
uint16_t tmp1;
uint16_t tmp2;
uint16_t inverse;
uint16_t inverse_power_j;

// Compute the beta_{j_i} (page 31 of the documentation)
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; i++) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (uint16_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += indexmask & valuemask & gf_exp[i];
found += indexmask & valuemask & 1;
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += mask1 & mask2 & gf_exp[i];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
@@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons

// Compute the e_{j_i} (page 31 of the documentation)
for (size_t i = 0; i < PARAM_DELTA; ++i) {
uint16_t tmp1 = 1;
uint16_t tmp2 = 1;
uint16_t inverse = PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(beta_j[i]);
uint16_t inverse_power_j = 1;
tmp1 = 1;
tmp2 = 1;
inverse = PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(beta_j[i]);
inverse_power_j = 1;

for (size_t j = 1; j <= PARAM_DELTA; ++j) {
inverse_power_j = PQCLEAN_HQCRMRS192_CLEAN_gf_mul(inverse_power_j, inverse);
@@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
for (size_t k = 1; k < PARAM_DELTA; ++k) {
tmp2 = PQCLEAN_HQCRMRS192_CLEAN_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS192_CLEAN_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA])));
}
uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask & PQCLEAN_HQCRMRS192_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(tmp2));
mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask1 & PQCLEAN_HQCRMRS192_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS192_CLEAN_gf_inverse(tmp2));
}

// Place the delta e_{j_i} values at the right coordinates of the output vector
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; ++i) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += indexmask & valuemask & e_j[j];
found += indexmask & valuemask & 1;
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += mask1 & mask2 & e_j[j];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}


+ 17
- 58
crypto_kem/hqc-rmrs-192/clean/vector.c

@@ -31,39 +31,33 @@
void PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint8_t exist = 0;
size_t j = 0;
uint8_t inc;
size_t i, j;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
v[i] = ((uint32_t) rand_bytes[j++]) << 16;
v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
v[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (v[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
v[i] = v[i] % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (v[k] == random_data) {
exist = 1;
inc = 1;
for (size_t k = 0; k < i; k++) {
if (v[k] == v[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
v[i] = random_data;
}
i += inc;
}
}

@@ -86,46 +80,11 @@ void PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XO
* @param[in] ctx Pointer to the context of the seed expander
*/
void PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {

size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
}
PQCLEAN_HQCRMRS192_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight);

for (uint16_t i = 0; i < weight; ++i) {
for (size_t i = 0; i < weight; ++i) {
int32_t index = tmp[i] / 64;
int32_t pos = tmp[i] % 64;
v[index] |= ((uint64_t) 1) << pos;
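
The final loop expands the sampled coordinate list into the packed bit-vector: word index = position / 64, bit offset = position % 64. A tiny standalone sketch of that split (values are ours):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t coords[3] = {0, 65, 127};
    uint64_t v[2] = {0, 0};
    for (size_t i = 0; i < 3; i++) {
        v[coords[i] / 64] |= (uint64_t) 1 << (coords[i] % 64);
    }
    printf("%016llx %016llx\n", (unsigned long long) v[1],
           (unsigned long long) v[0]); /* 8000000000000002 0000000000000001 */
    return 0;
}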


+ 67
- 66
crypto_kem/hqc-rmrs-256/avx2/gf2x.c

@@ -232,23 +232,24 @@ static inline void karat_mult_8(__m256i *C, __m256i *A, __m256i *B) {
* @param[in] B Pointer to the polynomial B(x)
*/
static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
int32_t i, is, is2, is3;
__m256i D0[16], D1[16], D2[16], SAA[8], SBB[8];

karat_mult_8( D0, A, B);
karat_mult_8(D2, A + 8, B + 8);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_8( D1, SAA, SBB);

for (int32_t i = 0; i < 8; i++) {
int32_t is = i + 8;
int32_t is2 = is + 8;
int32_t is3 = is2 + 8;
for (i = 0; i < 8; i++) {
is = i + 8;
is2 = is + 8;
is3 = is2 + 8;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -272,22 +273,23 @@ static inline void karat_mult_16(__m256i *C, __m256i *A, __m256i *B) {
*/
static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
__m256i D0[32], D1[32], D2[32], SAA[16], SBB[16];
int32_t i, is, is2, is3;

karat_mult_16( D0, A, B);
karat_mult_16(D2, A + 16, B + 16);

for (int32_t i = 0; i < 16; i++) {
int is = i + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
SAA[i] = A[i] ^ A[is];
SBB[i] = B[i] ^ B[is];
}

karat_mult_16( D1, SAA, SBB);

for (int32_t i = 0; i < 16; i++) {
int32_t is = i + 16;
int32_t is2 = is + 16;
int32_t is3 = is2 + 16;
for (i = 0; i < 16; i++) {
is = i + 16;
is2 = is + 16;
is3 = is2 + 16;

__m256i middle = _mm256_xor_si256(D0[is], D2[i]);

@@ -299,7 +301,6 @@ static inline void karat_mult_32(__m256i *C, __m256i *A, __m256i *B) {
}
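
karat_mult_16 and karat_mult_32 share one shape: split each operand in half, take three recursive products D0, D1, D2, and stitch the quarters of the result together through the shared middle term D0 ^ D1 ^ D2. A scalar one-level sketch over 64-bit words, with a software carry-less multiply standing in for the vectorized base case (all names here are ours, not the library's):

#include <stdint.h>
#include <stdio.h>

/* software carry-less multiply: (hi:lo) = a * b over GF(2)[x] */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
    uint64_t h = 0, l = 0;
    for (int k = 0; k < 64; k++) {
        if ((b >> k) & 1) {
            l ^= a << k;
            if (k) {
                h ^= a >> (64 - k);
            }
        }
    }
    *hi = h;
    *lo = l;
}

/* one Karatsuba level: C (4 words) = A * B, with A and B two words each */
static void karat_mult_2(uint64_t C[4], const uint64_t A[2], const uint64_t B[2]) {
    uint64_t d0[2], d1[2], d2[2];
    clmul64(A[0], B[0], &d0[1], &d0[0]);               /* low halves      */
    clmul64(A[1], B[1], &d2[1], &d2[0]);               /* high halves     */
    clmul64(A[0] ^ A[1], B[0] ^ B[1], &d1[1], &d1[0]); /* sum product     */
    uint64_t m0 = d1[0] ^ d0[0] ^ d2[0];               /* middle term     */
    uint64_t m1 = d1[1] ^ d0[1] ^ d2[1];
    C[0] = d0[0];
    C[1] = d0[1] ^ m0;
    C[2] = d2[0] ^ m1;
    C[3] = d2[1];
}

int main(void) {
    uint64_t A[2] = {1, 1}; /* A(x) = 1 + x^64 */
    uint64_t B[2] = {1, 1};
    uint64_t C[4];
    karat_mult_2(C, A, B);
    printf("%llu %llu %llu %llu\n", (unsigned long long) C[0],
           (unsigned long long) C[1], (unsigned long long) C[2],
           (unsigned long long) C[3]); /* 1 0 1 0: (1 + x^64)^2 = 1 + x^128 */
    return 0;
}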



/**
* @brief Compute B(x) = A(x)/(x+1)
*
@@ -336,11 +337,16 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
static __m256i tmp[2 * (T_TM3_3W_256)];
static __m256i ro256[6 * (T_TM3_3W_256)];
const __m256i zero = _mm256_setzero_si256();
int64_t *U1_64;
int64_t *U2_64;
int64_t *V1_64;
int64_t *V2_64;
int32_t T2 = T_TM3_3W_64 << 1;
int32_t i, i4, i41, i42;

for (int32_t i = 0; i < T_TM3_3W_256 - 1; i++) {
int32_t i4 = i << 2;
int32_t i42 = i4 - 2;
for (i = 0; i < T_TM3_3W_256 - 1; i++) {
i4 = i << 2;
i42 = i4 - 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i42 + T_TM3_3W_64]));
@@ -349,9 +355,9 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2 - 4]));
}

for (int32_t i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
int32_t i4 = i << 2;
int32_t i41 = i4 + 1;
for (i = T_TM3_3W_256 - 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
i41 = i4 + 1;
U0[i] = _mm256_set_epi64x(0, 0, A[i41], A[i4]);
V0[i] = _mm256_set_epi64x(0, 0, B[i41], B[i4]);
U1[i] = _mm256_set_epi64x(0, 0, A[i41 + T_TM3_3W_64 - 2], A[i4 + T_TM3_3W_64 - 2]);
@@ -364,7 +370,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// P(X): P0=(0); P1=(1); P2=(x); P3=(1+x); P4=(\infty)
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}
@@ -373,23 +379,17 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
karat_mult_32( W1, W2, W3);

//W0 =(U1 + U2*x)*x; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
int64_t *U1_64 = ((int64_t *) U1);
int64_t *U2_64 = ((int64_t *) U2);

int64_t *V1_64 = ((int64_t *) V1);
int64_t *V2_64 = ((int64_t *) V2);

W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

U1_64 = ((int64_t *) U1);
U2_64 = ((int64_t *) U2);

V1_64 = ((int64_t *) V1);
V2_64 = ((int64_t *) V2);

for (int32_t i = 1; i < T_TM3_3W_256; i++) {
int i4 = i << 2;
W0[0] = _mm256_set_epi64x(U1_64[2] ^ U2_64[1], U1_64[1] ^ U2_64[0], U1_64[0], 0);
W4[0] = _mm256_set_epi64x(V1_64[2] ^ V2_64[1], V1_64[1] ^ V2_64[0], V1_64[0], 0);

for (i = 1; i < T_TM3_3W_256; i++) {
i4 = i << 2;
W0[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 - 1]));
W0[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 - 2]));

@@ -398,13 +398,13 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
}

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3_3W_256; i++) {
for (i = 0; i < T_TM3_3W_256; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}
@@ -412,7 +412,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W3 = W3 * W2 ; W2 = W0 * W4
karat_mult_32(tmp, W3, W2);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] = tmp[i];
}

@@ -424,20 +424,20 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Interpolation phase
// 9 add, 1 shift, 1 Smul, 2 Sdiv (2n)
//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x -> x = X^64
U1_64 = ((int64_t *) W2);
U2_64 = ((int64_t *) W0);
for (int32_t i = 0; i < (T_TM3_3W_256 << 1); i++) {
int32_t i4 = i << 2;
for (i = 0; i < (T_TM3_3W_256 << 1); i++) {
i4 = i << 2;
W2[i] = _mm256_lddqu_si256((__m256i const *)(& U1_64[i4 + 1]));
W2[i] ^= _mm256_lddqu_si256((__m256i const *)(& U2_64[i4 + 1]));
}
@@ -447,7 +447,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i *U1_256 = (__m256i *) (U1_64 + 1);
tmp[0] = W2[0] ^ W3[0] ^ W4[0] ^ _mm256_set_epi64x(U1_64[0], 0, 0, 0);

for (int32_t i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 1; i < (T_TM3_3W_256 << 1) - 1; i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i] ^ _mm256_lddqu_si256(&U1_256[i - 1]);
}

@@ -461,7 +461,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = (int64_t *) W1;
__m256i *U2_256 = (__m256i *) (U2_64 + 1);

for (int32_t i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
for (i = 0; i < 2 * (T_TM3_3W_256) - 1; i++) {
tmp[i] = _mm256_lddqu_si256(&U1_256[i]) ^ _mm256_lddqu_si256(&U2_256[i]);
}

@@ -469,19 +469,19 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W3[2 * (T_TM3_3W_256) - 1] = zero;

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W1[i] ^= W2[i] ^ W4[i];
}

//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3_3W_256); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256)
for (int32_t i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
for (i = 0; i < (T_TM3_3W_256 << 1) - 1; i++) {
ro256[i] = W0[i];
ro256[i + 2 * T_TM3_3W_256 - 1] = W2[i];
ro256[i + 4 * T_TM3_3W_256 - 2] = W4[i];
@@ -497,12 +497,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
U2_64 = ((int64_t *) &ro256[3 * T_TM3_3W_256 - 1]);
U2_256 = (__m256i *) (U2_64 - 2);

for (int32_t i = 0; i < T_TM3_3W_256 << 1; i++) {
for (i = 0; i < T_TM3_3W_256 << 1; i++) {
_mm256_storeu_si256(&U1_256[i], W1[i] ^ _mm256_lddqu_si256(&U1_256[i]));
_mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
}

for (int32_t i = 0; i < 6 * T_TM3_3W_256 - 2; i++) {
for (i = 0; i < 6 * T_TM3_3W_256 - 2; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}
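
divByXplus1_256, used during the interpolation steps above, relies on the fact that over GF(2) dividing an exact multiple of (y + 1) is a running XOR of coefficients: if A(y) = (y + 1) Q(y) then q[i] = a[i] ^ q[i-1]. A word-level sketch of the identity, where each uint64_t word plays the role of one coefficient (the real routine works on 256-bit lanes; names and values are ours):

#include <stdint.h>
#include <stdio.h>

static void div_by_y_plus_1(uint64_t *q, const uint64_t *a, size_t n) {
    uint64_t acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc ^= a[i]; /* acc = a[0] ^ ... ^ a[i] = q[i] */
        q[i] = acc;
    }
}

int main(void) {
    /* Q(y) = 3 + 5y + 9y^2, so A(y) = (y+1)Q(y) has coefficients
     * a0 = 3, a1 = 3^5 = 6, a2 = 5^9 = 12, a3 = 9 (XOR arithmetic) */
    uint64_t a[4] = {3, 6, 12, 9};
    uint64_t q[4];
    div_by_y_plus_1(q, a, 4);
    printf("%llu %llu %llu %llu\n", (unsigned long long) q[0],
           (unsigned long long) q[1], (unsigned long long) q[2],
           (unsigned long long) q[3]); /* 3 5 9 0 */
    return 0;
}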
@@ -541,9 +541,10 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
__m256i ro256[tTM3R / 2];
const __m256i zero = _mm256_setzero_si256();
int32_t T2 = T_TM3R_3W_64 << 1;
int32_t i, i1, i4;

for (int32_t i = 0; i < T_TM3R_3W_256; i++) {
int32_t i4 = i << 2;
for (i = 0; i < T_TM3R_3W_256; i++) {
i4 = i << 2;
U0[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4]));
V0[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4]));
U1[i] = _mm256_lddqu_si256((__m256i const *)(& A[i4 + T_TM3R_3W_64]));
@@ -552,7 +553,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
V2[i] = _mm256_lddqu_si256((__m256i const *)(& B[i4 + T2]));
}

for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
U0[i] = zero;
V0[i] = zero;
U1[i] = zero;
@@ -566,12 +567,12 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
// Evaluation: 5*2 add, 2*2 shift; 5 mul (n)
//W3 = U2 + U1 + U0; W2 = V2 + V1 + V0

for (int32_t i = 0; i < T_TM3R_3W_256; i++) {
for (i = 0; i < T_TM3R_3W_256; i++) {
W3[i] = U0[i] ^ U1[i] ^ U2[i];
W2[i] = V0[i] ^ V1[i] ^ V2[i];
}

for (int32_t i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
for (i = T_TM3R_3W_256; i < T_TM3R_3W_256 + 2; i++) {
W2[i] = zero;
W3[i] = zero;
}
@@ -584,7 +585,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W0[1] = U1[0];
W4[1] = V1[0];

for (int32_t i = 1; i < T_TM3R_3W_256 + 1; i++) {
for (i = 1; i < T_TM3R_3W_256 + 1; i++) {
W0[i + 1] = U1[i] ^ U2[i - 1];
W4[i + 1] = V1[i] ^ V2[i - 1];
}
@@ -593,13 +594,13 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
W4[T_TM3R_3W_256 + 1] = V2[T_TM3R_3W_256 - 1];

//W3 = W3 + W0 ; W2 = W2 + W4
for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) {
for (i = 0; i < T_TM3R_3W_256 + 2; i++) {
W3[i] ^= W0[i];
W2[i] ^= W4[i];
}

//W0 = W0 + U0 ; W4 = W4 + V0
for (int32_t i = 0; i < T_TM3R_3W_256 + 2; i++) {
for (i = 0; i < T_TM3R_3W_256 + 2; i++) {
W0[i] ^= U0[i];
W4[i] ^= V0[i];
}
@@ -607,7 +608,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//W3 = W3 * W2 ; W2 = W0 * W4
TOOM3Mult(tmp, (uint64_t *) W3, (uint64_t *) W2);

for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W3[i] = tmp[i];
}

@@ -621,25 +622,25 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
//9 add, 1 shift, 1 Smul, 2 Sdiv (2n)

//W3 = W3 + W2
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W3[i] ^= W2[i];
}

//W1 = W1 + W0
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256); i++) {
W1[i] ^= W0[i];
}

//W2 =(W2 + W0)/x
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
int32_t i1 = i + 1;
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
i1 = i + 1;
W2[i] = W2[i1] ^ W0[i1];
}

W2[2 * (T_TM3R_3W_256 + 2) - 1] = zero;

//W2 =(W2 + W3 + W4*(x^3+1))/(x+1)
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
tmp[i] = W2[i] ^ W3[i] ^ W4[i];
}

@@ -647,15 +648,15 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
tmp[2 * (T_TM3R_3W_256 + 2) + 1] = zero;
tmp[2 * (T_TM3R_3W_256 + 2) + 2] = zero;

for (int32_t i = 0; i < 2 * (T_TM3R_3W_256); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256); i++) {
tmp[i + 3] ^= W4[i];
}

divByXplus1_256(W2, tmp, T_TM3R_3W_256);

//W3 =(W3 + W1)/(x*(x+1))
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
int32_t i1 = i + 1;
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2) - 1; i++) {
i1 = i + 1;
tmp[i] = W3[i1] ^ W1[i1];
}

@@ -663,18 +664,18 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
divByXplus1_256(W3, tmp, T_TM3R_3W_256);

//W1 = W1 + W4 + W2
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W1[i] ^= W2[i] ^ W4[i];
}
//W2 = W2 + W3
for (int32_t i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
for (i = 0; i < 2 * (T_TM3R_3W_256 + 2); i++) {
W2[i] ^= W3[i];
}

// Recomposition
//W = W0+ W1*x+ W2*x^2+ W3*x^3 + W4*x^4
//W0, W1, W4 of size 2*T_TM3_3W_256, W2 and W3 of size 2*(T_TM3_3W_256+2)
for (int32_t i = 0; i < T_TM3R_3W_256; i++) {
for (i = 0; i < T_TM3R_3W_256; i++) {
ro256[i] = W0[i];
ro256[i + T_TM3R_3W_256] = W0[i + T_TM3R_3W_256] ^ W1[i];
ro256[i + 2 * T_TM3R_3W_256] = W1[i + T_TM3R_3W_256] ^ W2[i];
@@ -696,7 +697,7 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
ro256[3 + 5 * T_TM3R_3W_256] ^= W3[3 + 2 * T_TM3R_3W_256];


for (int32_t i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
for (i = 0; i < 2 * VEC_N_SIZE_256 + 1; i++) {
_mm256_storeu_si256(&Out[i], ro256[i]);
}
}


+ 46
- 42
crypto_kem/hqc-rmrs-256/avx2/reed_muller.c

@@ -231,7 +231,19 @@ inline void hadamard(__m256i *src, __m256i *dst) {
inline uint32_t find_peaks(__m256i *transform) {
// a whole lot of vector variables
__m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows;
__m256i peak_mask;
__m256i tmp = _mm256_setzero_si256();
__m256i vect_mask;
__m256i res;
int32_t lower;
int32_t width;
uint32_t message;
uint32_t mask;
int8_t index;
int8_t abs_value;
int8_t mask1;
int8_t mask2;
uint16_t result;

// compute absolute value of transform
for (size_t i = 0; i < 8; i++) {
abs_rows[i] = _mm256_abs_epi16(transform[i]);
@@ -245,9 +257,9 @@ inline uint32_t find_peaks(__m256i *transform) {

// do binary search for the highest value that is lower than the maximum
// loop invariant: lower gives bit map = 0, lower + width gives bit map > 0
int32_t lower = 1;
lower = 1;
// this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6
int32_t width = 1 << (5 + MULTIPLICITY / 2);
width = 1 << (5 + MULTIPLICITY / 2);
// if you don't unroll this loop, it fits in the loop cache
// uncomment the line below to speed up the program by a few percent
// #pragma GCC unroll 0
@@ -259,8 +271,9 @@ inline uint32_t find_peaks(__m256i *transform) {
bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound);
// step up if there are any matches
// rely on compiler to use conditional move here
int32_t step_mask = _mm256_testz_si256(bitmap, bitmap) - 1;
lower += step_mask & width;
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
lower += mask & width;
}
// lower+width contains the maximum value of the vector
// or less, if the maximum is very high (which is OK)
@@ -272,30 +285,26 @@ inline uint32_t find_peaks(__m256i *transform) {

// find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message
// find lowest value by searching backwards; skip the first check to save time
size_t message = 0x70;
for (int32_t i = 7; i >= 0; i--) {
bitmap = _mm256_cmpgt_epi16(abs_rows[i], bound);
int message_mask = (-(int16_t)(_mm256_testz_si256(bitmap, bitmap) == 0)) >> 15;
message ^= message_mask & (message ^ (unsigned)i << 4);
message = 0x70;
for (size_t i = 0; i < 8; i++) {
bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound);
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
message ^= mask & (message ^ ((7 - i) << 4));
}
// we decided which row of the matrix contains the lowest match
// select proper row
int8_t index = message >> 4;
__m256i res;
__m256i tmp = (__m256i) {
0ULL, 0ULL, 0ULL, 0ULL
};
index = message >> 4;

for (int8_t i = 0; i < 8; i++) {
int8_t abs_value = (int8_t)(index - i);
int8_t mask1 = abs_value >> 7;
tmp = _mm256_setzero_si256();
for (size_t i = 0; i < 8; i++) {
abs_value = (int8_t)(index - i);
mask1 = abs_value >> 7;
abs_value ^= mask1;
abs_value -= mask1;
int8_t mask2 = ((uint8_t) - abs_value >> 7);
int64_t mask3 = (-1ULL) + mask2;
__m256i vect_mask = (__m256i) {
mask3, mask3, mask3, mask3
};
mask2 = ((uint8_t) - abs_value >> 7);
mask = (-1ULL) + mask2;
vect_mask = _mm256_set1_epi32(mask);
res = _mm256_and_si256(abs_rows[i], vect_mask);
tmp = _mm256_or_si256(tmp, res);
}
@@ -305,34 +314,29 @@ inline uint32_t find_peaks(__m256i *transform) {
// get the column number of the vector element
// by setting the bits corresponding to the columns
// and then adding elements within two groups of 8
peak_mask = _mm256_cmpgt_epi16(active_row, bound);
peak_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
for (int32_t i = 0; i < 3; i++) {
peak_mask = _mm256_hadd_epi16(peak_mask, peak_mask);
vect_mask = _mm256_cmpgt_epi16(active_row, bound);
vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
for (size_t i = 0; i < 3; i++) {
vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask);
}
// add low 4 bits of message
message |= __tzcnt_u16(_mm256_extract_epi16(peak_mask, 0) + _mm256_extract_epi16(peak_mask, 8));
message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8));

// set bit 7 if sign of biggest value is positive
// make sure a jump isn't generated by the compiler
tmp = (__m256i) {
0ULL, 0ULL, 0ULL, 0ULL
};
for (uint32_t i = 0; i < 8; i++) {
int64_t message_mask = (-(int64_t)(i == message / 16)) >> 63;
__m256i vect_mask = (__m256i) {
message_mask, message_mask, message_mask, message_mask
};
tmp = _mm256_setzero_si256();
for (size_t i = 0; i < 8; i++) {
mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63);
__m256i vect_mask = _mm256_set1_epi32(mask);
tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i]));
}
uint16_t result = 0;
for (uint32_t i = 0; i < 16; i++) {
uint16_t *ptr = (uint16_t *) &tmp;
int32_t message_mask = (-(int32_t)(i == message % 16)) >> (sizeof(int32_t) * 8 - 1);
result |= message_mask & ptr[i];
result = 0;
for (size_t i = 0; i < 16; i++) {
mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63);
result |= mask & ((uint16_t *)&tmp)[i];
}
message |= (0x8000 & ~result) >> 8;
return (uint32_t) message;
return message;
}
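
The row-selection loop above builds an all-ones mask exactly when i equals the (secret) row index, from the sign bits of |index - i|, and ORs the masked rows together so that exactly one survives with no secret-dependent branch or lookup. A scalar sketch of the same selection (names and test values are ours):

#include <stdint.h>
#include <stdio.h>

static uint32_t select_row(const uint32_t rows[8], int8_t index) {
    uint32_t out = 0;
    for (size_t i = 0; i < 8; i++) {
        int8_t abs_value = (int8_t) (index - (int8_t) i);
        int8_t sign = abs_value >> 7;           /* all-ones iff negative    */
        abs_value ^= sign;
        abs_value = (int8_t) (abs_value - sign); /* abs_value = |index - i| */
        uint8_t nonzero = (uint8_t) (-abs_value) >> 7;      /* 1 iff i != index        */
        uint32_t mask = (uint32_t) (0xFFFFFFFFu + nonzero); /* all-ones iff i == index */
        out |= rows[i] & mask;
    }
    return out;
}

int main(void) {
    uint32_t rows[8] = {10, 20, 30, 40, 50, 60, 70, 80};
    printf("%u\n", select_row(rows, 5)); /* 60 */
    return 0;
}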




+ 26
- 18
crypto_kem/hqc-rmrs-256/avx2/reed_solomon.c

@@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
uint16_t beta_j[PARAM_DELTA] = {0};
uint16_t e_j[PARAM_DELTA] = {0};

uint16_t delta_counter = 0;
uint16_t delta_counter;
uint16_t delta_real_value;
uint16_t found;
uint16_t mask1;
uint16_t mask2;
uint16_t tmp1;
uint16_t tmp2;
uint16_t inverse;
uint16_t inverse_power_j;

// Compute the beta_{j_i} (page 31 of the documentation)
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; i++) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (uint16_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += indexmask & valuemask & gf_exp[i];
found += indexmask & valuemask & 1;
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += mask1 & mask2 & gf_exp[i];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
@@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons

// Compute the e_{j_i} (page 31 of the documentation)
for (size_t i = 0; i < PARAM_DELTA; ++i) {
uint16_t tmp1 = 1;
uint16_t tmp2 = 1;
uint16_t inverse = PQCLEAN_HQCRMRS256_AVX2_gf_inverse(beta_j[i]);
uint16_t inverse_power_j = 1;
tmp1 = 1;
tmp2 = 1;
inverse = PQCLEAN_HQCRMRS256_AVX2_gf_inverse(beta_j[i]);
inverse_power_j = 1;

for (size_t j = 1; j <= PARAM_DELTA; ++j) {
inverse_power_j = PQCLEAN_HQCRMRS256_AVX2_gf_mul(inverse_power_j, inverse);
@@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
for (size_t k = 1; k < PARAM_DELTA; ++k) {
tmp2 = PQCLEAN_HQCRMRS256_AVX2_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS256_AVX2_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA])));
}
uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask & PQCLEAN_HQCRMRS256_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS256_AVX2_gf_inverse(tmp2));
mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask1 & PQCLEAN_HQCRMRS256_AVX2_gf_mul(tmp1, PQCLEAN_HQCRMRS256_AVX2_gf_inverse(tmp2));
}

// Place the delta e_{j_i} values at the right coordinates of the output vector
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; ++i) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += indexmask & valuemask & e_j[j];
found += indexmask & valuemask & 1;
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += mask1 & mask2 & e_j[j];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
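
The i < delta_real_value mask above is the usual signed-subtraction trick: subtract in a signed type and arithmetic-shift the sign bit across the whole word. A standalone sketch (the function name is ours; the trick is valid while both operands fit in 15 bits, as i and delta_real_value do here):

#include <stdint.h>
#include <stdio.h>

static uint16_t lt_mask(uint16_t i, uint16_t bound) {
    /* all-ones iff i < bound, zero otherwise */
    return (uint16_t) (((int16_t) i - (int16_t) bound) >> 15);
}

int main(void) {
    printf("%04x\n", lt_mask(3, 7)); /* ffff: 3 < 7 */
    printf("%04x\n", lt_mask(7, 7)); /* 0000        */
    printf("%04x\n", lt_mask(9, 7)); /* 0000        */
    return 0;
}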


+ 25
- 35
crypto_kem/hqc-rmrs-256/avx2/vector.c

@@ -32,72 +32,63 @@
void PQCLEAN_HQCRMRS256_AVX2_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0};
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;
__m256i bit256[PARAM_OMEGA_R];
__m256i bloc256[PARAM_OMEGA_R];
static __m256i posCmp256 = (__m256i) {
0UL, 1UL, 2UL, 3UL
};
#define LOOP_SIZE CEIL_DIVIDE(PARAM_N, 256)

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
__m256i posCmp256 = _mm256_set_epi64x(3, 2, 1, 0);
uint64_t bloc, pos, bit64;
uint8_t inc;
size_t i, j;

i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
tmp[i] = ((uint32_t) rand_bytes[j++]) << 16;
tmp[i] |= ((uint32_t) rand_bytes[j++]) << 8;
tmp[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (tmp[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
tmp[i] = tmp[i] % PARAM_N;

inc = 1;
for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
if (tmp[k] == tmp[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
i += inc;
}

for (uint32_t i = 0; i < weight; i++) {
for (i = 0; i < weight; i++) {
// we store the bloc number and bit position of each tmp[i]
uint64_t bloc = tmp[i] >> 6;
bloc = tmp[i] >> 6;
bloc256[i] = _mm256_set1_epi64x(bloc >> 2);
uint64_t pos = (bloc & 0x3UL);
pos = (bloc & 0x3UL);
__m256i pos256 = _mm256_set1_epi64x(pos);
__m256i mask256 = _mm256_cmpeq_epi64(pos256, posCmp256);
uint64_t bit64 = 1ULL << (tmp[i] & 0x3f);
bit64 = 1ULL << (tmp[i] & 0x3f);
__m256i bloc256 = _mm256_set1_epi64x(bit64);
bit256[i] = bloc256 & mask256;
}

for (uint32_t i = 0; i < LOOP_SIZE; i++) {
for (i = 0; i < CEIL_DIVIDE(PARAM_N, 256); i++) {
__m256i aux = _mm256_loadu_si256(((__m256i *)v) + i);
__m256i i256 = _mm256_set1_epi64x(i);

for (uint32_t j = 0; j < weight; j++) {
for (j = 0; j < weight; j++) {
__m256i mask256 = _mm256_cmpeq_epi64(bloc256[j], i256);
aux ^= bit256[j] & mask256;
}
_mm256_storeu_si256(((__m256i *)v) + i, aux);
}

#undef LOOP_SIZE
}
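
The masked-XOR loop above touches every 256-bit block of v and lets vector compares decide which block and which 64-bit lane actually absorb each bit, so the store pattern is independent of the secret positions. A minimal sketch of one such update (compile with -mavx2; values are ours):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t v[4] = {0, 0, 0, 0};
    uint64_t target_lane = 2, bit = 1ULL << 5;

    __m256i posCmp = _mm256_set_epi64x(3, 2, 1, 0);
    __m256i lane   = _mm256_set1_epi64x((long long) target_lane);
    __m256i mask   = _mm256_cmpeq_epi64(lane, posCmp); /* all-ones in lane 2 only */
    __m256i bits   = _mm256_set1_epi64x((long long) bit);

    __m256i aux = _mm256_loadu_si256((__m256i *) v);
    aux = _mm256_xor_si256(aux, _mm256_and_si256(bits, mask));
    _mm256_storeu_si256((__m256i *) v, aux);

    printf("%llx %llx %llx %llx\n", (unsigned long long) v[0],
           (unsigned long long) v[1], (unsigned long long) v[2],
           (unsigned long long) v[3]); /* 0 0 20 0 */
    return 0;
}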


@@ -167,10 +158,9 @@ uint8_t PQCLEAN_HQCRMRS256_AVX2_vect_compare(const uint8_t *v1, const uint8_t *v
* @param[in] size_v Integer that is the size of the input vector in bits
*/
void PQCLEAN_HQCRMRS256_AVX2_vect_resize(uint64_t *o, uint32_t size_o, const uint64_t *v, uint32_t size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;
if (size_o < size_v) {
uint64_t mask = 0x7FFFFFFFFFFFFFFF;
int8_t val = 0;

if (size_o % 64) {
val = 64 - (size_o % 64);
}
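
Moving mask and val inside the if (size_o < size_v) branch keeps them local to the only path that truncates. A simplified sketch of the general trailing-word truncation idiom this supports, keeping only size_bits % 64 low bits of the last word; this uses a full-ones constant and is our reading of the intent, not the library's exact arithmetic (which starts from the 0x7FFFFFFFFFFFFFFF constant shown above):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t size_bits = 70; /* vector of 70 bits -> 2 words, 6 bits in the last */
    uint64_t last = 0xFFFFFFFFFFFFFFFFULL;
    uint8_t val = 0;
    if (size_bits % 64) {
        val = 64 - (size_bits % 64); /* bits to drop from the top: 58 */
    }
    uint64_t mask = 0xFFFFFFFFFFFFFFFFULL >> val;
    printf("%016llx\n", (unsigned long long) (last & mask)); /* 000000000000003f */
    return 0;
}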


+ 26
- 18
crypto_kem/hqc-rmrs-256/clean/reed_solomon.c

@@ -228,17 +228,25 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
uint16_t beta_j[PARAM_DELTA] = {0};
uint16_t e_j[PARAM_DELTA] = {0};

uint16_t delta_counter = 0;
uint16_t delta_counter;
uint16_t delta_real_value;
uint16_t found;
uint16_t mask1;
uint16_t mask2;
uint16_t tmp1;
uint16_t tmp2;
uint16_t inverse;
uint16_t inverse_power_j;

// Compute the beta_{j_i} (page 31 of the documentation)
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; i++) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (uint16_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += indexmask & valuemask & gf_exp[i];
found += indexmask & valuemask & 1;
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
beta_j[j] += mask1 & mask2 & gf_exp[i];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}
@@ -246,10 +254,10 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons

// Compute the e_{j_i} (page 31 of the documentation)
for (size_t i = 0; i < PARAM_DELTA; ++i) {
uint16_t tmp1 = 1;
uint16_t tmp2 = 1;
uint16_t inverse = PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(beta_j[i]);
uint16_t inverse_power_j = 1;
tmp1 = 1;
tmp2 = 1;
inverse = PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(beta_j[i]);
inverse_power_j = 1;

for (size_t j = 1; j <= PARAM_DELTA; ++j) {
inverse_power_j = PQCLEAN_HQCRMRS256_CLEAN_gf_mul(inverse_power_j, inverse);
@@ -258,19 +266,19 @@ static void compute_error_values(uint16_t *error_values, const uint16_t *z, cons
for (size_t k = 1; k < PARAM_DELTA; ++k) {
tmp2 = PQCLEAN_HQCRMRS256_CLEAN_gf_mul(tmp2, (1 ^ PQCLEAN_HQCRMRS256_CLEAN_gf_mul(inverse, beta_j[(i + k) % PARAM_DELTA])));
}
uint16_t mask = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask & PQCLEAN_HQCRMRS256_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(tmp2));
mask1 = (uint16_t) (((int16_t) i - delta_real_value) >> 15); // i < delta_real_value
e_j[i] = mask1 & PQCLEAN_HQCRMRS256_CLEAN_gf_mul(tmp1, PQCLEAN_HQCRMRS256_CLEAN_gf_inverse(tmp2));
}

// Place the delta e_{j_i} values at the right coordinates of the output vector
delta_counter = 0;
for (size_t i = 0; i < PARAM_N1; ++i) {
uint16_t found = 0;
uint16_t valuemask = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
found = 0;
mask1 = (uint16_t) (-((int32_t)error[i]) >> 31); // error[i] != 0
for (size_t j = 0; j < PARAM_DELTA; j++) {
uint16_t indexmask = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += indexmask & valuemask & e_j[j];
found += indexmask & valuemask & 1;
mask2 = ~((uint16_t) (-((int32_t) j ^ delta_counter) >> 31)); // j == delta_counter
error_values[i] += mask1 & mask2 & e_j[j];
found += mask1 & mask2 & 1;
}
delta_counter += found;
}


+ 17
- 58
crypto_kem/hqc-rmrs-256/clean/vector.c

@@ -31,39 +31,33 @@
void PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XOF_struct *ctx, uint32_t *v, uint16_t weight) {
size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint8_t exist = 0;
size_t j = 0;
uint8_t inc;
size_t i, j;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
i = 0;
j = random_bytes_size;
while (i < weight) {
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];
v[i] = ((uint32_t) rand_bytes[j++]) << 16;
v[i] |= ((uint32_t) rand_bytes[j++]) << 8;
v[i] |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);
} while (v[i] >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;
v[i] = v[i] % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (v[k] == random_data) {
exist = 1;
inc = 1;
for (size_t k = 0; k < i; k++) {
if (v[k] == v[i]) {
inc = 0;
}
}

if (exist == 1) {
i--;
} else {
v[i] = random_data;
}
i += inc;
}
}

@@ -86,46 +80,11 @@ void PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight_by_coordinates(AES_XO
* @param[in] ctx Pointer to the context of the seed expander
*/
void PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight(AES_XOF_struct *ctx, uint64_t *v, uint16_t weight) {

size_t random_bytes_size = 3 * weight;
uint8_t rand_bytes[3 * PARAM_OMEGA_R] = {0}; // weight is expected to be <= PARAM_OMEGA_R
uint32_t random_data = 0;
uint32_t tmp[PARAM_OMEGA_R] = {0};
uint8_t exist = 0;
size_t j = 0;

seedexpander(ctx, rand_bytes, random_bytes_size);

for (uint32_t i = 0; i < weight; ++i) {
exist = 0;
do {
if (j == random_bytes_size) {
seedexpander(ctx, rand_bytes, random_bytes_size);
j = 0;
}

random_data = ((uint32_t) rand_bytes[j++]) << 16;
random_data |= ((uint32_t) rand_bytes[j++]) << 8;
random_data |= rand_bytes[j++];

} while (random_data >= UTILS_REJECTION_THRESHOLD);

random_data = random_data % PARAM_N;

for (uint32_t k = 0; k < i; k++) {
if (tmp[k] == random_data) {
exist = 1;
}
}

if (exist == 1) {
i--;
} else {
tmp[i] = random_data;
}
}
PQCLEAN_HQCRMRS256_CLEAN_vect_set_random_fixed_weight_by_coordinates(ctx, tmp, weight);

for (uint16_t i = 0; i < weight; ++i) {
for (size_t i = 0; i < weight; ++i) {
int32_t index = tmp[i] / 64;
int32_t pos = tmp[i] % 64;
v[index] |= ((uint64_t) 1) << pos;


+ 2
- 0
test/duplicate_consistency/hqc-128_clean.yml

@@ -19,6 +19,7 @@ consistency_checks:
- parsing.h
- repetition.h
- vector.h
- bch.c
- code.c
- fft.c
- gf2x.c
@@ -46,6 +47,7 @@ consistency_checks:
- parsing.h
- repetition.h
- vector.h
- bch.c
- code.c
- fft.c
- gf2x.c

