From d5fd7d6d0cdff7b20b838736094cc80499c26357 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Mon, 7 Sep 2020 18:06:37 -0400
Subject: [PATCH] Fix an avx2/gf2x.c buffer overflow

---
 crypto_kem/hqc-128/avx2/gf2x.c      | 5 ++---
 crypto_kem/hqc-192/avx2/gf2x.c      | 9 ++++-----
 crypto_kem/hqc-256/avx2/gf2x.c      | 8 +++-----
 crypto_kem/hqc-rmrs-128/avx2/gf2x.c | 5 ++---
 crypto_kem/hqc-rmrs-192/avx2/gf2x.c | 9 ++++-----
 crypto_kem/hqc-rmrs-256/avx2/gf2x.c | 8 +++-----
 6 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/crypto_kem/hqc-128/avx2/gf2x.c b/crypto_kem/hqc-128/avx2/gf2x.c
index 3338b362..1eb9ca2d 100644
--- a/crypto_kem/hqc-128/avx2/gf2x.c
+++ b/crypto_kem/hqc-128/avx2/gf2x.c
@@ -511,9 +511,8 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
         _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
     }
 
-    for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+    for (int32_t i = 0 ; i < 2 * VEC_N_SIZE_256 + 1 ; i++) {
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
diff --git a/crypto_kem/hqc-192/avx2/gf2x.c b/crypto_kem/hqc-192/avx2/gf2x.c
index eb45382b..18eaa5b0 100644
--- a/crypto_kem/hqc-192/avx2/gf2x.c
+++ b/crypto_kem/hqc-192/avx2/gf2x.c
@@ -418,6 +418,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
 
     //W1 = W2 * W3
     karat_mult_64( W1, W2, W3);
+
     //W0 =(U1 + U2*x)*x ; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
     int64_t *U1_64 = ((int64_t *) U1);
     int64_t *U2_64 = ((int64_t *) U2);
@@ -455,6 +456,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
         W4[i] ^= V0[i];
     }
 
+
     karat_mult_64(tmp, W3, W2);
 
     for (int32_t i = 0 ; i < 2 * (T_TM3_3W_256) ; i++) {
@@ -462,7 +464,6 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
     }
 
     karat_mult_64( W2, W0, W4);
-
     //W4 = U2 * V2      ; W0 = U0 * V0
     karat_mult_64(W4, U2, V2);
     karat_mult_64(W0, U0, V0);
@@ -550,14 +551,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
         _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
     }
 
-    for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+    for (int32_t i = 0 ; i < 2 * VEC_N_SIZE_256 + 1 ; i++) {
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
 
-
 /**
  * @brief Multiply two polynomials modulo \f$ X^n - 1\f$.
  *
diff --git a/crypto_kem/hqc-256/avx2/gf2x.c b/crypto_kem/hqc-256/avx2/gf2x.c
index a86dd4a8..2332b606 100644
--- a/crypto_kem/hqc-256/avx2/gf2x.c
+++ b/crypto_kem/hqc-256/avx2/gf2x.c
@@ -519,8 +519,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
     }
 
     for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
@@ -715,9 +714,8 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
     ro256[3 + 5 * T_TM3R_3W_256] ^= W3[3 + 2 * T_TM3R_3W_256];
 
 
-    for (int32_t i = 0 ; i < 6 * T_TM3R_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+    for (int32_t i = 0 ; i < 2 * VEC_N_SIZE_256 + 1 ; i++) {
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
diff --git a/crypto_kem/hqc-rmrs-128/avx2/gf2x.c b/crypto_kem/hqc-rmrs-128/avx2/gf2x.c
index 3a91e1ee..6d7e932a 100644
--- a/crypto_kem/hqc-rmrs-128/avx2/gf2x.c
+++ b/crypto_kem/hqc-rmrs-128/avx2/gf2x.c
@@ -511,9 +511,8 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
         _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
     }
 
-    for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+    for (int32_t i = 0 ; i < 2 * VEC_N_SIZE_256 + 1 ; i++) {
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
diff --git a/crypto_kem/hqc-rmrs-192/avx2/gf2x.c b/crypto_kem/hqc-rmrs-192/avx2/gf2x.c
index 2cd182f1..e2c62fa2 100644
--- a/crypto_kem/hqc-rmrs-192/avx2/gf2x.c
+++ b/crypto_kem/hqc-rmrs-192/avx2/gf2x.c
@@ -418,6 +418,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
 
     //W1 = W2 * W3
     karat_mult_64( W1, W2, W3);
+
     //W0 =(U1 + U2*x)*x ; W4 =(V1 + V2*x)*x (SIZE = T_TM3_3W_256 !)
     int64_t *U1_64 = ((int64_t *) U1);
     int64_t *U2_64 = ((int64_t *) U2);
@@ -455,6 +456,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
         W4[i] ^= V0[i];
     }
 
+
     karat_mult_64(tmp, W3, W2);
 
     for (int32_t i = 0 ; i < 2 * (T_TM3_3W_256) ; i++) {
@@ -462,7 +464,6 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
     }
 
     karat_mult_64( W2, W0, W4);
-
     //W4 = U2 * V2      ; W0 = U0 * V0
     karat_mult_64(W4, U2, V2);
     karat_mult_64(W0, U0, V0);
@@ -550,14 +551,12 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
         _mm256_storeu_si256(&U2_256[i], W3[i] ^ _mm256_loadu_si256(&U2_256[i]));
     }
 
-    for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+    for (int32_t i = 0 ; i < 2 * VEC_N_SIZE_256 + 1 ; i++) {
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
 
-
 /**
  * @brief Multiply two polynomials modulo \f$ X^n - 1\f$.
  *
diff --git a/crypto_kem/hqc-rmrs-256/avx2/gf2x.c b/crypto_kem/hqc-rmrs-256/avx2/gf2x.c
index 658cfd50..927c7ff7 100644
--- a/crypto_kem/hqc-rmrs-256/avx2/gf2x.c
+++ b/crypto_kem/hqc-rmrs-256/avx2/gf2x.c
@@ -519,8 +519,7 @@ static void TOOM3Mult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
     }
 
     for (int32_t i = 0 ; i < 6 * T_TM3_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }
 
@@ -715,9 +714,8 @@ static void TOOM3RecMult(__m256i *Out, const uint64_t *A, const uint64_t *B) {
     ro256[3 + 5 * T_TM3R_3W_256] ^= W3[3 + 2 * T_TM3R_3W_256];
 
 
-    for (int32_t i = 0 ; i < 6 * T_TM3R_3W_256 - 2 ; i++) {
-        uint64_t *out64 = ((uint64_t *)Out) + (i << 2);
-        _mm256_storeu_si256((__m256i *)out64, ro256[i]);
+    for (int32_t i = 0 ; i < 2 * VEC_N_SIZE_256 + 1 ; i++) {
+        _mm256_storeu_si256(&Out[i], ro256[i]);
     }
 }