From 4a301f189684f2b85f31b74b3369ea4d1c485ac0 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Wed, 9 Sep 2020 09:44:31 -0400
Subject: [PATCH] Simplify hqc-rmrs*/clean/reed_muller.c and fix potentially
 non-constant time behavior.

---
 crypto_kem/hqc-rmrs-128/clean/reed_muller.c | 120 +++++++++-----------
 crypto_kem/hqc-rmrs-192/clean/reed_muller.c | 120 +++++++++-----------
 crypto_kem/hqc-rmrs-256/clean/reed_muller.c | 120 +++++++++-----------
 3 files changed, 156 insertions(+), 204 deletions(-)

diff --git a/crypto_kem/hqc-rmrs-128/clean/reed_muller.c b/crypto_kem/hqc-rmrs-128/clean/reed_muller.c
index eca7d8f9..36e80ff6 100644
--- a/crypto_kem/hqc-rmrs-128/clean/reed_muller.c
+++ b/crypto_kem/hqc-rmrs-128/clean/reed_muller.c
@@ -7,33 +7,19 @@
  * Constant time implementation of Reed-Muller code RM(1,7)
  */
 
-// setting this will help the compiler with auto vectorization
-#undef ALIGNVECTORS
-
 
 
 // number of repeated code words
 #define MULTIPLICITY                   CEIL_DIVIDE(PARAM_N2, 128)
 
-// codeword is 128 bits, seen multiple ways
-typedef union {
-    uint8_t u8[16];
-    uint32_t u32[4];
-} codeword
-;
-
-// Expanded codeword has a short for every bit, for internal calculations
-typedef int16_t expandedCodeword[128]
-;
-
 // copy bit 0 into all bits of a 32 bit value
-#define BIT0MASK(x) (int32_t)(-((x) & 1))
+#define BIT0MASK(x) (-((x) & 1))
 
 
-static void encode(codeword *word, int32_t message);
-static void hadamard(expandedCodeword *src, expandedCodeword *dst);
-static void expand_and_sum(expandedCodeword *dest, codeword src[]);
-static int32_t find_peaks(expandedCodeword *transform);
+static void encode(uint32_t *word, const uint8_t message);
+static void hadamard(uint16_t src[128], uint16_t dst[128]);
+static void expand_and_sum(uint16_t dest[128], const uint32_t src[4 * MULTIPLICITY]);
+static uint8_t find_peaks(const uint16_t transform[128]);
 
 
 
@@ -54,10 +40,10 @@ static int32_t find_peaks(expandedCodeword *transform);
  * @param[out] word An RM(1,7) codeword
  * @param[in] message A message
  */
-static void encode(codeword *word, int32_t message) {
+static void encode(uint32_t *word, uint8_t message) {
     // the four parts of the word are identical
     // except for encoding bits 5 and 6
-    int32_t first_word;
+    uint32_t first_word;
     // bit 7 flips all the bits, do that first to save work
     first_word = BIT0MASK(message >> 7);
     // bits 0, 1, 2, 3, 4 are the same for all four longs
@@ -68,14 +54,14 @@ static void encode(codeword *word, int32_t message) {
     first_word ^= BIT0MASK(message >> 3) & 0xff00ff00;
     first_word ^= BIT0MASK(message >> 4) & 0xffff0000;
     // we can store this in the first quarter
-    word->u32[0] = first_word;
+    word[0] = first_word;
     // bit 5 flips entries 1 and 3; bit 6 flips 2 and 3
     first_word ^= BIT0MASK(message >> 5);
-    word->u32[1] = first_word;
+    word[1] = first_word;
     first_word ^= BIT0MASK(message >> 6);
-    word->u32[3] = first_word;
+    word[3] = first_word;
     first_word ^= BIT0MASK(message >> 5);
-    word->u32[2] = first_word;
+    word[2] = first_word;
 }
 
 
@@ -111,19 +97,20 @@ static void encode(codeword *word, int32_t message) {
  * @param[out] src Structure that contain the expanded codeword
  * @param[out] dst Structure that contain the expanded codeword
  */
-static void hadamard(expandedCodeword *src, expandedCodeword *dst) {
+static void hadamard(uint16_t src[128], uint16_t dst[128]) {
     // the passes move data:
     // src -> dst -> src -> dst -> src -> dst -> src -> dst
     // using p1 and p2 alternately
-    expandedCodeword *p1 = src;
-    expandedCodeword *p2 = dst;
-    for (int32_t pass = 0 ; pass < 7 ; pass++) {
-        for (int32_t i = 0 ; i < 64 ; i++) {
-            (*p2)[i] = (*p1)[2 * i] + (*p1)[2 * i + 1];
-            (*p2)[i + 64] = (*p1)[2 * i] - (*p1)[2 * i + 1];
+    uint16_t *p1 = src;
+    uint16_t *p2 = dst;
+    uint16_t *p3;
+    for (uint32_t pass = 0 ; pass < 7 ; pass++) {
+        for (uint32_t i = 0 ; i < 64 ; i++) {
+            p2[i] = p1[2 * i] + p1[2 * i + 1];
+            p2[i + 64] = p1[2 * i] - p1[2 * i + 1];
         }
         // swap p1, p2 for next round
-        expandedCodeword *p3 = p1;
+        p3 = p1;
         p1 = p2;
         p2 = p3;
     }
@@ -144,18 +131,18 @@ static void hadamard(expandedCodeword *src, expandedCodeword *dst) {
  * @param[out] dest Structure that contain the expanded codeword
  * @param[in] src Structure that contain the codeword
  */
-static void expand_and_sum(expandedCodeword *dest, codeword src[]) {
+static void expand_and_sum(uint16_t dest[128], const uint32_t src[4 * MULTIPLICITY]) {
     // start with the first copy
-    for (int32_t part = 0 ; part < 4 ; part++) {
-        for (int32_t bit = 0 ; bit < 32 ; bit++) {
-            (*dest)[part * 32 + bit] = src[0].u32[part] >> bit & 1;
+    for (uint32_t part = 0 ; part < 4 ; part++) {
+        for (uint32_t bit = 0 ; bit < 32 ; bit++) {
+            dest[part * 32 + bit] = (uint16_t) ((src[part] >> bit) & 1);
         }
     }
     // sum the rest of the copies
-    for (int32_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
-        for (int32_t part = 0 ; part < 4 ; part++) {
-            for (int32_t bit = 0 ; bit < 32 ; bit++) {
-                (*dest)[part * 32 + bit] += src[copy].u32[part] >> bit & 1;
+    for (uint32_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
+        for (uint32_t part = 0 ; part < 4 ; part++) {
+            for (uint32_t bit = 0 ; bit < 32 ; bit++) {
+                dest[part * 32 + bit] += (uint16_t) ((src[4 * copy + part] >> bit) & 1);
             }
         }
     }
@@ -172,27 +159,26 @@ static void expand_and_sum(expandedCodeword *dest, codeword src[]) {
  * in the lowest 7 bits it taken
  * @param[in] transform Structure that contain the expanded codeword
  */
-static int32_t find_peaks(expandedCodeword *transform) {
-    int32_t peak_abs_value = 0;
-    int32_t peak_value = 0;
-    int32_t peak_pos = 0;
-    for (int32_t i = 0 ; i < 128 ; i++) {
-        // get absolute value
-        int32_t t = (*transform)[i];
-        int32_t pos_mask = -(t > 0);
-        int32_t absolute = (pos_mask & t) | (~pos_mask & -t);
-        // all compilers nowadays compile with a conditional move
-        peak_value = absolute > peak_abs_value ? t : peak_value;
-        peak_pos = absolute > peak_abs_value ? i : peak_pos;
-        peak_abs_value = absolute > peak_abs_value ? absolute : peak_abs_value;
+static uint8_t find_peaks(const uint16_t transform[128]) {
+    uint16_t peak_abs = 0;
+    uint16_t peak = 0;
+    uint16_t pos = 0;
+    uint16_t t, abs, mask;
+    for (uint16_t i = 0 ; i < 128 ; i++) {
+        t = transform[i];
+        abs = t ^ ((-(t >> 15)) & (t ^ -t)); // abs = |t|, reading t as a signed 16-bit value
+        mask = -(((uint16_t)(peak_abs - abs)) >> 15);
+        peak ^= mask & (peak ^ t);
+        pos ^= mask & (pos ^ i);
+        peak_abs ^= mask & (peak_abs ^ abs);
     }
-    // set bit 7
-    peak_pos |= 128 * (peak_value > 0);
-    return peak_pos;
+    pos |= 128 & ((peak >> 15) - 1);
+    return (uint8_t) pos;
 }
 
 
 
+
 /**
  * @brief Encodes the received word
  *
@@ -204,15 +190,13 @@ static int32_t find_peaks(expandedCodeword *transform) {
  */
 void PQCLEAN_HQCRMRS128_CLEAN_reed_muller_encode(uint64_t *cdw, const uint64_t *msg) {
     uint8_t *message_array = (uint8_t *) msg;
-    codeword *codeArray = (codeword *) cdw;
+    uint32_t *codeArray = (uint32_t *) cdw;
     for (size_t i = 0 ; i < VEC_N1_SIZE_BYTES ; i++) {
-        // fill entries i * MULTIPLICITY to (i+1) * MULTIPLICITY
-        int32_t pos = i * MULTIPLICITY;
         // encode first word
-        encode(&codeArray[pos], message_array[i]);
+        encode(&codeArray[4 * i * MULTIPLICITY], message_array[i]);
         // copy to other identical codewords
         for (size_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
-            memcpy(&codeArray[pos + copy], &codeArray[pos], sizeof(codeword));
+            memcpy(&codeArray[4 * i * MULTIPLICITY + 4 * copy], &codeArray[4 * i * MULTIPLICITY], 4 * sizeof(uint32_t));
         }
     }
 }
@@ -230,17 +214,17 @@ void PQCLEAN_HQCRMRS128_CLEAN_reed_muller_encode(uint64_t *cdw, const uint64_t *
  */
 void PQCLEAN_HQCRMRS128_CLEAN_reed_muller_decode(uint64_t *msg, const uint64_t *cdw) {
     uint8_t *message_array = (uint8_t *) msg;
-    codeword *codeArray = (codeword *) cdw;
-    expandedCodeword expanded;
+    uint32_t *codeArray = (uint32_t *) cdw;
+    uint16_t expanded[128];
+    uint16_t transform[128];
     for (size_t i = 0 ; i < VEC_N1_SIZE_BYTES ; i++) {
         // collect the codewords
-        expand_and_sum(&expanded, &codeArray[i * MULTIPLICITY]);
+        expand_and_sum(expanded, &codeArray[4 * i * MULTIPLICITY]);
         // apply hadamard transform
-        expandedCodeword transform;
-        hadamard(&expanded, &transform);
+        hadamard(expanded, transform);
         // fix the first entry to get the half Hadamard transform
         transform[0] -= 64 * MULTIPLICITY;
         // finish the decoding
-        message_array[i] = find_peaks(&transform);
+        message_array[i] = find_peaks(transform);
     }
 }
diff --git a/crypto_kem/hqc-rmrs-192/clean/reed_muller.c b/crypto_kem/hqc-rmrs-192/clean/reed_muller.c
index 014be214..5beb05e3 100644
--- a/crypto_kem/hqc-rmrs-192/clean/reed_muller.c
+++ b/crypto_kem/hqc-rmrs-192/clean/reed_muller.c
@@ -7,33 +7,19 @@
  * Constant time implementation of Reed-Muller code RM(1,7)
  */
 
-// setting this will help the compiler with auto vectorization
-#undef ALIGNVECTORS
-
 
 
 // number of repeated code words
 #define MULTIPLICITY                   CEIL_DIVIDE(PARAM_N2, 128)
 
-// codeword is 128 bits, seen multiple ways
-typedef union {
-    uint8_t u8[16];
-    uint32_t u32[4];
-} codeword
-;
-
-// Expanded codeword has a short for every bit, for internal calculations
-typedef int16_t expandedCodeword[128]
-;
-
 // copy bit 0 into all bits of a 32 bit value
-#define BIT0MASK(x) (int32_t)(-((x) & 1))
+#define BIT0MASK(x) (-((x) & 1))
 
 
-static void encode(codeword *word, int32_t message);
-static void hadamard(expandedCodeword *src, expandedCodeword *dst);
-static void expand_and_sum(expandedCodeword *dest, codeword src[]);
-static int32_t find_peaks(expandedCodeword *transform);
+static void encode(uint32_t *word, const uint8_t message);
+static void hadamard(uint16_t src[128], uint16_t dst[128]);
+static void expand_and_sum(uint16_t dest[128], const uint32_t src[4 * MULTIPLICITY]);
+static uint8_t find_peaks(const uint16_t transform[128]);
 
 
 
@@ -54,10 +40,10 @@ static int32_t find_peaks(expandedCodeword *transform);
  * @param[out] word An RM(1,7) codeword
  * @param[in] message A message
  */
-static void encode(codeword *word, int32_t message) {
+static void encode(uint32_t *word, uint8_t message) {
     // the four parts of the word are identical
     // except for encoding bits 5 and 6
-    int32_t first_word;
+    uint32_t first_word;
     // bit 7 flips all the bits, do that first to save work
     first_word = BIT0MASK(message >> 7);
     // bits 0, 1, 2, 3, 4 are the same for all four longs
@@ -68,14 +54,14 @@ static void encode(codeword *word, int32_t message) {
     first_word ^= BIT0MASK(message >> 3) & 0xff00ff00;
     first_word ^= BIT0MASK(message >> 4) & 0xffff0000;
     // we can store this in the first quarter
-    word->u32[0] = first_word;
+    word[0] = first_word;
     // bit 5 flips entries 1 and 3; bit 6 flips 2 and 3
     first_word ^= BIT0MASK(message >> 5);
-    word->u32[1] = first_word;
+    word[1] = first_word;
     first_word ^= BIT0MASK(message >> 6);
-    word->u32[3] = first_word;
+    word[3] = first_word;
     first_word ^= BIT0MASK(message >> 5);
-    word->u32[2] = first_word;
+    word[2] = first_word;
 }
 
 
@@ -111,19 +97,20 @@ static void encode(codeword *word, int32_t message) {
  * @param[out] src Structure that contain the expanded codeword
  * @param[out] dst Structure that contain the expanded codeword
  */
-static void hadamard(expandedCodeword *src, expandedCodeword *dst) {
+static void hadamard(uint16_t src[128], uint16_t dst[128]) {
     // the passes move data:
     // src -> dst -> src -> dst -> src -> dst -> src -> dst
     // using p1 and p2 alternately
-    expandedCodeword *p1 = src;
-    expandedCodeword *p2 = dst;
-    for (int32_t pass = 0 ; pass < 7 ; pass++) {
-        for (int32_t i = 0 ; i < 64 ; i++) {
-            (*p2)[i] = (*p1)[2 * i] + (*p1)[2 * i + 1];
-            (*p2)[i + 64] = (*p1)[2 * i] - (*p1)[2 * i + 1];
+    uint16_t *p1 = src;
+    uint16_t *p2 = dst;
+    uint16_t *p3;
+    for (uint32_t pass = 0 ; pass < 7 ; pass++) {
+        for (uint32_t i = 0 ; i < 64 ; i++) {
+            p2[i] = p1[2 * i] + p1[2 * i + 1];
+            p2[i + 64] = p1[2 * i] - p1[2 * i + 1];
         }
         // swap p1, p2 for next round
-        expandedCodeword *p3 = p1;
+        p3 = p1;
         p1 = p2;
         p2 = p3;
     }
@@ -144,18 +131,18 @@ static void hadamard(expandedCodeword *src, expandedCodeword *dst) {
  * @param[out] dest Structure that contain the expanded codeword
  * @param[in] src Structure that contain the codeword
  */
-static void expand_and_sum(expandedCodeword *dest, codeword src[]) {
+static void expand_and_sum(uint16_t dest[128], const uint32_t src[4 * MULTIPLICITY]) {
     // start with the first copy
-    for (int32_t part = 0 ; part < 4 ; part++) {
-        for (int32_t bit = 0 ; bit < 32 ; bit++) {
-            (*dest)[part * 32 + bit] = src[0].u32[part] >> bit & 1;
+    for (uint32_t part = 0 ; part < 4 ; part++) {
+        for (uint32_t bit = 0 ; bit < 32 ; bit++) {
+            dest[part * 32 + bit] = (uint16_t) ((src[part] >> bit) & 1);
         }
     }
     // sum the rest of the copies
-    for (int32_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
-        for (int32_t part = 0 ; part < 4 ; part++) {
-            for (int32_t bit = 0 ; bit < 32 ; bit++) {
-                (*dest)[part * 32 + bit] += src[copy].u32[part] >> bit & 1;
+    for (uint32_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
+        for (uint32_t part = 0 ; part < 4 ; part++) {
+            for (uint32_t bit = 0 ; bit < 32 ; bit++) {
+                dest[part * 32 + bit] += (uint16_t) ((src[4 * copy + part] >> bit) & 1);
             }
         }
     }
@@ -172,27 +159,26 @@ static void expand_and_sum(expandedCodeword *dest, codeword src[]) {
  * in the lowest 7 bits it taken
  * @param[in] transform Structure that contain the expanded codeword
  */
-static int32_t find_peaks(expandedCodeword *transform) {
-    int32_t peak_abs_value = 0;
-    int32_t peak_value = 0;
-    int32_t peak_pos = 0;
-    for (int32_t i = 0 ; i < 128 ; i++) {
-        // get absolute value
-        int32_t t = (*transform)[i];
-        int32_t pos_mask = -(t > 0);
-        int32_t absolute = (pos_mask & t) | (~pos_mask & -t);
-        // all compilers nowadays compile with a conditional move
-        peak_value = absolute > peak_abs_value ? t : peak_value;
-        peak_pos = absolute > peak_abs_value ? i : peak_pos;
-        peak_abs_value = absolute > peak_abs_value ? absolute : peak_abs_value;
+static uint8_t find_peaks(const uint16_t transform[128]) {
+    uint16_t peak_abs = 0;
+    uint16_t peak = 0;
+    uint16_t pos = 0;
+    uint16_t t, abs, mask;
+    for (uint16_t i = 0 ; i < 128 ; i++) {
+        t = transform[i];
+        abs = t ^ ((-(t >> 15)) & (t ^ -t)); // abs = |t|, reading t as a signed 16-bit value
+        mask = -(((uint16_t)(peak_abs - abs)) >> 15);
+        peak ^= mask & (peak ^ t);
+        pos ^= mask & (pos ^ i);
+        peak_abs ^= mask & (peak_abs ^ abs);
     }
-    // set bit 7
-    peak_pos |= 128 * (peak_value > 0);
-    return peak_pos;
+    pos |= 128 & ((peak >> 15) - 1);
+    return (uint8_t) pos;
 }
 
 
 
+
 /**
  * @brief Encodes the received word
  *
@@ -204,15 +190,13 @@ static int32_t find_peaks(expandedCodeword *transform) {
  */
 void PQCLEAN_HQCRMRS192_CLEAN_reed_muller_encode(uint64_t *cdw, const uint64_t *msg) {
     uint8_t *message_array = (uint8_t *) msg;
-    codeword *codeArray = (codeword *) cdw;
+    uint32_t *codeArray = (uint32_t *) cdw;
     for (size_t i = 0 ; i < VEC_N1_SIZE_BYTES ; i++) {
-        // fill entries i * MULTIPLICITY to (i+1) * MULTIPLICITY
-        int32_t pos = i * MULTIPLICITY;
         // encode first word
-        encode(&codeArray[pos], message_array[i]);
+        encode(&codeArray[4 * i * MULTIPLICITY], message_array[i]);
         // copy to other identical codewords
         for (size_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
-            memcpy(&codeArray[pos + copy], &codeArray[pos], sizeof(codeword));
+            memcpy(&codeArray[4 * i * MULTIPLICITY + 4 * copy], &codeArray[4 * i * MULTIPLICITY], 4 * sizeof(uint32_t));
         }
     }
 }
@@ -230,17 +214,17 @@ void PQCLEAN_HQCRMRS192_CLEAN_reed_muller_encode(uint64_t *cdw, const uint64_t *
  */
 void PQCLEAN_HQCRMRS192_CLEAN_reed_muller_decode(uint64_t *msg, const uint64_t *cdw) {
     uint8_t *message_array = (uint8_t *) msg;
-    codeword *codeArray = (codeword *) cdw;
-    expandedCodeword expanded;
+    uint32_t *codeArray = (uint32_t *) cdw;
+    uint16_t expanded[128];
+    uint16_t transform[128];
     for (size_t i = 0 ; i < VEC_N1_SIZE_BYTES ; i++) {
         // collect the codewords
-        expand_and_sum(&expanded, &codeArray[i * MULTIPLICITY]);
+        expand_and_sum(expanded, &codeArray[4 * i * MULTIPLICITY]);
         // apply hadamard transform
-        expandedCodeword transform;
-        hadamard(&expanded, &transform);
+        hadamard(expanded, transform);
         // fix the first entry to get the half Hadamard transform
         transform[0] -= 64 * MULTIPLICITY;
         // finish the decoding
-        message_array[i] = find_peaks(&transform);
+        message_array[i] = find_peaks(transform);
     }
 }
diff --git a/crypto_kem/hqc-rmrs-256/clean/reed_muller.c b/crypto_kem/hqc-rmrs-256/clean/reed_muller.c
index d8cfde2e..ab3b66fa 100644
--- a/crypto_kem/hqc-rmrs-256/clean/reed_muller.c
+++ b/crypto_kem/hqc-rmrs-256/clean/reed_muller.c
@@ -7,33 +7,19 @@
  * Constant time implementation of Reed-Muller code RM(1,7)
  */
 
-// setting this will help the compiler with auto vectorization
-#undef ALIGNVECTORS
-
 
 
 // number of repeated code words
 #define MULTIPLICITY                   CEIL_DIVIDE(PARAM_N2, 128)
 
-// codeword is 128 bits, seen multiple ways
-typedef union {
-    uint8_t u8[16];
-    uint32_t u32[4];
-} codeword
-;
-
-// Expanded codeword has a short for every bit, for internal calculations
-typedef int16_t expandedCodeword[128]
-;
-
 // copy bit 0 into all bits of a 32 bit value
-#define BIT0MASK(x) (int32_t)(-((x) & 1))
+#define BIT0MASK(x) (-((x) & 1))
 
 
-static void encode(codeword *word, int32_t message);
-static void hadamard(expandedCodeword *src, expandedCodeword *dst);
-static void expand_and_sum(expandedCodeword *dest, codeword src[]);
-static int32_t find_peaks(expandedCodeword *transform);
+static void encode(uint32_t *word, const uint8_t message);
+static void hadamard(uint16_t src[128], uint16_t dst[128]);
+static void expand_and_sum(uint16_t dest[128], const uint32_t src[4 * MULTIPLICITY]);
+static uint8_t find_peaks(const uint16_t transform[128]);
 
 
 
@@ -54,10 +40,10 @@ static int32_t find_peaks(expandedCodeword *transform);
  * @param[out] word An RM(1,7) codeword
  * @param[in] message A message
  */
-static void encode(codeword *word, int32_t message) {
+static void encode(uint32_t *word, uint8_t message) {
     // the four parts of the word are identical
     // except for encoding bits 5 and 6
-    int32_t first_word;
+    uint32_t first_word;
     // bit 7 flips all the bits, do that first to save work
     first_word = BIT0MASK(message >> 7);
     // bits 0, 1, 2, 3, 4 are the same for all four longs
@@ -68,14 +54,14 @@ static void encode(codeword *word, int32_t message) {
     first_word ^= BIT0MASK(message >> 3) & 0xff00ff00;
     first_word ^= BIT0MASK(message >> 4) & 0xffff0000;
     // we can store this in the first quarter
-    word->u32[0] = first_word;
+    word[0] = first_word;
     // bit 5 flips entries 1 and 3; bit 6 flips 2 and 3
     first_word ^= BIT0MASK(message >> 5);
-    word->u32[1] = first_word;
+    word[1] = first_word;
     first_word ^= BIT0MASK(message >> 6);
-    word->u32[3] = first_word;
+    word[3] = first_word;
     first_word ^= BIT0MASK(message >> 5);
-    word->u32[2] = first_word;
+    word[2] = first_word;
 }
 
 
@@ -111,19 +97,20 @@ static void encode(codeword *word, int32_t message) {
  * @param[out] src Structure that contain the expanded codeword
  * @param[out] dst Structure that contain the expanded codeword
  */
-static void hadamard(expandedCodeword *src, expandedCodeword *dst) {
+static void hadamard(uint16_t src[128], uint16_t dst[128]) {
     // the passes move data:
     // src -> dst -> src -> dst -> src -> dst -> src -> dst
     // using p1 and p2 alternately
-    expandedCodeword *p1 = src;
-    expandedCodeword *p2 = dst;
-    for (int32_t pass = 0 ; pass < 7 ; pass++) {
-        for (int32_t i = 0 ; i < 64 ; i++) {
-            (*p2)[i] = (*p1)[2 * i] + (*p1)[2 * i + 1];
-            (*p2)[i + 64] = (*p1)[2 * i] - (*p1)[2 * i + 1];
+    uint16_t *p1 = src;
+    uint16_t *p2 = dst;
+    uint16_t *p3;
+    for (uint32_t pass = 0 ; pass < 7 ; pass++) {
+        for (uint32_t i = 0 ; i < 64 ; i++) {
+            p2[i] = p1[2 * i] + p1[2 * i + 1];
+            p2[i + 64] = p1[2 * i] - p1[2 * i + 1];
         }
         // swap p1, p2 for next round
-        expandedCodeword *p3 = p1;
+        p3 = p1;
         p1 = p2;
         p2 = p3;
     }
@@ -144,18 +131,18 @@ static void hadamard(expandedCodeword *src, expandedCodeword *dst) {
  * @param[out] dest Structure that contain the expanded codeword
  * @param[in] src Structure that contain the codeword
  */
-static void expand_and_sum(expandedCodeword *dest, codeword src[]) {
+static void expand_and_sum(uint16_t dest[128], const uint32_t src[4 * MULTIPLICITY]) {
     // start with the first copy
-    for (int32_t part = 0 ; part < 4 ; part++) {
-        for (int32_t bit = 0 ; bit < 32 ; bit++) {
-            (*dest)[part * 32 + bit] = src[0].u32[part] >> bit & 1;
+    for (uint32_t part = 0 ; part < 4 ; part++) {
+        for (uint32_t bit = 0 ; bit < 32 ; bit++) {
+            dest[part * 32 + bit] = (uint16_t) ((src[part] >> bit) & 1);
         }
     }
     // sum the rest of the copies
-    for (int32_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
-        for (int32_t part = 0 ; part < 4 ; part++) {
-            for (int32_t bit = 0 ; bit < 32 ; bit++) {
-                (*dest)[part * 32 + bit] += src[copy].u32[part] >> bit & 1;
+    for (uint32_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
+        for (uint32_t part = 0 ; part < 4 ; part++) {
+            for (uint32_t bit = 0 ; bit < 32 ; bit++) {
+                dest[part * 32 + bit] += (uint16_t) ((src[4 * copy + part] >> bit) & 1);
             }
         }
     }
@@ -172,27 +159,26 @@ static void expand_and_sum(expandedCodeword *dest, codeword src[]) {
  * in the lowest 7 bits it taken
  * @param[in] transform Structure that contain the expanded codeword
  */
-static int32_t find_peaks(expandedCodeword *transform) {
-    int32_t peak_abs_value = 0;
-    int32_t peak_value = 0;
-    int32_t peak_pos = 0;
-    for (int32_t i = 0 ; i < 128 ; i++) {
-        // get absolute value
-        int32_t t = (*transform)[i];
-        int32_t pos_mask = -(t > 0);
-        int32_t absolute = (pos_mask & t) | (~pos_mask & -t);
-        // all compilers nowadays compile with a conditional move
-        peak_value = absolute > peak_abs_value ? t : peak_value;
-        peak_pos = absolute > peak_abs_value ? i : peak_pos;
-        peak_abs_value = absolute > peak_abs_value ? absolute : peak_abs_value;
+static uint8_t find_peaks(const uint16_t transform[128]) {
+    uint16_t peak_abs = 0;
+    uint16_t peak = 0;
+    uint16_t pos = 0;
+    uint16_t t, abs, mask;
+    for (uint16_t i = 0 ; i < 128 ; i++) {
+        t = transform[i];
+        abs = t ^ ((-(t >> 15)) & (t ^ -t)); // abs = |t|, reading t as a signed 16-bit value
+        mask = -(((uint16_t)(peak_abs - abs)) >> 15);
+        peak ^= mask & (peak ^ t);
+        pos ^= mask & (pos ^ i);
+        peak_abs ^= mask & (peak_abs ^ abs);
     }
-    // set bit 7
-    peak_pos |= 128 * (peak_value > 0);
-    return peak_pos;
+    pos |= 128 & ((peak >> 15) - 1);
+    return (uint8_t) pos;
 }
 
 
 
+
 /**
  * @brief Encodes the received word
  *
@@ -204,15 +190,13 @@ static int32_t find_peaks(expandedCodeword *transform) {
  */
 void PQCLEAN_HQCRMRS256_CLEAN_reed_muller_encode(uint64_t *cdw, const uint64_t *msg) {
     uint8_t *message_array = (uint8_t *) msg;
-    codeword *codeArray = (codeword *) cdw;
+    uint32_t *codeArray = (uint32_t *) cdw;
     for (size_t i = 0 ; i < VEC_N1_SIZE_BYTES ; i++) {
-        // fill entries i * MULTIPLICITY to (i+1) * MULTIPLICITY
-        int32_t pos = i * MULTIPLICITY;
         // encode first word
-        encode(&codeArray[pos], message_array[i]);
+        encode(&codeArray[4 * i * MULTIPLICITY], message_array[i]);
         // copy to other identical codewords
         for (size_t copy = 1 ; copy < MULTIPLICITY ; copy++) {
-            memcpy(&codeArray[pos + copy], &codeArray[pos], sizeof(codeword));
+            memcpy(&codeArray[4 * i * MULTIPLICITY + 4 * copy], &codeArray[4 * i * MULTIPLICITY], 4 * sizeof(uint32_t));
         }
     }
 }
@@ -230,17 +214,17 @@ void PQCLEAN_HQCRMRS256_CLEAN_reed_muller_encode(uint64_t *cdw, const uint64_t *
  */
 void PQCLEAN_HQCRMRS256_CLEAN_reed_muller_decode(uint64_t *msg, const uint64_t *cdw) {
     uint8_t *message_array = (uint8_t *) msg;
-    codeword *codeArray = (codeword *) cdw;
-    expandedCodeword expanded;
+    uint32_t *codeArray = (uint32_t *) cdw;
+    uint16_t expanded[128];
+    uint16_t transform[128];
     for (size_t i = 0 ; i < VEC_N1_SIZE_BYTES ; i++) {
         // collect the codewords
-        expand_and_sum(&expanded, &codeArray[i * MULTIPLICITY]);
+        expand_and_sum(expanded, &codeArray[4 * i * MULTIPLICITY]);
         // apply hadamard transform
-        expandedCodeword transform;
-        hadamard(&expanded, &transform);
+        hadamard(expanded, transform);
         // fix the first entry to get the half Hadamard transform
         transform[0] -= 64 * MULTIPLICITY;
         // finish the decoding
-        message_array[i] = find_peaks(&transform);
+        message_array[i] = find_peaks(transform);
     }
 }