pqc/crypto_kem/hqc-rmrs-256/avx2/reed_muller.c

#include "parameters.h"
#include "reed_muller.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
/**
 * @file reed_muller.c
 * Constant time implementation of Reed-Muller code RM(1,7)
 */


// number of repeated code words
// (outer parentheses keep the expansion atomic inside larger expressions)
#define MULTIPLICITY                   (CEIL_DIVIDE(PARAM_N2, 128))

// copy bit 0 of x into all bits of a 64 bit value:
// yields 0 when bit 0 is clear and -1 (all ones) when it is set
// (fully parenthesized so the cast cannot bind to neighbouring operators)
#define BIT0MASK(x) ((int64_t)(-((x) & 1)))

// Forward declarations of the file-local helpers.
// Declaring them static here gives them internal linkage; the definitions
// further down that are spelled with a plain `inline` (no storage class)
// inherit that linkage from these declarations.
static void encode(uint8_t *word, uint8_t message);
static void expand_and_sum(__m256i *dst, const uint64_t *src);
static void hadamard(__m256i *src, __m256i *dst);
static uint32_t find_peaks(__m256i *transform);


/**
 * @brief Map one message byte to one 128-bit RM(1,7) codeword
 *
 * Every set bit of the message XORs one row of the encoding matrix into the
 * codeword. The rows, shown as four 32-bit quarters
 * (note that bits are numbered big endian, i.e. low bits are at the left):
 * 0   aaaaaaaa aaaaaaaa aaaaaaaa aaaaaaaa
 * 1   cccccccc cccccccc cccccccc cccccccc
 * 2   f0f0f0f0 f0f0f0f0 f0f0f0f0 f0f0f0f0
 * 3   ff00ff00 ff00ff00 ff00ff00 ff00ff00
 * 4   ffff0000 ffff0000 ffff0000 ffff0000
 * 5   00000000 ffffffff 00000000 ffffffff
 * 6   00000000 00000000 ffffffff ffffffff
 * 7   ffffffff ffffffff ffffffff ffffffff
 *
 * No branch or memory index depends on the message value.
 *
 * @param[out] word Receives the 16 bytes of the codeword
 * @param[in] message The byte to encode
 */
static void encode(uint8_t *word, uint8_t message) {
    // rows 7 and 0..4 contribute the same 32-bit pattern to all four quarters
    uint32_t common = BIT0MASK(message >> 7);
    common ^= BIT0MASK(message >> 0) & 0xaaaaaaaa;
    common ^= BIT0MASK(message >> 1) & 0xcccccccc;
    common ^= BIT0MASK(message >> 2) & 0xf0f0f0f0;
    common ^= BIT0MASK(message >> 3) & 0xff00ff00;
    common ^= BIT0MASK(message >> 4) & 0xffff0000;

    // rows 5 and 6 either flip a whole quarter or leave it alone:
    // row 5 touches quarters 1 and 3, row 6 touches quarters 2 and 3
    const uint32_t row5 = BIT0MASK(message >> 5);
    const uint32_t row6 = BIT0MASK(message >> 6);
    const uint32_t quarter[4] = {
        common,
        common ^ row5,
        common ^ row6,
        common ^ row5 ^ row6
    };

    // write each quarter out least significant byte first
    for (size_t q = 0; q < 4; q++) {
        for (size_t byte = 0; byte < 4; byte++) {
            word[4 * q + byte] = (quarter[q] >> (8 * byte)) & 0xff;
        }
    }
}


/**
 * @brief Add multiple codewords into expanded codeword
 *
 * Each bit of every one of the MULTIPLICITY 128-bit codewords is widened to
 * a 16-bit lane, and the copies are summed lane by lane: lane `bit` of
 * dst[part] counts how many copies have bit `bit` set in their part-th
 * 16-bit chunk.
 *
 * Note: this does not write the codewords as -1 or +1 as the green machine does
 * instead, just 0 and 1 is used.
 * The resulting hadamard transform has:
 * all values are halved
 * the first entry is 64 too high for every summed copy
 * (the decoder subtracts 64 * MULTIPLICITY afterwards)
 *
 * @param[out] dst Eight vectors of sixteen 16-bit counters
 * @param[in] src MULTIPLICITY consecutive 128-bit codewords
 */
static inline void expand_and_sum(__m256i *dst, const uint64_t *src) {
    // The codewords are consumed as raw bytes: copying each 16-bit chunk out
    // with memcpy avoids reading uint64_t storage through a uint16_t lvalue
    // and keeps the const qualifier of src intact.
    const uint8_t *bytes = (const uint8_t *) src;
    uint16_t v[16];
    for (size_t part = 0; part < 8; part++) {
        dst[part] = _mm256_setzero_si256();
    }
    for (size_t copy = 0; copy < MULTIPLICITY; copy++) {
        for (size_t part = 0; part < 8; part++) {
            uint16_t chunk;
            // same bytes the original cast addressed: codeword `copy`, chunk `part`
            memcpy(&chunk, bytes + 16 * copy + 2 * part, sizeof chunk);
            for (size_t bit = 0; bit < 16; bit++) {
                v[bit] = (chunk >> bit) & 1;
            }
            // explicit 16-bit lane addition (the vector-extension `+=` on
            // __m256i adds 64-bit lanes and is compiler specific)
            dst[part] = _mm256_add_epi16(dst[part],
                                         _mm256_set_epi16(v[15], v[14], v[13], v[12], v[11], v[10], v[9], v[8],
                                                 v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]));
        }
    }
}


/**
 * @brief Hadamard transform
 *
 * Computes the Hadamard transform of the 128 16-bit entries in src and
 * leaves the result in dst. src doubles as scratch space and is clobbered;
 * src and dst must be distinct buffers because each pass reads one of them
 * while writing the other.
 *
 * The method is easiest to follow for H(3) instead of H(7).
 * The routine multiplies by the matrix H(3):
 *                     [1  1  1  1  1  1  1  1]
 *                     [1 -1  1 -1  1 -1  1 -1]
 *                     [1  1 -1 -1  1  1 -1 -1]
 * [a b c d e f g h] * [1 -1 -1  1  1 -1 -1  1] = result of routine
 *                     [1  1  1  1 -1 -1 -1 -1]
 *                     [1 -1  1 -1 -1  1 -1  1]
 *                     [1  1 -1 -1 -1 -1  1  1]
 *                     [1 -1 -1  1 -1  1  1 -1]
 * This takes three passes; every pass stores the sums of adjacent pairs in
 * the lower half of the output and their differences in the upper half:
 * index     0        1        2        3        4        5        6        7
 * input:    a,       b,       c,       d,       e,       f,       g,       h
 * pass 1:   a+b,     c+d,     e+f,     g+h,     a-b,     c-d,     e-f,     g-h
 * pass 2:   a+b+c+d, e+f+g+h, a-b+c-d, e-f+g-h, a+b-c-d, e+f-g-h, a-b-c+d, e-f-g+h
 * pass 3:   index 0: a+b+c+d+e+f+g+h   index 1: a-b+c-d+e-f+g-h
 *           index 2: a+b-c-d+e+f-g-h   index 3: a-b-c+d+e-f-g+h
 *           index 4: a+b+c+d-e-f-g-h   index 5: a-b+c-d-e+f-g+h
 *           index 6: a+b-c-d-e-f+g+h   index 7: a-b-c+d-e+f+g-h
 * This order of computation is chosen because it vectorises well.
 * In the same way, seven passes multiply by H(7).
 *
 * @param[in,out] src Expanded codeword; overwritten with intermediate data
 * @param[out] dst Receives the transform (the 7th pass writes here)
 */
inline void hadamard(__m256i *src, __m256i *dst) {
    // data ping-pongs between the two buffers:
    // src -> dst -> src -> dst -> src -> dst -> src -> dst
    // even passes read src and write dst, odd passes do the opposite
    __m256i *const buffers[2] = {src, dst};
    for (size_t pass = 0; pass < 7; pass++) {
        const __m256i *in = buffers[pass & 1];
        __m256i *out = buffers[(pass & 1) ^ 1];
        for (size_t pair = 0; pair < 4; pair++) {
            const __m256i even = in[2 * pair];
            const __m256i odd = in[2 * pair + 1];
            // hadd/hsub operate "within lanes" as Intel call it, so the two
            // middle 64 bit blocks of each result are swapped back (0xd8)
            out[pair] = _mm256_permute4x64_epi64(_mm256_hadd_epi16(even, odd), 0xd8);
            out[pair + 4] = _mm256_permute4x64_epi64(_mm256_hsub_epi16(even, odd), 0xd8);
        }
    }
}


/**
 * @brief Finding the location of the highest value
 *
 * This is the final step of the green machine: find the location of the highest value,
 * and add 128 if the peak is positive
 * Notes on decoding
 * The standard "Green machine" decoder works as follows:
 * if the received codeword is W, compute (2 * W - 1) * H7
 * The entries of the resulting vector are always even and vary from
 * -128 (= the complement is a code word, add bit 7 to decode)
 * via 0 (this is a different codeword)
 * to 128 (this is the code word).
 *
 * Our decoding differs in two ways:
 * - We take W instead of 2 * W - 1 (so the entries are 0,1 instead of -1,1)
 * - We take the sum of the repetitions (so the entries are 0..MULTIPLICITY)
 * This implies that we have to subtract 64M (M=MULTIPLICITY)
 * from the first entry to make sure the first codewords is handled properly
 * and that the entries vary from -64M to 64M.
 * -64M or 64M stands for a perfect codeword.
 * If there are fewer than 32M errors, there is always a unique codeword
 * which has an entry with absolute value > 32M;
 * this is because an error changes an entry by 1.
 * The highest number that seem to be decodable is 50 errors, so that the
 * highest entries in the hadamard transform can be as low as 12.
 * But this is different for the repeated code.
 * Because multiple codewords are added, this changes: the lowest value of the
 * hadamard transform of the sum of six words is seen to be as low as 43 (!),
 * which is way less than 12*6.
 *
 * It is possible that there are more errors, but the word is still uniquely
 * decodable: we found a word with distance of 50 from the nearest codeword.
 * That means that the highest entry can be as low as 14M.
 * Since we have to do binary search, we search for the range 1-64M
 * which can be done in 6+l2g(M) steps.
 * The binary search is based on (values>32M are unique):
 * M  32M     min>  max>  firstStep #steps
 * 2   64       1   64    33 +- 16    6
 * 4  128       1  128    65 +- 32    7
 * 6  192       1  192   129 +- 64    8
 *
 * As a check, we run a sample for M=6 to see the peak value; it ranged
 * from 43 to 147, so my analysis looks right. Also, it shows that decoding
 * far beyond the bound of 32M is needed.
 *
 * For the vectors, it would be tempting to use 8 bit ints,
 * because the values "almost" fit in there.
 * We could use some trickery to fit it in 8 bits, like saturated add or
 * division by 2 in a late step.
 * Unfortunately, these instructions do not exist.
 * the adds _mm512_adds_epi8 is available only on the latest processors,
 * and division, shift, mulhi are not available at all for 8 bits.
 * So, we use 16 bit ints.
 *
 * For the search of the optimal comparison value,
 * remember the transform contains 64M-d,
 * where d are the distances to the codewords.
 * The highest value gives the most likely codeword.
 * There is no fast vectorized way to find this value, so we search for the
 * maximum value itself.
 * In each pass, we collect a bit map of the transform values that are,
 * say >bound.  There are three cases:
 * bit map = 0: all code words are further away than 64M-bound (decrease bound)
 * bit map has one bit: one unique code word has distance < 64M-bound
 * bit map has multiple bits: multiple words (increase bound)
 * We will search for the lowest value of bound that gives a nonzero bit map.
 *
 * @param[in] transform Structure that contain the expanded codeword
 */
inline uint32_t find_peaks(__m256i *transform) {
    // Returns the decoded byte:
    //   bits 0..3  lane (column) of the peak inside its row
    //   bits 4..6  index of the row (0..7) that holds the peak
    //   bit  7     set when the sign bit of the peak entry is clear
    // All selections below use arithmetic masks instead of if/else.

    // a whole lot of vector variables
    __m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows;
    __m256i tmp = _mm256_setzero_si256();
    __m256i vect_mask;
    __m256i res;
    int32_t lower;
    int32_t width;
    uint32_t message;
    uint32_t mask;
    int8_t index;
    int8_t abs_value;
    int8_t mask1;
    int8_t mask2;
    uint16_t result;

    // compute absolute value of transform
    for (size_t i = 0; i < 8; i++) {
        abs_rows[i] = _mm256_abs_epi16(transform[i]);
    }
    // compute a vector of 16 elements which contains the maximum somewhere
    // (lane-wise maximum over the 8 rows; only used by the binary search)
    max_abs_rows = abs_rows[0];
    for (size_t i = 1; i < 8; i++) {
        max_abs_rows = _mm256_max_epi16(max_abs_rows, abs_rows[i]);
    }

    // do binary search for the highest value that is lower than the maximum
    // `lower` only moves up to a bound that still produced a nonzero bit map,
    // so provided some entry exceeds the initial value 1, some entry always
    // exceeds `lower`
    lower = 1;
    // this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6
    width = 1 << (5 + MULTIPLICITY / 2);
    // if you don't unroll this loop, it fits in the loop cache
    // uncomment the line below to speeding up the program by a few percent
    // #pragma GCC unroll 0
    while (width > 1) {
        width >>= 1;
        // compare with lower + width; put result in bitmap
        // make vector from value of new bound
        bound = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(lower + width));
        bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound);
        // step up if there are any matches
        // testz yields 1 for an all-zero bitmap; the next line turns that
        // into mask = 0 (no match) or mask = 0xffffffff (at least one match),
        // so the update of lower is a masked add rather than a branch
        mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
        mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
        lower += mask & width;
    }
    // lower+width contains the maximum value of the vector
    // or less, if the maximum is very high (which is OK)
    // normally, there is one maximum, but sometimes there are more
    // find where the maxima occur in the maximum vector
    // (each determines lower 4 bits of peak position)
    // width is 1 when the loop exits, so this broadcasts `lower`;
    // entries strictly greater than it are treated as peaks below
    bound = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(lower + width - 1));

    // find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message
    // rows are visited from 7 down to 0 and every row with a match overwrites
    // the row number, so the lowest matching row wins; 0x70 (row 7) is kept
    // if no row matches
    message = 0x70;
    for (size_t i = 0; i < 8; i++) {
        bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound);
        mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
        mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
        // masked select: message becomes (7 - i) << 4 when mask is all ones
        message ^= mask & (message ^ ((7 - i) << 4));
    }
    // we decided which row of the matrix contains the lowest match
    // select proper row
    index = message >> 4;

    // copy abs_rows[index] into tmp without indexing by index:
    // every row is read and all but the selected one are masked to zero
    tmp = _mm256_setzero_si256();
    for (size_t i = 0; i < 8; i++) {
        // abs_value = |index - i|, computed with a sign mask
        abs_value = (int8_t)(index - i);
        mask1 = abs_value >> 7;
        abs_value ^= mask1;
        abs_value -= mask1;
        // mask2 is 1 when abs_value is nonzero, 0 when i == index
        mask2 = ((uint8_t) - abs_value >> 7);
        // truncated to 32 bits: all ones for i == index, zero otherwise
        mask = (-1ULL) + mask2;
        vect_mask = _mm256_set1_epi32(mask);
        res = _mm256_and_si256(abs_rows[i], vect_mask);
        tmp = _mm256_or_si256(tmp, res);
    }

    active_row = tmp;

    // get the column number of the vector element
    // by setting the bits corresponding to the columns
    // and then adding elements within two groups of 8
    // (lane k keeps only bit k, so each sum is the OR of its group's bits)
    vect_mask = _mm256_cmpgt_epi16(active_row, bound);
    vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
    for (size_t i = 0; i < 3; i++) {
        vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask);
    }
    // add low 4 bits of message
    // lane 0 holds the bits of columns 0..7 and lane 8 those of columns 8..15;
    // the trailing zero count of their sum is the lowest matching column
    message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8));

    // set bit 7 if sign of biggest value is positive
    // make sure a jump isn't generated by the compiler
    // step 1: masked copy of the signed row transform[message / 16] into tmp
    tmp = _mm256_setzero_si256();
    for (size_t i = 0; i < 8; i++) {
        // mask is all ones only when i equals the row number
        mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63);
        vect_mask = _mm256_set1_epi32(mask);
        tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i]));
    }
    // step 2: masked copy of 16-bit lane message % 16 of that row into result
    result = 0;
    for (size_t i = 0; i < 16; i++) {
        mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63);
        result |= mask & ((uint16_t *)&tmp)[i];
    }
    // bit 15 of result is the sign of the peak; when it is clear, 0x80 is set
    message |= (0x8000 & ~result) >> 8;
    return message;
}


/**
 * @brief Encodes a message with the repeated Reed-Muller code
 *
 * Each of the VEC_N1_SIZE_BYTES message bytes is encoded into one 128-bit
 * RM(1,7) codeword, which is then stored MULTIPLICITY times back to back
 * (the message consists of N1 bytes, each encoded into PARAM_N2 bits).
 *
 * @param[out] cdw Receives 16 * MULTIPLICITY bytes for every message byte
 * @param[in] msg Message to encode; VEC_N1_SIZE_BYTES bytes are read
 */
void PQCLEAN_HQCRMRS256_AVX2_reed_muller_encode(uint8_t *cdw, const uint8_t *msg) {
    for (size_t i = 0; i < VEC_N1_SIZE_BYTES; i++) {
        // start of the group of repeated codewords belonging to byte i
        uint8_t *first = &cdw[16 * i * MULTIPLICITY];
        // the first slot gets the freshly encoded codeword
        encode(first, msg[i]);
        // the remaining slots are plain copies of the first one
        for (size_t copy = 1; copy < MULTIPLICITY; copy++) {
            memcpy(first + 16 * copy, first, 16);
        }
    }
}


/**
 * @brief Decodes the received word
 *
 * Decoding uses fast hadamard transform, for a more complete picture on Reed-Muller decoding, see MacWilliams, Florence Jessie, and Neil James Alexander Sloane.
 * The theory of error-correcting codes @cite macwilliams1977theory
 *
 * For every message byte the MULTIPLICITY repeated codewords are summed,
 * transformed, corrected in the first entry and handed to the peak search.
 *
 * @param[out] msg Receives VEC_N1_SIZE_BYTES decoded bytes
 * @param[in] cdw Received word; 16 * MULTIPLICITY bytes are read per message byte
 */
void PQCLEAN_HQCRMRS256_AVX2_reed_muller_decode(uint8_t *msg, const uint8_t *cdw) {
    __m256i expanded[8];
    __m256i transform[8];
    // aligned scratch copy of one group of repeated codewords, so that no
    // const uint8_t pointer has to be cast to a (possibly misaligned) uint64_t *
    uint64_t group[2 * (MULTIPLICITY)];
    // correction for the first transform entry: only 16-bit lane 0 is non-zero
    const __m256i first_entry_offset = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0,
                                       0, 0, 0, 0, 0, 0, 0, 64 * MULTIPLICITY);
    for (size_t i = 0; i < VEC_N1_SIZE_BYTES; i++) {
        // collect the codewords
        memcpy(group, &cdw[16 * i * MULTIPLICITY], sizeof group);
        expand_and_sum(expanded, group);
        // apply hadamard transform
        hadamard(expanded, transform);
        // fix the first entry to get the half Hadamard transform
        // This must be a 16-bit lane subtraction: the vector-extension `-=`
        // on __m256i subtracts 64-bit lanes, so a first entry smaller than
        // the offset would borrow from the neighbouring entries.
        transform[0] = _mm256_sub_epi16(transform[0], first_entry_offset);
        // finish the decoding
        msg[i] = find_peaks(transform);
    }
}
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`#include "parameters.h"`
			`#include "reed_muller.h"`
			`#include <immintrin.h>`
			`#include <stdint.h>`
			`#include <string.h>`
			`/**`
			`* @file reed_muller.c`
			`* Constant time implementation of Reed-Muller code RM(1,7)`
			`*/`


			`// number of repeated code words`
			`#define MULTIPLICITY CEIL_DIVIDE(PARAM_N2, 128)`

			`// copy bit 0 into all bits of a 64 bit value`
			`#define BIT0MASK(x) (int64_t)(-((x) & 1))`

uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`static void encode(uint8_t *word, uint8_t message);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`static void expand_and_sum(__m256i dst, const uint64_t src);`
			`static void hadamard(__m256i src, __m256i dst);`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`static uint32_t find_peaks(__m256i *transform);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00


			`/**`
			`* @brief Encode a single byte into a single codeword using RM(1,7)`
			`*`
			`* Encoding matrix of this code:`
			`* bit pattern (note that bits are numbered big endian)`
			`* 0 aaaaaaaa aaaaaaaa aaaaaaaa aaaaaaaa`
			`* 1 cccccccc cccccccc cccccccc cccccccc`
			`* 2 f0f0f0f0 f0f0f0f0 f0f0f0f0 f0f0f0f0`
			`* 3 ff00ff00 ff00ff00 ff00ff00 ff00ff00`
			`* 4 ffff0000 ffff0000 ffff0000 ffff0000`
			`* 5 00000000 ffffffff 00000000 ffffffff`
			`* 6 00000000 00000000 ffffffff ffffffff`
			`* 7 ffffffff ffffffff ffffffff ffffffff`
			`*`
			`* @param[out] word An RM(1,7) codeword`
			`* @param[in] message A message to encode`
			`*/`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`static void encode(uint8_t *word, uint8_t message) {`
			`uint32_t e;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// bit 7 flips all the bits, do that first to save work`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`e = BIT0MASK(message >> 7);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// bits 0, 1, 2, 3, 4 are the same for all four longs`
			`// (Warning: in the bit matrix above, low bits are at the left!)`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`e ^= BIT0MASK(message >> 0) & 0xaaaaaaaa;`
			`e ^= BIT0MASK(message >> 1) & 0xcccccccc;`
			`e ^= BIT0MASK(message >> 2) & 0xf0f0f0f0;`
			`e ^= BIT0MASK(message >> 3) & 0xff00ff00;`
			`e ^= BIT0MASK(message >> 4) & 0xffff0000;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// we can store this in the first quarter`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`word[0 + 0] = (e >> 0x00) & 0xff;`
			`word[0 + 1] = (e >> 0x08) & 0xff;`
			`word[0 + 2] = (e >> 0x10) & 0xff;`
			`word[0 + 3] = (e >> 0x18) & 0xff;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// bit 5 flips entries 1 and 3; bit 6 flips 2 and 3`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`e ^= BIT0MASK(message >> 5);`
			`word[4 + 0] = (e >> 0x00) & 0xff;`
			`word[4 + 1] = (e >> 0x08) & 0xff;`
			`word[4 + 2] = (e >> 0x10) & 0xff;`
			`word[4 + 3] = (e >> 0x18) & 0xff;`
			`e ^= BIT0MASK(message >> 6);`
			`word[12 + 0] = (e >> 0x00) & 0xff;`
			`word[12 + 1] = (e >> 0x08) & 0xff;`
			`word[12 + 2] = (e >> 0x10) & 0xff;`
			`word[12 + 3] = (e >> 0x18) & 0xff;`
			`e ^= BIT0MASK(message >> 5);`
			`word[8 + 0] = (e >> 0x00) & 0xff;`
			`word[8 + 1] = (e >> 0x08) & 0xff;`
			`word[8 + 2] = (e >> 0x10) & 0xff;`
			`word[8 + 3] = (e >> 0x18) & 0xff;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`



			`/**`
			`* @brief Add multiple codewords into expanded codeword`
			`*`
			`* Note: this does not write the codewords as -1 or +1 as the green machine does`
			`* instead, just 0 and 1 is used.`
			`* The resulting hadamard transform has:`
			`* all values are halved`
			`* the first entry is 64 too high`
			`*`
			`* @param[out] dst Structure that contain the expanded codeword`
			`* @param[in] src Structure that contain the codeword`
			`*/`
			`inline void expand_and_sum(__m256i dst, const uint64_t src) {`
			`uint16_t v[16];`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t part = 0; part < 8; part++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`dst[part] = _mm256_setzero_si256();`
			`}`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t copy = 0; copy < MULTIPLICITY; copy++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`for (size_t part = 0; part < 8; part++) {`
			`for (size_t bit = 0; bit < 16; bit++) {`
			`v[bit] = (((uint16_t )(&src[2 copy]))[part] >> bit) & 1;`
			`}`
			`dst[part] += _mm256_set_epi16(v[15], v[14], v[13], v[12], v[11], v[10], v[9], v[8],`
			`v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);`
			`}`
			`}`
			`}`



			`/**`
			`* @brief Hadamard transform`
			`*`
			`* Perform hadamard transform of src and store result in dst`
			`* src is overwritten: it is also used as intermediate buffer`
			`* Method is best explained if we use H(3) instead of H(7):`
			`*`
			`* The routine multiplies by the matrix H(3):`
			`* [1 1 1 1 1 1 1 1]`
			`* [1 -1 1 -1 1 -1 1 -1]`
			`* [1 1 -1 -1 1 1 -1 -1]`
			`* [a b c d e f g h] * [1 -1 -1 1 1 -1 -1 1] = result of routine`
			`* [1 1 1 1 -1 -1 -1 -1]`
			`* [1 -1 1 -1 -1 1 -1 1]`
			`* [1 1 -1 -1 -1 -1 1 1]`
			`* [1 -1 -1 1 -1 1 1 -1]`
			`* You can do this in three passes, where each pass does this:`
			`* set lower half of buffer to pairwise sums,`
			`* and upper half to differences`
			`* index 0 1 2 3 4 5 6 7`
			`* input: a, b, c, d, e, f, g, h`
			`* pass 1: a+b, c+d, e+f, g+h, a-b, c-d, e-f, g-h`
			`* pass 2: a+b+c+d, e+f+g+h, a-b+c-d, e-f+g-h, a+b-c-d, e+f-g-h, a-b-c+d, e-f-g+h`
			`* pass 3: a+b+c+d+e+f+g+h a+b-c-d+e+f-g-h a+b+c+d-e-f-g-h a+b-c-d-e+-f+g+h`
			`* a-b+c-d+e-f+g-h a-b-c+d+e-f-g+h a-b+c-d-e+f-g+h a-b-c+d-e+f+g-h`
			`* This order of computation is chosen because it vectorises well.`
			`* Likewise, this routine multiplies by H(7) in seven passes.`
			`*`
			`* @param[out] src Structure that contain the expanded codeword`
			`* @param[out] dst Structure that contain the expanded codeword`
			`*/`
			`inline void hadamard(__m256i src, __m256i dst) {`
			`// the passes move data:`
			`// src -> dst -> src -> dst -> src -> dst -> src -> dst`
			`// using p1 and p2 alternately`
			`__m256i *p1 = src;`
			`__m256i *p2 = dst;`
			`__m256i *p3;`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t pass = 0; pass < 7; pass++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// warning: hadd works "within lanes" as Intel call it`
			`// so you have to swap the middle 64 bit blocks of the result`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t part = 0; part < 4; part++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`p2[part] = _mm256_permute4x64_epi64(_mm256_hadd_epi16(p1[2 * part], p1[2 * part + 1]), 0xd8);`
			`p2[part + 4] = _mm256_permute4x64_epi64(_mm256_hsub_epi16(p1[2 * part], p1[2 * part + 1]), 0xd8);`
			`}`
			`// swap p1, p2 for next round`
			`p3 = p1;`
			`p1 = p2;`
			`p2 = p3;`
			`}`
			`}`



			`/**`
			`* @brief Finding the location of the highest value`
			`*`
			`* This is the final step of the green machine: find the location of the highest value,`
			`* and add 128 if the peak is positive`
			`* Notes on decoding`
			`* The standard "Green machine" decoder words as follows:`
			`* if the received codeword is W, compute (2 * W - 1) * H7`
			`* The entries of the resulting vector are always even and vary from`
			`* -128 (= the complement is a code word, add bit 7 to decode)`
			`* via 0 (this is a different codeword)`
			`* to 128 (this is the code word).`
			`*`
			`* Our decoding differs in two ways:`
			`* - We take W instead of 2 * W - 1 (so the entries are 0,1 instead of -1,1)`
			`* - We take the sum of the repititions (so the entries are 0..MULTIPLICITY)`
			`* This implies that we have to subtract 64M (M=MULTIPLICITY)`
			`* from the first entry to make sure the first codewords is handled properly`
			`* and that the entries vary from -64M to 64M.`
			`* -64M or 64M stands for a perfect codeword.`
			`* If there are fewer than 32M errors, there is always a unique codeword`
			`* which an entry with absolute value > 32M;`
			`* this is because an error changes an entry by 1.`
			`* The highest number that seem to be decodable is 50 errors, so that the`
			`* highest entries in the hadamard transform can be as low as 12.`
			`* But this is different for the repeated code.`
			`* Because multiple codewords are added, this changes: the lowest value of the`
			`* hadamard transform of the sum of six words is seen to be as low as 43 (!),`
			`* which is way less than 12*6.`
			`*`
			`* It is possible that there are more errors, but the word is still uniquely`
			`* decodable: we found a word with distance of 50 from the nearest codeword.`
			`* That means that the highest entry can be as low as 14M.`
			`* Since we have to do binary search, we search for the range 1-64M`
			`* which can be done in 6+l2g(M) steps.`
			`* The binary search is based on (values>32M are unique):`
			`* M 32M min> max> firstStep #steps`
			`* 2 64 1 64 33 +- 16 6`
			`* 4 128 1 128 65 +- 32 7`
			`* 6 192 1 192 129 +- 64 8`
			`*`
			`* As a check, we run a sample for M=6 to see the peak value; it ranged`
			`* from 43 to 147, so my analysis looks right. Also, it shows that decoding`
			`* far beyond the bound of 32M is needed.`
			`*`
			`* For the vectors, it would be tempting to use 8 bit ints,`
			`* because the values "almost" fit in there.`
			`* We could use some trickery to fit it in 8 bits, like saturated add or`
			`* division by 2 in a late step.`
			`* Unfortunately, these instructions do not exist.`
			`* the adds _mm512_adds_epi8 is available only on the latest processors,`
			`* and division, shift, mulhi are not available at all for 8 bits.`
			`* So, we use 16 bit ints.`
			`*`
			`* For the search of the optimal comparison value,`
			`* remember the transform contains 64M-d,`
			`* where d are the distances to the codewords.`
			`* The highest value gives the most likely codeword.`
			`* There is not fast vectorized way to find this value, so we search for the`
			`* maximum value itself.`
			`* In each pass, we collect a bit map of the transform values that are,`
			`* say >bound. There are three cases:`
			`* bit map = 0: all code words are further away than 64M-bound (decrease bound)`
			`* bit map has one bit: one unique code word has distance < 64M-bound`
			`* bit map has multiple bits: multiple words (increase bound)`
			`* We will search for the lowest value of bound that gives a nonzero bit map.`
			`*`
			`* @param[in] transform Structure that contain the expanded codeword`
			`*/`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`inline uint32_t find_peaks(__m256i *transform) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// a whole lot of vector variables`
			`__m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows;`
readability changes 2020-09-15 15:33:06 +01:00			`__m256i tmp = _mm256_setzero_si256();`
			`__m256i vect_mask;`
			`__m256i res;`
			`int32_t lower;`
			`int32_t width;`
			`uint32_t message;`
			`uint32_t mask;`
			`int8_t index;`
			`int8_t abs_value;`
			`int8_t mask1;`
			`int8_t mask2;`
			`uint16_t result;`

New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// compute absolute value of transform`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t i = 0; i < 8; i++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`abs_rows[i] = _mm256_abs_epi16(transform[i]);`
			`}`
			`// compute a vector of 16 elements which contains the maximum somewhere`
			`// (later used to compute bits 0 through 3 of message)`
			`max_abs_rows = abs_rows[0];`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t i = 1; i < 8; i++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`max_abs_rows = _mm256_max_epi16(max_abs_rows, abs_rows[i]);`
			`}`

			`// do binary search for the highest value that is lower than the maximum`
			`// loop invariant: lower gives bit map = 0, lower + width gives bit map > 0`
readability changes 2020-09-15 15:33:06 +01:00			`lower = 1;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6`
readability changes 2020-09-15 15:33:06 +01:00			`width = 1 << (5 + MULTIPLICITY / 2);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// if you don't unroll this loop, it fits in the loop cache`
			`// uncomment the line below to speeding up the program by a few percent`
			`// #pragma GCC unroll 0`
			`while (width > 1) {`
			`width >>= 1;`
			`// compare with lower + width; put result in bitmap`
			`// make vector from value of new bound`
			`bound = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(lower + width));`
			`bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound);`
			`// step up if there are any matches`
			`// rely on compiler to use conditional move here`
readability changes 2020-09-15 15:33:06 +01:00			`mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);`
			`mask = ~(uint32_t) ((-(int64_t) mask) >> 63);`
			`lower += mask & width;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`
			`// lower+width contains the maximum value of the vector`
			`// or less, if the maximum is very high (which is OK)`
			`// normally, there is one maximum, but sometimes there are more`
			`// find where the maxima occur in the maximum vector`
			`// (each determines lower 4 bits of peak position)`
			`// construct vector filled with bound-1`
			`bound = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(lower + width - 1));`

			`// find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message`
			`// find lowest value by searching backwards skip first check to save time`
readability changes 2020-09-15 15:33:06 +01:00			`message = 0x70;`
			`for (size_t i = 0; i < 8; i++) {`
			`bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound);`
			`mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);`
			`mask = ~(uint32_t) ((-(int64_t) mask) >> 63);`
			`message ^= mask & (message ^ ((7 - i) << 4));`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`
			`// we decided which row of the matrix contains the lowest match`
			`// select proper row`
readability changes 2020-09-15 15:33:06 +01:00			`index = message >> 4;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00
readability changes 2020-09-15 15:33:06 +01:00			`tmp = _mm256_setzero_si256();`
			`for (size_t i = 0; i < 8; i++) {`
			`abs_value = (int8_t)(index - i);`
			`mask1 = abs_value >> 7;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`abs_value ^= mask1;`
			`abs_value -= mask1;`
readability changes 2020-09-15 15:33:06 +01:00			`mask2 = ((uint8_t) - abs_value >> 7);`
			`mask = (-1ULL) + mask2;`
			`vect_mask = _mm256_set1_epi32(mask);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`res = _mm256_and_si256(abs_rows[i], vect_mask);`
			`tmp = _mm256_or_si256(tmp, res);`
			`}`

			`active_row = tmp;`

			`// get the column number of the vector element`
			`// by setting the bits corresponding to the columns`
			`// and then adding elements within two groups of 8`
readability changes 2020-09-15 15:33:06 +01:00			`vect_mask = _mm256_cmpgt_epi16(active_row, bound);`
			`vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);`
			`for (size_t i = 0; i < 3; i++) {`
			`vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`
			`// add low 4 bits of message`
readability changes 2020-09-15 15:33:06 +01:00			`message \|= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8));`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00
			`// set bit 7 if sign of biggest value is positive`
			`// make sure a jump isn't generated by the compiler`
readability changes 2020-09-15 15:33:06 +01:00			`tmp = _mm256_setzero_si256();`
			`for (size_t i = 0; i < 8; i++) {`
			`mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63);`
HQC: updated packaging script. should resolve #327 2020-10-05 17:55:34 +01:00			`vect_mask = _mm256_set1_epi32(mask);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i]));`
			`}`
readability changes 2020-09-15 15:33:06 +01:00			`result = 0;`
			`for (size_t i = 0; i < 16; i++) {`
			`mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63);`
			`result \|= mask & ((uint16_t *)&tmp)[i];`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`
			`message \|= (0x8000 & ~result) >> 8;`
readability changes 2020-09-15 15:33:06 +01:00			`return message;`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`



			`/**`
			`* @brief Encodes the received word`
			`*`
			`* The message consists of N1 bytes each byte is encoded into PARAM_N2 bits,`
			`* or MULTIPLICITY repeats of 128 bits`
			`*`
			`* @param[out] cdw Array of size VEC_N1N2_SIZE_64 receiving the encoded message`
			`* @param[in] msg Array of size VEC_N1_SIZE_64 storing the message`
			`*/`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`void PQCLEAN_HQCRMRS256_AVX2_reed_muller_encode(uint8_t cdw, const uint8_t msg) {`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t i = 0; i < VEC_N1_SIZE_BYTES; i++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// encode first word`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`encode(&cdw[16 * i * MULTIPLICITY], msg[i]);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// copy to other identical codewords`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t copy = 1; copy < MULTIPLICITY; copy++) {`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`memcpy(&cdw[16 * i * MULTIPLICITY + 16 * copy], &cdw[16 * i * MULTIPLICITY], 16);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`
			`}`
			`}`



			`/**`
			`* @brief Decodes the received word`
			`*`
			`* Decoding uses fast hadamard transform, for a more complete picture on Reed-Muller decoding, see MacWilliams, Florence Jessie, and Neil James Alexander Sloane.`
			`* The theory of error-correcting codes codes @cite macwilliams1977theory`
			`*`
			`* @param[out] msg Array of size VEC_N1_SIZE_64 receiving the decoded message`
			`* @param[in] cdw Array of size VEC_N1N2_SIZE_64 storing the received word`
			`*/`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`void PQCLEAN_HQCRMRS256_AVX2_reed_muller_decode(uint8_t msg, const uint8_t cdw) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`__m256i expanded[8];`
			`__m256i transform[8];`
remove spaces before semicolons 2020-09-10 21:36:42 +01:00			`for (size_t i = 0; i < VEC_N1_SIZE_BYTES; i++) {`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// collect the codewords`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`expand_and_sum(expanded, (uint64_t )&cdw[16 i * MULTIPLICITY]);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`// apply hadamard transform`
			`hadamard(expanded, transform);`
			`// fix the first entry to get the half Hadamard transform`
			`transform[0] -= _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 64 * MULTIPLICITY);`
			`// finish the decoding`
uint8_t api for encode/decode in optimized rmrs 2020-09-13 19:13:48 +01:00			`msg[i] = find_peaks(transform);`
New HQC and HQC-RMRS from upstream 2020-09-07 19:23:34 +01:00			`}`
			`}`