390 lines
15 KiB
C
390 lines
15 KiB
C
#include "parameters.h"
|
|
#include "reed_muller.h"
|
|
#include <immintrin.h>
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
/**
|
|
* @file reed_muller.c
|
|
* Constant time implementation of Reed-Muller code RM(1,7)
|
|
*/
|
|
|
|
|
|
// number of repeated code words
|
|
#define MULTIPLICITY CEIL_DIVIDE(PARAM_N2, 128)
|
|
|
|
// copy bit 0 into all bits of a 64 bit value
|
|
#define BIT0MASK(x) (int64_t)(-((x) & 1))
|
|
|
|
static void encode(uint8_t *word, uint8_t message);
|
|
static void expand_and_sum(__m256i *dst, const uint64_t *src);
|
|
static void hadamard(__m256i *src, __m256i *dst);
|
|
static uint32_t find_peaks(__m256i *transform);
|
|
|
|
|
|
|
|
/**
|
|
* @brief Encode a single byte into a single codeword using RM(1,7)
|
|
*
|
|
* Encoding matrix of this code:
|
|
* bit pattern (note that bits are numbered big endian)
|
|
* 0 aaaaaaaa aaaaaaaa aaaaaaaa aaaaaaaa
|
|
* 1 cccccccc cccccccc cccccccc cccccccc
|
|
* 2 f0f0f0f0 f0f0f0f0 f0f0f0f0 f0f0f0f0
|
|
* 3 ff00ff00 ff00ff00 ff00ff00 ff00ff00
|
|
* 4 ffff0000 ffff0000 ffff0000 ffff0000
|
|
* 5 00000000 ffffffff 00000000 ffffffff
|
|
* 6 00000000 00000000 ffffffff ffffffff
|
|
* 7 ffffffff ffffffff ffffffff ffffffff
|
|
*
|
|
* @param[out] word An RM(1,7) codeword
|
|
* @param[in] message A message to encode
|
|
*/
|
|
static void encode(uint8_t *word, uint8_t message) {
|
|
uint32_t e;
|
|
// bit 7 flips all the bits, do that first to save work
|
|
e = BIT0MASK(message >> 7);
|
|
// bits 0, 1, 2, 3, 4 are the same for all four longs
|
|
// (Warning: in the bit matrix above, low bits are at the left!)
|
|
e ^= BIT0MASK(message >> 0) & 0xaaaaaaaa;
|
|
e ^= BIT0MASK(message >> 1) & 0xcccccccc;
|
|
e ^= BIT0MASK(message >> 2) & 0xf0f0f0f0;
|
|
e ^= BIT0MASK(message >> 3) & 0xff00ff00;
|
|
e ^= BIT0MASK(message >> 4) & 0xffff0000;
|
|
// we can store this in the first quarter
|
|
word[0 + 0] = (e >> 0x00) & 0xff;
|
|
word[0 + 1] = (e >> 0x08) & 0xff;
|
|
word[0 + 2] = (e >> 0x10) & 0xff;
|
|
word[0 + 3] = (e >> 0x18) & 0xff;
|
|
// bit 5 flips entries 1 and 3; bit 6 flips 2 and 3
|
|
e ^= BIT0MASK(message >> 5);
|
|
word[4 + 0] = (e >> 0x00) & 0xff;
|
|
word[4 + 1] = (e >> 0x08) & 0xff;
|
|
word[4 + 2] = (e >> 0x10) & 0xff;
|
|
word[4 + 3] = (e >> 0x18) & 0xff;
|
|
e ^= BIT0MASK(message >> 6);
|
|
word[12 + 0] = (e >> 0x00) & 0xff;
|
|
word[12 + 1] = (e >> 0x08) & 0xff;
|
|
word[12 + 2] = (e >> 0x10) & 0xff;
|
|
word[12 + 3] = (e >> 0x18) & 0xff;
|
|
e ^= BIT0MASK(message >> 5);
|
|
word[8 + 0] = (e >> 0x00) & 0xff;
|
|
word[8 + 1] = (e >> 0x08) & 0xff;
|
|
word[8 + 2] = (e >> 0x10) & 0xff;
|
|
word[8 + 3] = (e >> 0x18) & 0xff;
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* @brief Add multiple codewords into expanded codeword
|
|
*
|
|
* Note: this does not write the codewords as -1 or +1 as the green machine does
|
|
* instead, just 0 and 1 is used.
|
|
* The resulting hadamard transform has:
|
|
* all values are halved
|
|
* the first entry is 64 too high
|
|
*
|
|
* @param[out] dst Structure that contain the expanded codeword
|
|
* @param[in] src Structure that contain the codeword
|
|
*/
|
|
inline void expand_and_sum(__m256i *dst, const uint64_t *src) {
|
|
uint16_t v[16];
|
|
for (size_t part = 0; part < 8; part++) {
|
|
dst[part] = _mm256_setzero_si256();
|
|
}
|
|
for (size_t copy = 0; copy < MULTIPLICITY; copy++) {
|
|
for (size_t part = 0; part < 8; part++) {
|
|
for (size_t bit = 0; bit < 16; bit++) {
|
|
v[bit] = (((uint16_t *)(&src[2 * copy]))[part] >> bit) & 1;
|
|
}
|
|
dst[part] += _mm256_set_epi16(v[15], v[14], v[13], v[12], v[11], v[10], v[9], v[8],
|
|
v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* @brief Hadamard transform
|
|
*
|
|
* Perform hadamard transform of src and store result in dst
|
|
* src is overwritten: it is also used as intermediate buffer
|
|
* Method is best explained if we use H(3) instead of H(7):
|
|
*
|
|
* The routine multiplies by the matrix H(3):
|
|
* [1 1 1 1 1 1 1 1]
|
|
* [1 -1 1 -1 1 -1 1 -1]
|
|
* [1 1 -1 -1 1 1 -1 -1]
|
|
* [a b c d e f g h] * [1 -1 -1 1 1 -1 -1 1] = result of routine
|
|
* [1 1 1 1 -1 -1 -1 -1]
|
|
* [1 -1 1 -1 -1 1 -1 1]
|
|
* [1 1 -1 -1 -1 -1 1 1]
|
|
* [1 -1 -1 1 -1 1 1 -1]
|
|
* You can do this in three passes, where each pass does this:
|
|
* set lower half of buffer to pairwise sums,
|
|
* and upper half to differences
|
|
* index 0 1 2 3 4 5 6 7
|
|
* input: a, b, c, d, e, f, g, h
|
|
* pass 1: a+b, c+d, e+f, g+h, a-b, c-d, e-f, g-h
|
|
* pass 2: a+b+c+d, e+f+g+h, a-b+c-d, e-f+g-h, a+b-c-d, e+f-g-h, a-b-c+d, e-f-g+h
|
|
* pass 3: a+b+c+d+e+f+g+h a+b-c-d+e+f-g-h a+b+c+d-e-f-g-h a+b-c-d-e+-f+g+h
|
|
* a-b+c-d+e-f+g-h a-b-c+d+e-f-g+h a-b+c-d-e+f-g+h a-b-c+d-e+f+g-h
|
|
* This order of computation is chosen because it vectorises well.
|
|
* Likewise, this routine multiplies by H(7) in seven passes.
|
|
*
|
|
* @param[out] src Structure that contain the expanded codeword
|
|
* @param[out] dst Structure that contain the expanded codeword
|
|
*/
|
|
inline void hadamard(__m256i *src, __m256i *dst) {
|
|
// the passes move data:
|
|
// src -> dst -> src -> dst -> src -> dst -> src -> dst
|
|
// using p1 and p2 alternately
|
|
__m256i *p1 = src;
|
|
__m256i *p2 = dst;
|
|
__m256i *p3;
|
|
for (size_t pass = 0; pass < 7; pass++) {
|
|
// warning: hadd works "within lanes" as Intel call it
|
|
// so you have to swap the middle 64 bit blocks of the result
|
|
for (size_t part = 0; part < 4; part++) {
|
|
p2[part] = _mm256_permute4x64_epi64(_mm256_hadd_epi16(p1[2 * part], p1[2 * part + 1]), 0xd8);
|
|
p2[part + 4] = _mm256_permute4x64_epi64(_mm256_hsub_epi16(p1[2 * part], p1[2 * part + 1]), 0xd8);
|
|
}
|
|
// swap p1, p2 for next round
|
|
p3 = p1;
|
|
p1 = p2;
|
|
p2 = p3;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* @brief Finding the location of the highest value
|
|
*
|
|
* This is the final step of the green machine: find the location of the highest value,
|
|
* and add 128 if the peak is positive
|
|
* Notes on decoding
|
|
* The standard "Green machine" decoder words as follows:
|
|
* if the received codeword is W, compute (2 * W - 1) * H7
|
|
* The entries of the resulting vector are always even and vary from
|
|
* -128 (= the complement is a code word, add bit 7 to decode)
|
|
* via 0 (this is a different codeword)
|
|
* to 128 (this is the code word).
|
|
*
|
|
* Our decoding differs in two ways:
|
|
* - We take W instead of 2 * W - 1 (so the entries are 0,1 instead of -1,1)
|
|
* - We take the sum of the repititions (so the entries are 0..MULTIPLICITY)
|
|
* This implies that we have to subtract 64M (M=MULTIPLICITY)
|
|
* from the first entry to make sure the first codewords is handled properly
|
|
* and that the entries vary from -64M to 64M.
|
|
* -64M or 64M stands for a perfect codeword.
|
|
* If there are fewer than 32M errors, there is always a unique codeword
|
|
* which an entry with absolute value > 32M;
|
|
* this is because an error changes an entry by 1.
|
|
* The highest number that seem to be decodable is 50 errors, so that the
|
|
* highest entries in the hadamard transform can be as low as 12.
|
|
* But this is different for the repeated code.
|
|
* Because multiple codewords are added, this changes: the lowest value of the
|
|
* hadamard transform of the sum of six words is seen to be as low as 43 (!),
|
|
* which is way less than 12*6.
|
|
*
|
|
* It is possible that there are more errors, but the word is still uniquely
|
|
* decodable: we found a word with distance of 50 from the nearest codeword.
|
|
* That means that the highest entry can be as low as 14M.
|
|
* Since we have to do binary search, we search for the range 1-64M
|
|
* which can be done in 6+l2g(M) steps.
|
|
* The binary search is based on (values>32M are unique):
|
|
* M 32M min> max> firstStep #steps
|
|
* 2 64 1 64 33 +- 16 6
|
|
* 4 128 1 128 65 +- 32 7
|
|
* 6 192 1 192 129 +- 64 8
|
|
*
|
|
* As a check, we run a sample for M=6 to see the peak value; it ranged
|
|
* from 43 to 147, so my analysis looks right. Also, it shows that decoding
|
|
* far beyond the bound of 32M is needed.
|
|
*
|
|
* For the vectors, it would be tempting to use 8 bit ints,
|
|
* because the values "almost" fit in there.
|
|
* We could use some trickery to fit it in 8 bits, like saturated add or
|
|
* division by 2 in a late step.
|
|
* Unfortunately, these instructions do not exist.
|
|
* the adds _mm512_adds_epi8 is available only on the latest processors,
|
|
* and division, shift, mulhi are not available at all for 8 bits.
|
|
* So, we use 16 bit ints.
|
|
*
|
|
* For the search of the optimal comparison value,
|
|
* remember the transform contains 64M-d,
|
|
* where d are the distances to the codewords.
|
|
* The highest value gives the most likely codeword.
|
|
* There is not fast vectorized way to find this value, so we search for the
|
|
* maximum value itself.
|
|
* In each pass, we collect a bit map of the transform values that are,
|
|
* say >bound. There are three cases:
|
|
* bit map = 0: all code words are further away than 64M-bound (decrease bound)
|
|
* bit map has one bit: one unique code word has distance < 64M-bound
|
|
* bit map has multiple bits: multiple words (increase bound)
|
|
* We will search for the lowest value of bound that gives a nonzero bit map.
|
|
*
|
|
* @param[in] transform Structure that contain the expanded codeword
|
|
*/
|
|
inline uint32_t find_peaks(__m256i *transform) {
|
|
// a whole lot of vector variables
|
|
__m256i bitmap, abs_rows[8], bound, active_row, max_abs_rows;
|
|
__m256i tmp = _mm256_setzero_si256();
|
|
__m256i vect_mask;
|
|
__m256i res;
|
|
int32_t lower;
|
|
int32_t width;
|
|
uint32_t message;
|
|
uint32_t mask;
|
|
int8_t index;
|
|
int8_t abs_value;
|
|
int8_t mask1;
|
|
int8_t mask2;
|
|
uint16_t result;
|
|
|
|
// compute absolute value of transform
|
|
for (size_t i = 0; i < 8; i++) {
|
|
abs_rows[i] = _mm256_abs_epi16(transform[i]);
|
|
}
|
|
// compute a vector of 16 elements which contains the maximum somewhere
|
|
// (later used to compute bits 0 through 3 of message)
|
|
max_abs_rows = abs_rows[0];
|
|
for (size_t i = 1; i < 8; i++) {
|
|
max_abs_rows = _mm256_max_epi16(max_abs_rows, abs_rows[i]);
|
|
}
|
|
|
|
// do binary search for the highest value that is lower than the maximum
|
|
// loop invariant: lower gives bit map = 0, lower + width gives bit map > 0
|
|
lower = 1;
|
|
// this gives 64, 128 or 256 for MULTIPLICITY = 2, 4, 6
|
|
width = 1 << (5 + MULTIPLICITY / 2);
|
|
// if you don't unroll this loop, it fits in the loop cache
|
|
// uncomment the line below to speeding up the program by a few percent
|
|
// #pragma GCC unroll 0
|
|
while (width > 1) {
|
|
width >>= 1;
|
|
// compare with lower + width; put result in bitmap
|
|
// make vector from value of new bound
|
|
bound = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(lower + width));
|
|
bitmap = _mm256_cmpgt_epi16(max_abs_rows, bound);
|
|
// step up if there are any matches
|
|
// rely on compiler to use conditional move here
|
|
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
|
|
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
|
|
lower += mask & width;
|
|
}
|
|
// lower+width contains the maximum value of the vector
|
|
// or less, if the maximum is very high (which is OK)
|
|
// normally, there is one maximum, but sometimes there are more
|
|
// find where the maxima occur in the maximum vector
|
|
// (each determines lower 4 bits of peak position)
|
|
// construct vector filled with bound-1
|
|
bound = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(lower + width - 1));
|
|
|
|
// find in which of the 8 groups a maximum occurs to compute bits 4, 5, 6 of message
|
|
// find lowest value by searching backwards skip first check to save time
|
|
message = 0x70;
|
|
for (size_t i = 0; i < 8; i++) {
|
|
bitmap = _mm256_cmpgt_epi16(abs_rows[7 - i], bound);
|
|
mask = (uint32_t) _mm256_testz_si256(bitmap, bitmap);
|
|
mask = ~(uint32_t) ((-(int64_t) mask) >> 63);
|
|
message ^= mask & (message ^ ((7 - i) << 4));
|
|
}
|
|
// we decided which row of the matrix contains the lowest match
|
|
// select proper row
|
|
index = message >> 4;
|
|
|
|
tmp = _mm256_setzero_si256();
|
|
for (size_t i = 0; i < 8; i++) {
|
|
abs_value = (int8_t)(index - i);
|
|
mask1 = abs_value >> 7;
|
|
abs_value ^= mask1;
|
|
abs_value -= mask1;
|
|
mask2 = ((uint8_t) - abs_value >> 7);
|
|
mask = (-1ULL) + mask2;
|
|
vect_mask = _mm256_set1_epi32(mask);
|
|
res = _mm256_and_si256(abs_rows[i], vect_mask);
|
|
tmp = _mm256_or_si256(tmp, res);
|
|
}
|
|
|
|
active_row = tmp;
|
|
|
|
// get the column number of the vector element
|
|
// by setting the bits corresponding to the columns
|
|
// and then adding elements within two groups of 8
|
|
vect_mask = _mm256_cmpgt_epi16(active_row, bound);
|
|
vect_mask &= _mm256_set_epi16(-32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1);
|
|
for (size_t i = 0; i < 3; i++) {
|
|
vect_mask = _mm256_hadd_epi16(vect_mask, vect_mask);
|
|
}
|
|
// add low 4 bits of message
|
|
message |= __tzcnt_u16(_mm256_extract_epi16(vect_mask, 0) + _mm256_extract_epi16(vect_mask, 8));
|
|
|
|
// set bit 7 if sign of biggest value is positive
|
|
// make sure a jump isn't generated by the compiler
|
|
tmp = _mm256_setzero_si256();
|
|
for (size_t i = 0; i < 8; i++) {
|
|
mask = ~(uint32_t) ((-(int64_t)(i ^ message / 16)) >> 63);
|
|
vect_mask = _mm256_set1_epi32(mask);
|
|
tmp = _mm256_or_si256(tmp, _mm256_and_si256(vect_mask, transform[i]));
|
|
}
|
|
result = 0;
|
|
for (size_t i = 0; i < 16; i++) {
|
|
mask = ~(uint32_t) ((-(int64_t)(i ^ message % 16)) >> 63);
|
|
result |= mask & ((uint16_t *)&tmp)[i];
|
|
}
|
|
message |= (0x8000 & ~result) >> 8;
|
|
return message;
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* @brief Encodes the received word
|
|
*
|
|
* The message consists of N1 bytes each byte is encoded into PARAM_N2 bits,
|
|
* or MULTIPLICITY repeats of 128 bits
|
|
*
|
|
* @param[out] cdw Array of size VEC_N1N2_SIZE_64 receiving the encoded message
|
|
* @param[in] msg Array of size VEC_N1_SIZE_64 storing the message
|
|
*/
|
|
void PQCLEAN_HQCRMRS192_AVX2_reed_muller_encode(uint8_t *cdw, const uint8_t *msg) {
|
|
for (size_t i = 0; i < VEC_N1_SIZE_BYTES; i++) {
|
|
// encode first word
|
|
encode(&cdw[16 * i * MULTIPLICITY], msg[i]);
|
|
// copy to other identical codewords
|
|
for (size_t copy = 1; copy < MULTIPLICITY; copy++) {
|
|
memcpy(&cdw[16 * i * MULTIPLICITY + 16 * copy], &cdw[16 * i * MULTIPLICITY], 16);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* @brief Decodes the received word
|
|
*
|
|
* Decoding uses fast hadamard transform, for a more complete picture on Reed-Muller decoding, see MacWilliams, Florence Jessie, and Neil James Alexander Sloane.
|
|
* The theory of error-correcting codes codes @cite macwilliams1977theory
|
|
*
|
|
* @param[out] msg Array of size VEC_N1_SIZE_64 receiving the decoded message
|
|
* @param[in] cdw Array of size VEC_N1N2_SIZE_64 storing the received word
|
|
*/
|
|
void PQCLEAN_HQCRMRS192_AVX2_reed_muller_decode(uint8_t *msg, const uint8_t *cdw) {
|
|
__m256i expanded[8];
|
|
__m256i transform[8];
|
|
for (size_t i = 0; i < VEC_N1_SIZE_BYTES; i++) {
|
|
// collect the codewords
|
|
expand_and_sum(expanded, (uint64_t *)&cdw[16 * i * MULTIPLICITY]);
|
|
// apply hadamard transform
|
|
hadamard(expanded, transform);
|
|
// fix the first entry to get the half Hadamard transform
|
|
transform[0] -= _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 64 * MULTIPLICITY);
|
|
// finish the decoding
|
|
msg[i] = find_peaks(transform);
|
|
}
|
|
}
|