pqc/crypto_kem/mceliece348864f/sse/fft.c
Thom Wiggers b3f9d4f8d6
Classic McEliece (#259)
* Add McEliece reference implementations

* Add Vec implementations of McEliece

* Add sse implementations

* Add AVX2 implementations

* Get rid of stuff not supported by Mac ABI

* restrict to two cores

* Ditch .data files

* Remove .hidden from all .S files

* speed up duplicate consistency tests by batching

* make cpuinfo more robust

* Hope to stabilize macos cpuinfo without ccache

* Revert "Hope to stabilize macos cpuinfo without ccache"

This reverts commit 6129c3cabe1abbc8b956bc87e902a698e32bf322.

* Just hardcode what's available at travis

* Fixed-size types in api.h

* namespace all header files in mceliece

* Ditch operations.h

* Get rid of static inline functions

* fixup! Ditch operations.h
2020-02-05 13:09:56 +01:00

156 lines
4.0 KiB
C

/*
This file is for the Gao-Mateer FFT
See http://www.math.clemson.edu/~sgao/papers/GM10.pdf
*/
#include "fft.h"
#include "vec.h"
#include "vec128.h"
/*
 * Radix-conversion stage of the FFT, performed in place.
 *
 * input:  in, polynomial in bitsliced form (GFBITS 64-bit words are touched)
 * output: in, overwritten with the result of applying the radix conversions
 *
 * Five rounds are executed.  In each round every word is folded with the
 * mask pairs of levels 4 down to the round number, and afterwards the whole
 * vector is multiplied by that round's scalars taken from scalars.inc.
 */
static void radix_conversions(uint64_t *in) {
    /* fold_mask[level][0..1]: bit groups that get shifted right by 2^level */
    const uint64_t fold_mask[5][2] = {
        {0x8888888888888888, 0x4444444444444444},
        {0xC0C0C0C0C0C0C0C0, 0x3030303030303030},
        {0xF000F000F000F000, 0x0F000F000F000F00},
        {0xFF000000FF000000, 0x00FF000000FF0000},
        {0xFFFF000000000000, 0x0000FFFF00000000}
    };

    /* scale[round]: factors applied at the end of each round */
    const uint64_t scale[5][GFBITS] = {
#include "scalars.inc"
    };

    int round, word, level;

    for (round = 0; round < 5; round++) {
        for (word = 0; word < GFBITS; word++) {
            uint64_t v = in[word];

            /* coarsest fold first, finishing at the current round's level */
            for (level = 4; level >= round; level--) {
                const int shift = 1 << level;

                v ^= (v & fold_mask[level][0]) >> shift;
                v ^= (v & fold_mask[level][1]) >> shift;
            }

            in[word] = v;
        }

        PQCLEAN_MCELIECE348864F_SSE_vec_mul(in, in, scale[round]); // scaling
    }
}
/*
 * Butterfly stage of the FFT.
 *
 * input:  in, result of applying the radix conversions to the input polynomial
 * output: out, evaluation results (by applying the FFT butterflies);
 *         rows 0..31 of out are written, GFBITS entries each
 */
static void butterflies(vec128 out[][ GFBITS ], const uint64_t *in) {
    const vec128 consts[ 32 ][ GFBITS ] = {
#include "consts.inc"
    };

    /* 6-bit bit-reversal permutation of 0..63 */
    const uint8_t reversal[64] = {
        0, 32, 16, 48,  8, 40, 24, 56,
        4, 36, 20, 52, 12, 44, 28, 60,
        2, 34, 18, 50, 10, 42, 26, 58,
        6, 38, 22, 54, 14, 46, 30, 62,
        1, 33, 17, 49,  9, 41, 25, 57,
        5, 37, 21, 53, 13, 45, 29, 61,
        3, 35, 19, 51, 11, 43, 27, 59,
        7, 39, 23, 55, 15, 47, 31, 63
    };

    vec128 prod[ GFBITS ];      /* scratch for the constant multiplications */
    uint64_t next_const = 0;    /* index of the next unused row of consts */
    int col, row, bit, level, half, base, off;

    // broadcast: each selected bit of in[bit] is stretched to an all-zero or
    // all-one 64-bit value, and two such values are combined per entry
    for (col = 0; col < 64; col += 4) {
        vec128 *even_row = out[col / 2 + 0];
        vec128 *odd_row = out[col / 2 + 1];

        for (bit = 0; bit < GFBITS; bit++) {
            uint64_t first, second;

            first = 0 - ((in[bit] >> reversal[col + 0]) & 1);
            second = 0 - ((in[bit] >> reversal[col + 2]) & 1);
            even_row[bit] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(first, second);

            first = 0 - ((in[bit] >> reversal[col + 1]) & 1);
            second = 0 - ((in[bit] >> reversal[col + 3]) & 1);
            odd_row[bit] = PQCLEAN_MCELIECE348864F_SSE_vec128_set2x(first, second);
        }
    }

    // first layer: butterfly on adjacent row pairs with consts[0], then
    // re-interleave the two rows via unpack_low / unpack_high
    for (row = 0; row < 32; row += 2) {
        vec128 *lo = out[row + 0];
        vec128 *hi = out[row + 1];

        PQCLEAN_MCELIECE348864F_SSE_vec128_mul(prod, hi, consts[ 0 ]);

        for (bit = 0; bit < GFBITS; bit++) {
            lo[bit] ^= prod[bit];
            hi[bit] ^= lo[bit];
        }

        for (bit = 0; bit < GFBITS; bit++) {
            const vec128 low = PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_low(lo[bit], hi[bit]);
            const vec128 high = PQCLEAN_MCELIECE348864F_SSE_vec128_unpack_high(lo[bit], hi[bit]);

            lo[bit] = low;
            hi[bit] = high;
        }
    }

    next_const += 1;

    // remaining layers: butterflies between rows that are `half` apart,
    // consuming `half` fresh rows of consts per layer
    for (level = 0; level <= 4; level++) {
        half = 1 << level;

        for (base = 0; base < 32; base += 2 * half) {
            for (off = 0; off < half; off++) {
                vec128 *lo = out[base + off];
                vec128 *hi = out[base + off + half];

                PQCLEAN_MCELIECE348864F_SSE_vec128_mul(prod, hi, consts[ next_const + off ]);

                for (bit = 0; bit < GFBITS; bit++) {
                    lo[bit] ^= prod[bit];
                    hi[bit] ^= lo[bit];
                }
            }
        }

        next_const += half;
    }

    // adding the part contributed by x^64
    vec128 powers[32][GFBITS] = {
#include "powers.inc"
    };

    for (row = 0; row < 32; row++) {
        for (bit = 0; bit < GFBITS; bit++) {
            out[row][bit] ^= powers[row][bit];
        }
    }
}
/*
 * FFT entry point: runs radix_conversions on in, then butterflies.
 *
 * input:  in, polynomial in bitsliced form; NOTE: in is overwritten in place
 *         by radix_conversions, so the caller's buffer is modified
 * output: out, filled by butterflies from the converted in
 */
void PQCLEAN_MCELIECE348864F_SSE_fft(vec128 out[][ GFBITS ], uint64_t *in) {
radix_conversions(in);
butterflies(out, in);
}