* Add McEliece reference implementations
* Add Vec implementations of McEliece
* Add sse implementations
* Add AVX2 implementations
* Get rid of stuff not supported by Mac ABI
* restrict to two cores
* Ditch .data files
* Remove .hidden from all .S files
* speed up duplicate consistency tests by batching
* make cpuinfo more robust
* Hope to stabilize macos cpuinfo without ccache
* Revert "Hope to stabilize macos cpuinfo without ccache"
This reverts commit 6129c3cabe.
* Just hardcode what's available at travis
* Fixed-size types in api.h
* namespace all header files in mceliece
* Ditch operations.h
* Get rid of static inline functions
* fixup! Ditch operations.h
kyber
@@ -28,7 +28,7 @@ version: 2.1 | |||
export CC=\"ccache ${CC}\" && | |||
pip3 install -r requirements.txt && | |||
mkdir test-results && | |||
cd test && python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=auto" | |||
cd test && python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=2" | |||
no_output_timeout: 2h | |||
- save_cache: | |||
key: v1-ccache-{{ .Environment.CIRCLE_JOB }} | |||
@@ -59,7 +59,7 @@ version: 2.1 | |||
pip3 install -r requirements.txt | |||
mkdir test-results | |||
cd test | |||
python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=auto | |||
python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=2 | |||
no_output_timeout: 2h | |||
- store_test_results: | |||
path: test/test-results | |||
@@ -0,0 +1,48 @@ | |||
name: Classic McEliece 348864 | |||
type: kem | |||
claimed-nist-level: 1 | |||
claimed-security: IND-CCA2 | |||
length-public-key: 261120 | |||
length-secret-key: 6452 | |||
length-ciphertext: 128 | |||
length-shared-secret: 32 | |||
nistkat-sha256: f0a166a9115a0c8481c85aee3fe901729a21a8a84a5d2b871fb99fc50223046b | |||
principal-submitters: | |||
- Daniel J. Bernstein | |||
- Tung Chou | |||
- Tanja Lange | |||
- Ingo von Maurich | |||
- Rafael Misoczki | |||
- Ruben Niederhagen | |||
- Edoardo Persichetti | |||
- Christiane Peters | |||
- Peter Schwabe | |||
- Nicolas Sendrier | |||
- Jakub Szefer | |||
- Wen Wang | |||
auxiliary-submitters: [] | |||
implementations: | |||
- name: clean | |||
version: SUPERCOP-20191221 | |||
- name: vec | |||
version: SUPERCOP-20191221 | |||
- name: sse | |||
version: SUPERCOP-20191221 | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- sse4_1 | |||
- popcnt | |||
- name: avx | |||
version: SUPERCOP-20191221 | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- avx2 | |||
- popcnt |
@@ -0,0 +1,16 @@ | |||
Public Domain. | |||
Authors of Classic McEliece in alphabetical order: | |||
Daniel J. Bernstein, University of Illinois at Chicago | |||
Tung Chou, Osaka University | |||
Tanja Lange, Technische Universiteit Eindhoven | |||
Ingo von Maurich, self | |||
Rafael Misoczki, Intel Corporation | |||
Ruben Niederhagen, Fraunhofer SIT | |||
Edoardo Persichetti, Florida Atlantic University | |||
Christiane Peters, self | |||
Peter Schwabe, Radboud University | |||
Nicolas Sendrier, Inria | |||
Jakub Szefer, Yale University | |||
Wen Wang, Yale University |
@@ -0,0 +1,42 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB = libmceliece348864_avx.a | |||
# C and assembly sources of the library; one entry per object in OBJECTS.
SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \
	fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c transpose.c \
	util.c uint32_sort.c vec.c vec128.c vec256.c \
	consts.S syndrome_asm.S transpose_64x256_sp_asm.S \
	transpose_64x64_asm.S update_asm.S vec128_mul_asm.S vec256_mul_asm.S \
	vec_mul_asm.S vec_mul_sp_asm.S vec_reduce_asm.S
HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ | |||
decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \ | |||
params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h \ | |||
vec128.h vec256.h vec.h \ | |||
consts.inc powers.inc scalars_2x.inc scalars.inc | |||
OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ | |||
fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o transpose.o \ | |||
util.o uint32_sort.o vec.o vec128.o vec256.o \ | |||
consts.o syndrome_asm.o transpose_64x256_sp_asm.o \ | |||
transpose_64x64_asm.o update_asm.o vec128_mul_asm.o vec256_mul_asm.o \ | |||
vec_mul_asm.o vec_mul_sp_asm.o vec_reduce_asm.o | |||
CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \ | |||
-Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ | |||
-I../../../common/ $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
%.o: %.S | |||
$(CC) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) |
@@ -0,0 +1,13 @@ | |||
#include "aes256ctr.h" | |||
void PQCLEAN_MCELIECE348864_AVX_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]) { | |||
aes256ctx state; | |||
aes256_keyexp(&state, key); | |||
aes256_ctr(out, outlen, nonce, &state); | |||
aes256_ctx_release(&state); | |||
} |
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_AES256CTR_H | |||
#define PQCLEAN_MCELIECE348864_AVX_AES256CTR_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "aes.h" | |||
void PQCLEAN_MCELIECE348864_AVX_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES] | |||
); | |||
#endif |
@@ -0,0 +1,32 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_API_H | |||
#define PQCLEAN_MCELIECE348864_AVX_API_H | |||
#include <stdint.h> | |||
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_ALGNAME "Classic McEliece 348864" | |||
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_PUBLICKEYBYTES 261120 | |||
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_SECRETKEYBYTES 6452 | |||
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_CIPHERTEXTBYTES 128 | |||
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_BYTES 32 | |||
/*
 * KEM entry points. Buffer sizes are given by the
 * PQCLEAN_MCELIECE348864_AVX_CRYPTO_* constants of this header:
 *   pk  : CRYPTO_PUBLICKEYBYTES    sk : CRYPTO_SECRETKEYBYTES
 *   c   : CRYPTO_CIPHERTEXTBYTES   key: CRYPTO_BYTES
 */

/* Encapsulation: reads pk, writes ciphertext c and shared secret key. */
int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(uint8_t *c, uint8_t *key, const uint8_t *pk);

/* Decapsulation: reads c and sk, writes shared secret key. */
int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(uint8_t *key, const uint8_t *c, const uint8_t *sk);

/* Key generation: writes pk and sk. */
int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
#endif | |||
@@ -0,0 +1,287 @@ | |||
/* | |||
This file is for Benes network related functions | |||
*/ | |||
#include "benes.h" | |||
#include "params.h" | |||
#include "transpose.h" | |||
#include "util.h" | |||
/*
 * Stride-1 layer: for j = 0..31, exchanges between bs[2j] and bs[2j+1]
 * exactly those bit positions that are set in cond[j]. Branch-free.
 */
static void layer_0(uint64_t *bs, const uint64_t *cond) {
    int j;

    for (j = 0; j < 32; j++) {
        uint64_t *pair = bs + 2 * j;
        const uint64_t swap = (pair[0] ^ pair[1]) & cond[j];

        pair[0] ^= swap;
        pair[1] ^= swap;
    }
}
/*
 * Stride-2 layer: within each group of 4 words, word k is paired with
 * word k + 2 (k = 0, 1). Bit positions set in the next control word are
 * exchanged between the pair; control words are consumed in pair order.
 */
static void layer_1(uint64_t *bs, const uint64_t *cond) {
    const int stride = 2;
    int base, k;

    for (base = 0; base < 64; base += 2 * stride) {
        for (k = 0; k < stride; k++) {
            uint64_t *lo = bs + base + k;
            uint64_t *hi = lo + stride;
            const uint64_t swap = (*lo ^ *hi) & *cond++;

            *lo ^= swap;
            *hi ^= swap;
        }
    }
}
/*
 * Stride-4 layer: within each group of 8 words, word k is paired with
 * word k + 4 (k = 0..3). Bit positions set in the next control word are
 * exchanged between the pair; control words are consumed in pair order.
 */
static void layer_2(uint64_t *bs, const uint64_t *cond) {
    const int stride = 4;
    int base, k;

    for (base = 0; base < 64; base += 2 * stride) {
        for (k = 0; k < stride; k++) {
            uint64_t *lo = bs + base + k;
            uint64_t *hi = lo + stride;
            const uint64_t swap = (*lo ^ *hi) & *cond++;

            *lo ^= swap;
            *hi ^= swap;
        }
    }
}
/*
 * Stride-8 layer: within each group of 16 words, word k is paired with
 * word k + 8 (k = 0..7). Bit positions set in the next control word are
 * exchanged between the pair; control words are consumed in pair order.
 */
static void layer_3(uint64_t *bs, const uint64_t *cond) {
    const int stride = 8;
    int base, k;

    for (base = 0; base < 64; base += 2 * stride) {
        for (k = 0; k < stride; k++) {
            uint64_t *lo = bs + base + k;
            uint64_t *hi = lo + stride;
            const uint64_t swap = (*lo ^ *hi) & *cond++;

            *lo ^= swap;
            *hi ^= swap;
        }
    }
}
/*
 * Stride-16 layer: within each group of 32 words, word k is paired with
 * word k + 16 (k = 0..15). Bit positions set in the next control word are
 * exchanged between the pair; control words are consumed in pair order.
 */
static void layer_4(uint64_t *bs, const uint64_t *cond) {
    const int stride = 16;
    int base, k;

    for (base = 0; base < 64; base += 2 * stride) {
        for (k = 0; k < stride; k++) {
            uint64_t *lo = bs + base + k;
            uint64_t *hi = lo + stride;
            const uint64_t swap = (*lo ^ *hi) & *cond++;

            *lo ^= swap;
            *hi ^= swap;
        }
    }
}
/*
 * Stride-32 layer: word k is paired with word k + 32 (k = 0..31).
 * Bit positions set in cond[k] are exchanged between the pair.
 */
static void layer_5(uint64_t *bs, const uint64_t *cond) {
    const int stride = 32;
    int k;

    for (k = 0; k < stride; k++) {
        uint64_t *lo = bs + k;
        uint64_t *hi = lo + stride;
        const uint64_t swap = (*lo ^ *hi) & cond[k];

        *lo ^= swap;
        *hi ^= swap;
    }
}
/* input: bits, control bits as array of bytes (23 blocks of 256 bytes) */
/* output: out, 23 rows of 32 64-bit words, one row per 256-byte block */
void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t out[][32], const unsigned char *bits) {
    uint64_t rows[64];
    int blk, i;

    for (blk = 0; blk < 23; blk++) {
        const unsigned char *src = bits + blk * 256;

        if (blk < 6 || blk >= 17) {
            /* first 6 and last 6 blocks: read as 64 four-byte words, */
            /* transpose, and keep the first 32 rows of the result    */
            for (i = 0; i < 64; i++) {
                rows[i] = PQCLEAN_MCELIECE348864_AVX_load4(src + i * 4);
            }

            PQCLEAN_MCELIECE348864_AVX_transpose_64x64(rows);

            for (i = 0; i < 32; i++) {
                out[ blk ][i] = rows[i];
            }
        } else {
            /* middle 11 blocks: read directly as 32 eight-byte words */
            for (i = 0; i < 32; i++) {
                out[ blk ][i] = PQCLEAN_MCELIECE348864_AVX_load8(src + i * 8);
            }
        }
    }
}
/* input: r, sequence of bits to be permuted */ | |||
/* cond, control bits as array of 128-bit vectors */ | |||
/* rev, 0 for normal application; !0 for inverse */ | |||
/* output: r, permuted bits */ | |||
void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev) { | |||
int block, inc; | |||
uint64_t *bs = r; | |||
// | |||
if (rev == 0) { | |||
block = 0; | |||
inc = 1; | |||
} else { | |||
block = 22; | |||
inc = -1; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); | |||
layer_0(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_5(bs, cond[ block ]); | |||
block += inc; | |||
PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); | |||
layer_0(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_5(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_0(bs, cond[ block ]); | |||
block += inc; | |||
PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); | |||
layer_5(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_0(bs, cond[ block ]); | |||
//block += inc; | |||
PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); | |||
} | |||
@@ -0,0 +1,15 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_BENES_H | |||
#define PQCLEAN_MCELIECE348864_AVX_BENES_H | |||
/* | |||
This file is for Benes network related functions | |||
*/ | |||
#include "gf.h" | |||
#include "vec128.h" | |||
/* Unpacks the byte-encoded control bits into rows of 32 64-bit words. */
void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t out[][32], const unsigned char *bits);

/* Applies the Benes network to r in place; rev != 0 walks cond in reverse. */
void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev);
#endif | |||
@@ -0,0 +1,219 @@ | |||
/* | |||
This file is for the inversion-free Berlekamp-Massey algorithm | |||
see https://ieeexplore.ieee.org/document/87857 | |||
*/ | |||
#include "bm.h" | |||
#include "gf.h" | |||
#include "util.h" | |||
#include "vec.h" | |||
#include "vec128.h" | |||
#include <stdint.h> | |||
/* Helpers defined outside this file; presumably the update_asm.S and */
/* vec_reduce_asm.S objects listed in the Makefile - confirm. */
/* update_asm is called below with (vector, field element, 8 or 16); */
/* vec_reduce_asm turns a GFBITS-word vector into a single gf value. */
extern void PQCLEAN_MCELIECE348864_AVX_update_asm(void *, gf, int); | |||
extern gf PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(uint64_t *); | |||
/* Returns all ones if a != 0 and zero if a == 0, without branching. */
static inline uint64_t mask_nonzero(gf a) {
    /* a - 1 wraps to all ones (top bit set) only when a is zero */
    const uint64_t is_zero = ((uint64_t) a - 1) >> 63;

    return is_zero - 1;
}
/* Returns all ones if a <= b and zero otherwise, without branching. */
static inline uint64_t mask_leq(uint16_t a, uint16_t b) {
    /* the 64-bit difference b - a has its top bit set only when a > b */
    const uint64_t borrow = ((uint64_t) b - (uint64_t) a) >> 63;

    return borrow - 1;
}
static inline void vec_cmov(uint64_t out[][2], uint64_t mask) { | |||
int i; | |||
for (i = 0; i < GFBITS; i++) { | |||
out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); | |||
} | |||
} | |||
/*
 * Exchanges the mask[1] part of in[idx0] with the mask[0] part of in[idx1],
 * shifting each by s = 2^b so it lands in the other half:
 *   in[idx0] <- (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s)
 *   in[idx1] <- ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1])
 * Both right-hand sides use the values from before the call.
 */
static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) {
    const int shift = 1 << b;

    const vec128 first = in[idx0];
    const vec128 second = in[idx1];

    const vec128 first_lo = PQCLEAN_MCELIECE348864_AVX_vec128_and(first, mask[0]);
    const vec128 second_lo = PQCLEAN_MCELIECE348864_AVX_vec128_and(second, mask[0]);
    const vec128 first_hi = PQCLEAN_MCELIECE348864_AVX_vec128_and(first, mask[1]);
    const vec128 second_hi = PQCLEAN_MCELIECE348864_AVX_vec128_and(second, mask[1]);

    in[idx0] = PQCLEAN_MCELIECE348864_AVX_vec128_or(first_lo,
               PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(second_lo, shift));
    in[idx1] = PQCLEAN_MCELIECE348864_AVX_vec128_or(
                   PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(first_hi, shift),
                   second_hi);
}
/* input: in, field elements in bitsliced form */ | |||
/* output: out, field elements in non-bitsliced form */ | |||
static inline void get_coefs(gf *out, vec128 *in) { | |||
int i, k; | |||
vec128 mask[4][2]; | |||
vec128 buf[16]; | |||
for (i = 0; i < GFBITS; i++) { | |||
buf[i] = in[i]; | |||
} | |||
for (i = GFBITS; i < 16; i++) { | |||
buf[i] = PQCLEAN_MCELIECE348864_AVX_vec128_setzero(); | |||
} | |||
mask[0][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x5555); | |||
mask[0][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xAAAA); | |||
mask[1][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x3333); | |||
mask[1][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xCCCC); | |||
mask[2][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x0F0F); | |||
mask[2][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xF0F0); | |||
mask[3][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x00FF); | |||
mask[3][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xFF00); | |||
interleave(buf, 0, 8, mask[3], 3); | |||
interleave(buf, 1, 9, mask[3], 3); | |||
interleave(buf, 2, 10, mask[3], 3); | |||
interleave(buf, 3, 11, mask[3], 3); | |||
interleave(buf, 4, 12, mask[3], 3); | |||
interleave(buf, 5, 13, mask[3], 3); | |||
interleave(buf, 6, 14, mask[3], 3); | |||
interleave(buf, 7, 15, mask[3], 3); | |||
interleave(buf, 0, 4, mask[2], 2); | |||
interleave(buf, 1, 5, mask[2], 2); | |||
interleave(buf, 2, 6, mask[2], 2); | |||
interleave(buf, 3, 7, mask[2], 2); | |||
interleave(buf, 8, 12, mask[2], 2); | |||
interleave(buf, 9, 13, mask[2], 2); | |||
interleave(buf, 10, 14, mask[2], 2); | |||
interleave(buf, 11, 15, mask[2], 2); | |||
interleave(buf, 0, 2, mask[1], 1); | |||
interleave(buf, 1, 3, mask[1], 1); | |||
interleave(buf, 4, 6, mask[1], 1); | |||
interleave(buf, 5, 7, mask[1], 1); | |||
interleave(buf, 8, 10, mask[1], 1); | |||
interleave(buf, 9, 11, mask[1], 1); | |||
interleave(buf, 12, 14, mask[1], 1); | |||
interleave(buf, 13, 15, mask[1], 1); | |||
interleave(buf, 0, 1, mask[0], 0); | |||
interleave(buf, 2, 3, mask[0], 0); | |||
interleave(buf, 4, 5, mask[0], 0); | |||
interleave(buf, 6, 7, mask[0], 0); | |||
interleave(buf, 8, 9, mask[0], 0); | |||
interleave(buf, 10, 11, mask[0], 0); | |||
interleave(buf, 12, 13, mask[0], 0); | |||
interleave(buf, 14, 15, mask[0], 0); | |||
for (i = 0; i < 16; i++) { | |||
for (k = 0; k < 4; k++) { | |||
out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; | |||
out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; | |||
} | |||
} | |||
} | |||
/* Berlekamp-Massey iteration over the 2*SYS_T field elements unpacked from in. */
/* NOTE(review): the two header lines below are identical to get_coefs' header; */
/* out is filled with 0/all-ones words and then passed through vec_mul_sp, so it */
/* looks bitsliced rather than non-bitsliced - confirm and fix the wording. */
/* input: in, field elements in bitsliced form */ | |||
/* output: out, field elements in non-bitsliced form */ | |||
void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { | |||
uint16_t i; | |||
uint16_t N, L; | |||
uint64_t prod[ GFBITS ]; | |||
uint64_t in_tmp[ GFBITS ]; | |||
uint64_t db[ GFBITS ][ 2 ]; | |||
uint64_t BC_tmp[ GFBITS ][ 2 ]; | |||
uint64_t BC[ GFBITS ][ 2 ]; | |||
uint64_t mask, t; | |||
gf d, b, c0 = 1; | |||
gf coefs[SYS_T * 2]; | |||
// init | |||
/* BC starts as all zero except the top bit of BC[0][0] */
BC[0][1] = 0; | |||
BC[0][0] = 1; | |||
BC[0][0] <<= 63; | |||
for (i = 1; i < GFBITS; i++) { | |||
BC[i][0] = BC[i][1] = 0; | |||
} | |||
b = 1; | |||
L = 0; | |||
// | |||
/* unpack the bitsliced input into individual field elements */
get_coefs(coefs, in); | |||
for (i = 0; i < GFBITS; i++) { | |||
in_tmp[i] = 0; | |||
} | |||
for (N = 0; N < SYS_T * 2; N++) { | |||
// computing d | |||
/* prod is computed from in_tmp before coefs[N] is folded into in_tmp */
PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]); | |||
PQCLEAN_MCELIECE348864_AVX_update_asm(in_tmp, coefs[N], 8); | |||
d = PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(prod); | |||
/* t carries two values: its low 32 bits are XORed into d here, */
/* its high 32 bits become the next c0 at the end of the iteration */
t = PQCLEAN_MCELIECE348864_AVX_gf_mul2(c0, coefs[N], b); | |||
d ^= t & 0xFFFFFFFF; | |||
// 3 cases | |||
/* mask is all ones when d != 0 and 2*L <= N, zero otherwise */
mask = mask_nonzero(d) & mask_leq(L * 2, N); | |||
/* broadcast bit i of d (column 0) and of b (column 1) to full 64-bit words */
for (i = 0; i < GFBITS; i++) { | |||
db[i][0] = (d >> i) & 1; | |||
db[i][0] = -db[i][0]; | |||
db[i][1] = (b >> i) & 1; | |||
db[i][1] = -db[i][1]; | |||
} | |||
/* BC_tmp is computed from the current BC before BC is modified below */
PQCLEAN_MCELIECE348864_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); | |||
/* where mask is set, column 0 of BC takes the value of column 1 */
vec_cmov(BC, mask); | |||
PQCLEAN_MCELIECE348864_AVX_update_asm(BC, mask & c0, 16); | |||
/* column 1 of BC becomes the XOR of the two product columns */
for (i = 0; i < GFBITS; i++) { | |||
BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; | |||
} | |||
c0 = t >> 32; | |||
/* branch-free selects: b and L change only when mask is set */
b = (d & mask) | (b & ~mask); | |||
L = ((N + 1 - L) & mask) | (L & ~mask); | |||
} | |||
c0 = PQCLEAN_MCELIECE348864_AVX_gf_inv(c0); | |||
/* out[i] is all ones when bit i of c0 is set, zero otherwise; */
/* it is then multiplied by BC, starting at &BC[0][0], in place */
for (i = 0; i < GFBITS; i++) { | |||
out[i] = (c0 >> i) & 1; | |||
out[i] = -out[i]; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(out, out, &BC[0][0]); | |||
} | |||
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_BM_H | |||
#define PQCLEAN_MCELIECE348864_AVX_BM_H | |||
/* | |||
This file is for the inversion-free Berlekamp-Massey algorithm | |||
see https://ieeexplore.ieee.org/document/87857 | |||
*/ | |||
#include "vec128.h" | |||
/* Berlekamp-Massey: reads the GFBITS vectors of in, writes GFBITS words to out. */
void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t *out, vec128 *in);
#endif | |||
@@ -0,0 +1,33 @@ | |||
/*
 * Exported bit-mask table. For k = 0..5, MASKk_0 has the low 2^k bits of
 * every 2^(k+1)-bit group set and MASKk_1 the high 2^k bits. Each symbol is
 * four identical quadwords (32 bytes), and the table starts on a 32-byte
 * boundary (.p2align 5).
 */
.data | |||
# not supported on MacOS | |||
#.section .rodata | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK0_0 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK0_1 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK1_0 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK1_1 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK2_0 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK2_1 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK3_0 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK3_1 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK4_0 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK4_1 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK5_0 | |||
.globl PQCLEAN_MCELIECE348864_AVX_MASK5_1 | |||
.p2align 5 | |||
/* k = 0: alternating single bits */
PQCLEAN_MCELIECE348864_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 | |||
PQCLEAN_MCELIECE348864_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA | |||
/* k = 1: alternating bit pairs */
PQCLEAN_MCELIECE348864_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 | |||
PQCLEAN_MCELIECE348864_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC | |||
/* k = 2: alternating nibbles */
PQCLEAN_MCELIECE348864_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | |||
PQCLEAN_MCELIECE348864_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 | |||
/* k = 3: alternating bytes */
PQCLEAN_MCELIECE348864_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF | |||
PQCLEAN_MCELIECE348864_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 | |||
/* k = 4: alternating 16-bit words */
PQCLEAN_MCELIECE348864_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF | |||
PQCLEAN_MCELIECE348864_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 | |||
/* k = 5: alternating 32-bit words */
PQCLEAN_MCELIECE348864_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF | |||
PQCLEAN_MCELIECE348864_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 | |||
@@ -0,0 +1,238 @@ | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF, 0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55, 0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00, 0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3, 0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F, 0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333, 0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0, 0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, |
@@ -0,0 +1,274 @@ | |||
/* | |||
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation | |||
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf | |||
*/ | |||
#include "controlbits.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
typedef uint8_t bit; | |||
#define N (1 << GFBITS) | |||
/*
 * Branch-free comparison of two 32-bit words.
 * Returns bit 31 of (a - b) computed modulo 2^32. That bit equals (a < b)
 * whenever both operands are below 2^31.
 */
static bit is_smaller(uint32_t a, uint32_t b) {
    const uint32_t diff = a - b;

    return (bit)(diff >> 31);
}
/*
 * Branch-free comparison of two 64-bit words.
 * Returns bit 63 of (a - b) computed modulo 2^64. That bit equals (a < b)
 * whenever both operands are below 2^63.
 */
static bit is_smaller_63b(uint64_t a, uint64_t b) {
    const uint64_t diff = a - b;

    return (bit)(diff >> 63);
}
/*
 * Conditionally exchange two 32-bit words without branching.
 * swap == 1 exchanges *x and *y; swap == 0 leaves both unchanged.
 */
static void cswap(uint32_t *x, uint32_t *y, bit swap) {
    /* 0 -> 0x00000000, 1 -> 0xFFFFFFFF */
    const uint32_t mask = (uint32_t)0 - (uint32_t)swap;
    const uint32_t delta = (*x ^ *y) & mask;

    *x ^= delta;
    *y ^= delta;
}
/*
 * Conditionally exchange two 64-bit words without branching.
 * swap == 1 exchanges *x and *y; swap == 0 leaves both unchanged.
 */
static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) {
    /* 0 -> all zero bits, 1 -> all one bits */
    const uint64_t mask = (uint64_t)0 - (uint64_t)swap;
    const uint64_t delta = (*x ^ *y) & mask;

    *x ^= delta;
    *y ^= delta;
}
/*
 * Order a pair of 32-bit words in place:
 *   *x on return = min(input *x, input *y)
 *   *y on return = max(input *x, input *y)
 * The pair is exchanged exactly when is_smaller(*y, *x) reports 1.
 */
static void minmax(uint32_t *x, uint32_t *y) {
    cswap(x, y, is_smaller(*y, *x));
}
/*
 * Order a pair of 64-bit words in place:
 *   *x on return = min(input *x, input *y)
 *   *y on return = max(input *x, input *y)
 * The pair is exchanged exactly when is_smaller_63b(*y, *x) reports 1.
 */
static void minmax_63b(uint64_t *x, uint64_t *y) {
    cswap_63b(x, y, is_smaller_63b(*y, *x));
}
/*
 * Merge the first half of x[0], x[step], ..., x[(2*n-1)*step] with the
 * second half. Requires n to be a power of 2.
 */
static void merge(int n, uint32_t *x, int step) {
    int k;

    if (n == 1) {
        minmax(&x[0], &x[step]);
        return;
    }

    /* Recurse on the elements at even strides, then on those at odd strides. */
    merge(n / 2, x, step * 2);
    merge(n / 2, x + step, step * 2);

    /* Compare-exchange the neighbours (1,2), (3,4), ..., (2n-3, 2n-2). */
    for (k = 1; k < n; k++) {
        minmax(&x[(2 * k - 1) * step], &x[(2 * k) * step]);
    }
}
/*
 * 64-bit counterpart of merge(): merges the first half of
 * x[0], x[step], ..., x[(2*n-1)*step] with the second half.
 * Requires n to be a power of 2.
 */
static void merge_63b(int n, uint64_t *x, int step) {
    int k;

    if (n == 1) {
        minmax_63b(&x[0], &x[step]);
        return;
    }

    /* Recurse on the elements at even strides, then on those at odd strides. */
    merge_63b(n / 2, x, step * 2);
    merge_63b(n / 2, x + step, step * 2);

    /* Compare-exchange the neighbours (1,2), (3,4), ..., (2n-3, 2n-2). */
    for (k = 1; k < n; k++) {
        minmax_63b(&x[(2 * k - 1) * step], &x[(2 * k) * step]);
    }
}
/*
 * Sort x[0], x[1], ..., x[n-1] in place.
 * Requires n to be a power of 2. Each half is sorted recursively and the
 * two halves are then combined with merge().
 */
static void sort(int n, uint32_t *x) {
    const int half = n / 2;

    if (n > 1) {
        sort(half, x);
        sort(half, x + half);
        merge(half, x, 1);
    }
}
/*
 * Sort the 64-bit words x[0], x[1], ..., x[n-1] in place.
 * Same recursive scheme as the 32-bit sort: sort both halves, then combine
 * them with merge_63b(). n is expected to be a power of 2.
 */
void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x) {
    const int half = n / 2;

    if (n > 1) {
        PQCLEAN_MCELIECE348864_AVX_sort_63b(half, x);
        PQCLEAN_MCELIECE348864_AVX_sort_63b(half, x + half);
        merge_63b(half, x, 1);
    }
}
/* y[pi[i]] = x[i] */ | |||
/* requires n = 2^w */ | |||
/* requires pi to be a permutation */ | |||
static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC | |||
int i; | |||
uint32_t t[2 * N]; | |||
for (i = 0; i < n; ++i) { | |||
t[i] = x[i] | (pi[i] << 16); | |||
} | |||
sort(n, t); | |||
for (i = 0; i < n; ++i) { | |||
y[i] = t[i] & 0xFFFF; | |||
} | |||
} | |||
/*
 * Compute the inverse permutation: ip[pi[i]] = i for 0 <= i < n.
 * Requires n = 2^w and pi to be a permutation.
 */
static void invert(int n, uint32_t *ip, const uint32_t *pi) {
    int k = 0;

    /* Start from the identity, then scatter it through pi. */
    while (k < n) {
        ip[k] = k;
        k++;
    }

    composeinv(n, ip, ip, pi);
}
static void flow(int w, uint32_t *x, const uint32_t *y, int t) { | |||
bit m0; | |||
bit m1; | |||
uint32_t b; | |||
uint32_t y_copy = *y; | |||
m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); | |||
m1 = is_smaller(0, t); | |||
cswap(x, &y_copy, m0); | |||
b = m0 & m1; | |||
*x ^= b << w; | |||
} | |||
/* input: permutation pi */ | |||
/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ | |||
/* requires n = 2^w */ | |||
static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { | |||
int i; | |||
int j; | |||
int k; | |||
int t; | |||
uint32_t ip[N] = {0}; | |||
uint32_t I[2 * N] = {0}; | |||
uint32_t P[2 * N] = {0}; | |||
uint32_t PI[2 * N] = {0}; | |||
uint32_t T[2 * N] = {0}; | |||
uint32_t piflip[N] = {0}; | |||
uint32_t subpi[2][N / 2] = {{0}}; | |||
if (w == 1) { | |||
c[ off / 8 ] |= (pi[0] & 1) << (off % 8); | |||
} | |||
if (w <= 1) { | |||
return; | |||
} | |||
invert(n, ip, pi); | |||
for (i = 0; i < n; ++i) { | |||
I[i] = ip[i] | (1 << w); | |||
I[n + i] = pi[i]; | |||
} | |||
for (i = 0; i < 2 * n; ++i) { | |||
P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); | |||
} | |||
for (t = 0; t < w; ++t) { | |||
composeinv(2 * n, PI, P, I); | |||
for (i = 0; i < 2 * n; ++i) { | |||
flow(w, &P[i], &PI[i], t); | |||
} | |||
for (i = 0; i < 2 * n; ++i) { | |||
T[i] = I[i ^ 1]; | |||
} | |||
composeinv(2 * n, I, I, T); | |||
for (i = 0; i < 2 * n; ++i) { | |||
T[i] = P[i ^ 1]; | |||
} | |||
for (i = 0; i < 2 * n; ++i) { | |||
flow(w, &P[i], &T[i], 1); | |||
} | |||
} | |||
for (i = 0; i < n; ++i) { | |||
for (j = 0; j < w; ++j) { | |||
piflip[i] = pi[i]; | |||
} | |||
} | |||
for (i = 0; i < n / 2; ++i) { | |||
c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); | |||
} | |||
for (i = 0; i < n / 2; ++i) { | |||
c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); | |||
} | |||
for (i = 0; i < n / 2; ++i) { | |||
cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); | |||
} | |||
for (k = 0; k < 2; ++k) { | |||
for (i = 0; i < n / 2; ++i) { | |||
subpi[k][i] = piflip[i * 2 + k] >> 1; | |||
} | |||
} | |||
for (k = 0; k < 2; ++k) { | |||
controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); | |||
} | |||
} | |||
/* input: pi, a permutation*/ | |||
/* output: out, control bits w.r.t. pi */ | |||
void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi) { | |||
unsigned int i; | |||
unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; | |||
for (i = 0; i < sizeof(c); i++) { | |||
c[i] = 0; | |||
} | |||
controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); | |||
for (i = 0; i < sizeof(c); i++) { | |||
out[i] = c[i]; | |||
} | |||
} | |||
@@ -0,0 +1,15 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H | |||
#define PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H | |||
/* | |||
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation | |||
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf | |||
*/ | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x); | |||
void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi); | |||
#endif | |||
@@ -0,0 +1,7 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H | |||
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H | |||
#include "fips202.h" | |||
/* Hash wrapper: forwards to shake256 with the constant 32 as its second argument */
/* (presumably the output length in bytes -- confirm against fips202.h). */
#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) | |||
#endif |
@@ -0,0 +1,234 @@ | |||
/* | |||
This file is for Niederreiter decryption | |||
*/ | |||
#include "decrypt.h" | |||
#include "benes.h" | |||
#include "bm.h" | |||
#include "fft.h" | |||
#include "fft_tr.h" | |||
#include "params.h" | |||
#include "util.h" | |||
#include <stdio.h> | |||
/* input: sk, secret key (the polynomial is read from it with irr_load) */
/*        recv, 16 vec256 words used as per-block masks */
/* output: inv, inverses of the squared FFT evaluations, one per block */
/*         out, inv masked by recv: out[i][j] = inv[i][j] & recv[i] */
static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) {
    int blk, bit;

    uint64_t sk_int[ GFBITS ];
    vec256 eval[16][ GFBITS ];
    vec256 tmp[ GFBITS ];

    // evaluate the polynomial and square every evaluation block

    PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk);

    PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int);

    for (blk = 0; blk < 16; blk++) {
        PQCLEAN_MCELIECE348864_AVX_vec256_sq(eval[blk], eval[blk]);
    }

    // batched inversion: running products first ...

    PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], eval[0]);

    for (blk = 1; blk < 16; blk++) {
        PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[blk], inv[blk - 1], eval[blk]);
    }

    // ... then a single inversion of the full product, unwound from the top

    PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, inv[15]);

    for (blk = 15; blk > 0; blk--) {
        PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[blk], tmp, inv[blk - 1]);
        PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[blk]);
    }

    PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], tmp);

    // mask the inverses with the received word

    for (blk = 0; blk < 16; blk++) {
        for (bit = 0; bit < GFBITS; bit++) {
            out[blk][bit] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[blk][bit], recv[blk]);
        }
    }
}
/* input: s, SYND_BYTES bytes */
/* output: recv, 32 vec128 words holding s followed by zero padding up to 512 bytes */
static void preprocess(vec128 *recv, const unsigned char *s) {
    unsigned char padded[ 512 ];
    int k;

    /* copy the input and zero-fill the rest of the 512-byte buffer */
    for (k = 0; k < 512; k++) {
        padded[k] = (k < SYND_BYTES) ? s[k] : 0;
    }

    /* load the buffer 16 bytes at a time */
    for (k = 0; k < 32; k++) {
        recv[k] = PQCLEAN_MCELIECE348864_AVX_load16(padded + 16 * k);
    }
}
/* input: err, 32 vec128 words */
/* output: e, the first SYS_N / 8 bytes of err serialised with store8 */
static void postprocess(unsigned char *e, vec128 *err) {
    unsigned char bytes[ (1 << GFBITS) / 8 ];
    unsigned char *dst = bytes;
    uint64_t lo, hi;
    int k;

    /* each vec128 contributes two 64-bit halves, i.e. 16 bytes */
    for (k = 0; k < 32; k++) {
        lo = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[k], 0);
        hi = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[k], 1);

        PQCLEAN_MCELIECE348864_AVX_store8(dst, lo);
        PQCLEAN_MCELIECE348864_AVX_store8(dst + 8, hi);
        dst += 16;
    }

    /* only the first SYS_N / 8 bytes are handed back */
    for (k = 0; k < SYS_N / 8; k++) {
        e[k] = bytes[k];
    }
}
/* input: inv, 16 blocks of GFBITS vec256 words; recv, 16 vec256 masks */
/* output: out, with out[i][j] = inv[i][j] & recv[i] */
static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) {
    int blk, bit;

    for (blk = 0; blk < 16; blk++) {
        const vec256 keep = recv[blk];

        for (bit = 0; bit < GFBITS; bit++) {
            out[blk][bit] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[blk][bit], keep);
        }
    }
}
static uint16_t weight_check(unsigned char *e, vec128 *error) { | |||
int i; | |||
uint16_t w0 = 0; | |||
uint16_t w1 = 0; | |||
uint16_t check; | |||
for (i = 0; i < 32; i++) { | |||
w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 0) ); | |||
w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 1) ); | |||
} | |||
for (i = 0; i < SYS_N / 8; i++) { | |||
w1 += _mm_popcnt_u64( e[i] ); | |||
} | |||
check = (w0 ^ SYS_T) | (w1 ^ SYS_T); | |||
check -= 1; | |||
check >>= 15; | |||
return check; | |||
} | |||
/* input: s0, s1, two arrays of GFBITS vec128 words */
/* return: vec128_testz applied to the OR of all word-wise XOR differences */
/*         (presumably non-zero exactly when s0 and s1 agree -- confirm in vec128.h) */
static uint16_t synd_cmp(vec128 *s0, vec128 *s1) {
    vec128 acc = PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[0], s1[0]);
    vec128 delta;
    int k;

    for (k = 1; k < GFBITS; k++) {
        delta = PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[k], s1[k]);
        acc = PQCLEAN_MCELIECE348864_AVX_vec128_or(acc, delta);
    }

    return (uint16_t)PQCLEAN_MCELIECE348864_AVX_vec128_testz(acc);
}
/* input: in, 32 vec128 words */
/* output: out, 16 vec256 words; out[i] is built from the four 64-bit halves */
/*         of in[2i] and in[2i + 1], in that order */
static void reformat_128to256(vec256 *out, vec128 *in) {
    uint64_t a0, a1, b0, b1;
    int k;

    for (k = 0; k < 16; k++) {
        const vec128 first = in[2 * k];
        const vec128 second = in[2 * k + 1];

        a0 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(first, 0);
        a1 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(first, 1);
        b0 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(second, 0);
        b1 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(second, 1);

        out[k] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(a0, a1, b0, b1);
    }
}
/* input: in, 16 vec256 words */
/* output: out, 32 vec128 words; lanes 0-1 of in[i] go to out[2i], */
/*         lanes 2-3 go to out[2i + 1] */
static void reformat_256to128(vec128 *out, vec256 *in) {
    uint64_t q0, q1, q2, q3;
    int k;

    for (k = 0; k < 16; k++) {
        const vec256 src = in[k];

        q0 = PQCLEAN_MCELIECE348864_AVX_vec256_extract(src, 0);
        q1 = PQCLEAN_MCELIECE348864_AVX_vec256_extract(src, 1);
        q2 = PQCLEAN_MCELIECE348864_AVX_vec256_extract(src, 2);
        q3 = PQCLEAN_MCELIECE348864_AVX_vec256_extract(src, 3);

        out[2 * k] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(q0, q1);
        out[2 * k + 1] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(q2, q3);
    }
}
/* Niederreiter decryption with the Berlekamp decoder */ | |||
/* intput: sk, secret key */ | |||
/* c, ciphertext (syndrome) */ | |||
/* output: e, error vector */ | |||
/* return: 0 for success; 1 for failure */ | |||
int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { | |||
int i; | |||
uint16_t check_synd; | |||
uint16_t check_weight; | |||
vec256 inv[ 16 ][ GFBITS ]; | |||
vec256 scaled[ 16 ][ GFBITS ]; | |||
vec256 eval[16][ GFBITS ]; | |||
vec128 error128[ 32 ]; | |||
vec256 error256[ 16 ]; | |||
vec128 s_priv[ GFBITS ]; | |||
vec128 s_priv_cmp[ GFBITS ]; | |||
uint64_t locator[ GFBITS ]; | |||
vec128 recv128[ 32 ]; | |||
vec256 recv256[ 16 ]; | |||
vec256 allone; | |||
uint64_t bits_int[23][32]; | |||
// Berlekamp decoder | |||
preprocess(recv128, c); | |||
PQCLEAN_MCELIECE348864_AVX_load_bits(bits_int, sk + IRR_BYTES); | |||
PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) recv128, bits_int, 1); | |||
reformat_128to256(recv256, recv128); | |||
scaling(scaled, inv, sk, recv256); | |||
PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv, scaled); | |||
PQCLEAN_MCELIECE348864_AVX_bm(locator, s_priv); | |||
PQCLEAN_MCELIECE348864_AVX_fft(eval, locator); | |||
// reencryption and weight check | |||
allone = PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(0xFFFF); | |||
for (i = 0; i < 16; i++) { | |||
error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(eval[i]); | |||
error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(error256[i], allone); | |||
} | |||
scaling_inv(scaled, inv, error256); | |||
PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv_cmp, scaled); | |||
check_synd = synd_cmp(s_priv, s_priv_cmp); | |||
// | |||
reformat_256to128(error128, error256); | |||
PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) error128, bits_int, 0); | |||
postprocess(e, error128); | |||
check_weight = weight_check(e, error128); | |||
return 1 - (check_synd & check_weight); | |||
} | |||
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_DECRYPT_H | |||
#define PQCLEAN_MCELIECE348864_AVX_DECRYPT_H | |||
/* | |||
This file is for Niederreiter decryption | |||
*/ | |||
int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); | |||
#endif | |||
@@ -0,0 +1,99 @@ | |||
/* | |||
This file is for Niederreiter encryption | |||
*/ | |||
#include "encrypt.h" | |||
#include "gf.h" | |||
#include "int32_sort.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "util.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
/* input: public key pk, error vector e */ | |||
/* output: syndrome s */ | |||
extern void PQCLEAN_MCELIECE348864_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); | |||
/* output: e, an error vector of weight t */ | |||
static void gen_e(unsigned char *e) { | |||
int i, j, eq, count; | |||
uint16_t ind[ SYS_T * 2 ]; | |||
int32_t ind32[ SYS_T * 2 ]; | |||
uint64_t e_int[ (SYS_N + 63) / 64 ]; | |||
uint64_t one = 1; | |||
uint64_t mask; | |||
uint64_t val[ SYS_T ]; | |||
while (1) { | |||
randombytes((uint8_t *) ind, sizeof(ind)); | |||
for (i = 0; i < SYS_T * 2; i++) { | |||
ind[i] &= GFMASK; | |||
} | |||
// | |||
count = 0; | |||
for (i = 0; i < SYS_T * 2; i++) { | |||
if (ind[i] < SYS_N) { | |||
ind32[ count++ ] = ind[i]; | |||
} | |||
} | |||
if (count < SYS_T) { | |||
continue; | |||
} | |||
// check for repetition | |||
PQCLEAN_MCELIECE348864_AVX_int32_sort(ind32, SYS_T); | |||
eq = 0; | |||
for (i = 1; i < SYS_T; i++) { | |||
if (ind32[i - 1] == ind32[i]) { | |||
eq = 1; | |||
} | |||
} | |||
if (eq == 0) { | |||
break; | |||
} | |||
} | |||
for (j = 0; j < SYS_T; j++) { | |||
val[j] = one << (ind32[j] & 63); | |||
} | |||
for (i = 0; i < (SYS_N + 63) / 64; i++) { | |||
e_int[i] = 0; | |||
for (j = 0; j < SYS_T; j++) { | |||
mask = i ^ (ind32[j] >> 6); | |||
mask -= 1; | |||
mask >>= 63; | |||
mask = -mask; | |||
e_int[i] |= val[j] & mask; | |||
} | |||
} | |||
for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { | |||
PQCLEAN_MCELIECE348864_AVX_store8(e, e_int[i]); | |||
e += 8; | |||
} | |||
for (j = 0; j < (SYS_N % 64); j += 8) { | |||
e[ j / 8 ] = (e_int[i] >> j) & 0xFF; | |||
} | |||
} | |||
/* input: pk, public key */
/* output: e, a freshly generated error vector */
/*         s, the syndrome computed from pk and e */
void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) {
    /* draw the error vector first; the syndrome routine reads it */
    gen_e(e);

    PQCLEAN_MCELIECE348864_AVX_syndrome_asm(s, pk, e);
}
@@ -0,0 +1,11 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H | |||
#define PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H | |||
/* | |||
This file is for Niederreiter encryption | |||
*/ | |||
void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); | |||
#endif | |||
@@ -0,0 +1,172 @@ | |||
/* | |||
This file is for the Gao-Mateer FFT | |||
see http://www.math.clemson.edu/~sgao/papers/GM10.pdf | |||
*/ | |||
#include "fft.h" | |||
#include "vec.h" | |||
/* input: in, polynomial in bitsliced form */ | |||
/* output: in, result of applying the radix conversions on in */ | |||
static void radix_conversions(uint64_t *in) { | |||
int i, j, k; | |||
const uint64_t mask[5][2] = { | |||
{0x8888888888888888, 0x4444444444444444}, | |||
{0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, | |||
{0xF000F000F000F000, 0x0F000F000F000F00}, | |||
{0xFF000000FF000000, 0x00FF000000FF0000}, | |||
{0xFFFF000000000000, 0x0000FFFF00000000} | |||
}; | |||
const uint64_t s[5][GFBITS] = { | |||
#include "scalars.inc" | |||
}; | |||
// | |||
for (j = 0; j <= 4; j++) { | |||
for (i = 0; i < GFBITS; i++) { | |||
for (k = 4; k >= j; k--) { | |||
in[i] ^= (in[i] & mask[k][0]) >> (1 << k); | |||
in[i] ^= (in[i] & mask[k][1]) >> (1 << k); | |||
} | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec_mul(in, in, s[j]); // scaling | |||
} | |||
} | |||
/* input: in, result of applying the radix conversions to the input polynomial */ | |||
/* output: out, evaluation results (by applying the FFT butterflies) */ | |||
static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) { | |||
int i, j, k, s, b; | |||
uint64_t t0, t1, t2, t3; | |||
const vec256 consts[ 17 ][ GFBITS ] = { | |||
#include "consts.inc" | |||
}; | |||
uint64_t consts_ptr = 0; | |||
const unsigned char reversal[64] = { | |||
0, 32, 16, 48, 8, 40, 24, 56, | |||
4, 36, 20, 52, 12, 44, 28, 60, | |||
2, 34, 18, 50, 10, 42, 26, 58, | |||
6, 38, 22, 54, 14, 46, 30, 62, | |||
1, 33, 17, 49, 9, 41, 25, 57, | |||
5, 37, 21, 53, 13, 45, 29, 61, | |||
3, 35, 19, 51, 11, 43, 27, 59, | |||
7, 39, 23, 55, 15, 47, 31, 63 | |||
}; | |||
// boradcast | |||
vec256 tmp256[ GFBITS ]; | |||
vec256 x[ GFBITS ], y[ GFBITS ]; | |||
for (j = 0; j < 64; j += 8) { | |||
for (i = 0; i < GFBITS; i++) { | |||
t0 = (in[i] >> reversal[j + 0]) & 1; | |||
t0 = -t0; | |||
t1 = (in[i] >> reversal[j + 2]) & 1; | |||
t1 = -t1; | |||
t2 = (in[i] >> reversal[j + 4]) & 1; | |||
t2 = -t2; | |||
t3 = (in[i] >> reversal[j + 6]) & 1; | |||
t3 = -t3; | |||
out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3); | |||
t0 = (in[i] >> reversal[j + 1]) & 1; | |||
t0 = -t0; | |||
t1 = (in[i] >> reversal[j + 3]) & 1; | |||
t1 = -t1; | |||
t2 = (in[i] >> reversal[j + 5]) & 1; | |||
t2 = -t2; | |||
t3 = (in[i] >> reversal[j + 7]) & 1; | |||
t3 = -t3; | |||
out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3); | |||
} | |||
} | |||
// | |||
for (i = 0; i < 16; i += 2) { | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]); | |||
for (b = 0; b < GFBITS; b++) { | |||
out[i + 0][b] ^= tmp256[b]; | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
out[i + 1][b] ^= out[i + 0][b]; | |||
} | |||
} | |||
for (i = 0; i < 16; i += 2) { | |||
for (b = 0; b < GFBITS; b++) { | |||
x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]); | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]); | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, y, consts[ 1 ]); | |||
for (b = 0; b < GFBITS; b++) { | |||
x[b] ^= tmp256[b]; | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
y[b] ^= x[b]; | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
out[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(x[b], y[b]); | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
out[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(x[b], y[b]); | |||
} | |||
} | |||
consts_ptr = 2; | |||
for (i = 0; i <= 3; i++) { | |||
s = 1 << i; | |||
for (j = 0; j < 16; j += 2 * s) { | |||
for (k = j; k < j + s; k++) { | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]); | |||
for (b = 0; b < GFBITS; b++) { | |||
out[k][b] ^= tmp256[b]; | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
out[k + s][b] ^= out[k][b]; | |||
} | |||
} | |||
} | |||
consts_ptr += s; | |||
} | |||
// adding the part contributed by x^64 | |||
vec256 powers[16][GFBITS] = { | |||
#include "powers.inc" | |||
}; | |||
for (i = 0; i < 16; i++) { | |||
for (b = 0; b < GFBITS; b++) { | |||
out[i][b] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(out[i][b], powers[i][b]); | |||
} | |||
} | |||
} | |||
/* input: in, polynomial in bitsliced form; it is overwritten by the radix conversions */
/* output: out, evaluation results produced by the butterflies */
void PQCLEAN_MCELIECE348864_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) {
    /* step 1 rewrites in in place, step 2 only reads it */
    radix_conversions(in);

    butterflies(out, in);
}
@@ -0,0 +1,18 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_H | |||
#define PQCLEAN_MCELIECE348864_AVX_FFT_H | |||
/* | |||
This file is for the Gao-Mateer FFT | |||
see http://www.math.clemson.edu/~sgao/papers/GM10.pdf | |||
*/ | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "vec128.h" | |||
#include "vec256.h" | |||
void PQCLEAN_MCELIECE348864_AVX_fft(vec256 /*out*/[][GFBITS], uint64_t * /*in*/); | |||
#endif | |||
@@ -0,0 +1,355 @@ | |||
/* | |||
This file is for transpose of the Gao-Mateer FFT | |||
Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c | |||
*/ | |||
#include "fft_tr.h" | |||
#include "transpose.h" | |||
#include "vec.h" | |||
#include <stdint.h> | |||
/* input: in, GFBITS vec128 words */
/* output: in, updated in place by the transposed radix conversions */
static void radix_conversions_tr(vec128 in[ GFBITS ]) {
    int stage, bit, lvl;
    uint64_t lo, hi;

    const vec128 mask[10] = {
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x2222222222222222, 0x2222222222222222),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000),
        PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000)
    };

    const vec128 s[5][GFBITS] = {
#include "scalars_2x.inc"
    };

    /* stages run from 5 down to 0 */
    for (stage = 6; stage-- > 0; ) {
        /* the scaling applies to every stage except the first one executed */
        if (stage != 5) {
            PQCLEAN_MCELIECE348864_AVX_vec128_mul(in, in, s[stage]);
        }

        /* masked left shifts by 1 << lvl within each 64-bit half (none when stage is 5) */
        for (bit = 0; bit < GFBITS; bit++) {
            for (lvl = stage; lvl <= 4; lvl++) {
                const int sh = 1 << lvl;

                in[bit] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[bit] & mask[2 * lvl + 0], sh);
                in[bit] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[bit] & mask[2 * lvl + 1], sh);
            }
        }

        /* fold the top 32 bits of the low half into the high half, then */
        /* the low 32 bits of the high half into its own top 32 bits */
        for (bit = 0; bit < GFBITS; bit++) {
            lo = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[bit], 0);
            hi = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[bit], 1);

            hi ^= lo >> 32;
            hi ^= hi << 32;

            in[bit] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(lo, hi);
        }
    }
}
/* Transposed butterfly network (see the file header: roughly the transpose of butterflies in fft.c). */
/* input: in, 16 blocks of GFBITS vec256 words; in is modified in place as scratch */
/* output: out, GFBITS vec128 words; the low half of out[i] comes from out64[0][i], */
/*         the high half from out64[1][i] */
static void butterflies_tr(vec128 out[ GFBITS ], vec256 in[][ GFBITS ]) { | |||
int i, j, k, s, b; | |||
uint64_t tmp[ GFBITS ]; | |||
uint64_t pre[6][ GFBITS ]; | |||
uint64_t out64[2][64]; | |||
vec256 p2[ 6 ]; | |||
vec256 buf[64]; | |||
vec256 x[ GFBITS ], y[ GFBITS ]; | |||
vec256 tmp256[ GFBITS ]; | |||
const vec256 consts[ 17 ][ GFBITS ] = { | |||
#include "consts.inc" | |||
}; | |||
/* starts one past the last row of consts; decremented by s before each stage below */
uint64_t consts_ptr = 17; | |||
const unsigned char reversal[64] = { | |||
0, 32, 16, 48, 8, 40, 24, 56, | |||
4, 36, 20, 52, 12, 44, 28, 60, | |||
2, 34, 18, 50, 10, 42, 26, 58, | |||
6, 38, 22, 54, 14, 46, 30, 62, | |||
1, 33, 17, 49, 9, 41, 25, 57, | |||
5, 37, 21, 53, 13, 45, 29, 61, | |||
3, 35, 19, 51, 11, 43, 27, 59, | |||
7, 39, 23, 55, 15, 47, 31, 63 | |||
}; | |||
const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; | |||
// butterflies | |||
/* block-level stages with strides 8, 4, 2, 1: in[k] ^= in[k+s], then in[k+s] ^= in[k] * consts row */
for (i = 3; i >= 0; i--) { | |||
s = 1 << i; | |||
consts_ptr -= s; | |||
for (j = 0; j < 16; j += 2 * s) { | |||
for (k = j; k < j + s; k++) { | |||
for (b = 0; b < GFBITS; b++) { | |||
in[k][b] ^= in[k + s][b]; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[k], consts[ consts_ptr + (k - j) ]); | |||
for (b = 0; b < GFBITS; b++) { | |||
in[k + s][b] ^= tmp256[b]; | |||
} | |||
} | |||
} | |||
} | |||
/* stage using consts[1]: regroup lanes of neighbouring blocks with unpack_low/high, */
/* apply x ^= y and y ^= x * consts[1], then regroup with the _2x unpack variants */
for (i = 0; i < 16; i += 2) { | |||
for (b = 0; b < GFBITS; b++) { | |||
x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(in[i + 0][b], in[i + 1][b]); | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(in[i + 0][b], in[i + 1][b]); | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
x[b] ^= y[b]; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, x, consts[ 1 ]); | |||
for (b = 0; b < GFBITS; b++) { | |||
y[b] ^= tmp256[b]; | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
in[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(x[b], y[b]); | |||
} | |||
for (b = 0; b < GFBITS; b++) { | |||
in[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(x[b], y[b]); | |||
} | |||
} | |||
/* stage using consts[0] on neighbouring blocks */
for (i = 0; i < 16; i += 2) { | |||
for (b = 0; b < GFBITS; b++) { | |||
in[i + 0][b] ^= in[i + 1][b]; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[i + 0], consts[ 0 ]); | |||
for (b = 0; b < GFBITS; b++) { | |||
in[i + 1][b] ^= tmp256[b]; | |||
} | |||
} | |||
// transpose | |||
/* four bit positions (i .. i+3) are handled per iteration; for each of them the 64 */
/* 64-bit lanes of in are gathered into buf at the positions given by reversal */
for (i = 0; i < GFBITS; i += 4) { | |||
for (j = 0; j < 64; j += 8) { | |||
buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 0)); | |||
buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 0)); | |||
buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 1), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 1), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 1), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 1)); | |||
buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 1), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 1), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 1), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 1)); | |||
buf[ reversal[j + 4] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 2), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 2), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 2), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 2)); | |||
buf[ reversal[j + 5] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 2), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 2), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 2), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 2)); | |||
buf[ reversal[j + 6] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 3)); | |||
buf[ reversal[j + 7] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 3), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 3)); | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(buf); | |||
/* unrolled XOR chain over buf, starting at buf[32] and ending at buf[0]; successive */
/* indices differ in exactly one bit. Each step XORs the previously visited entry into */
/* the next one and then folds the updated entry into one of p2[0..5] (the first use */
/* of each p2[j] is a plain assignment). The statement order is significant. */
p2[0] = buf[32]; | |||
buf[33] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[33], buf[32]); | |||
p2[1] = buf[33]; | |||
buf[35] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[35], buf[33]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[35]); | |||
buf[34] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[34], buf[35]); | |||
p2[2] = buf[34]; | |||
buf[38] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[38], buf[34]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[38]); | |||
buf[39] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[39], buf[38]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[39]); | |||
buf[37] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[37], buf[39]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[37]); | |||
buf[36] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[36], buf[37]); | |||
p2[3] = buf[36]; | |||
buf[44] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[44], buf[36]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[44]); | |||
buf[45] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[45], buf[44]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[45]); | |||
buf[47] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[47], buf[45]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[47]); | |||
buf[46] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[46], buf[47]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[46]); | |||
buf[42] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[42], buf[46]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[42]); | |||
buf[43] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[43], buf[42]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[43]); | |||
buf[41] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[41], buf[43]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[41]); | |||
buf[40] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[40], buf[41]); | |||
p2[4] = buf[40]; | |||
buf[56] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[56], buf[40]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[56]); | |||
buf[57] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[57], buf[56]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[57]); | |||
buf[59] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[59], buf[57]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[59]); | |||
buf[58] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[58], buf[59]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[58]); | |||
buf[62] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[62], buf[58]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[62]); | |||
buf[63] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[63], buf[62]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[63]); | |||
buf[61] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[61], buf[63]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[61]); | |||
buf[60] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[60], buf[61]); | |||
p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[60]); | |||
buf[52] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[52], buf[60]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[52]); | |||
buf[53] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[53], buf[52]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[53]); | |||
buf[55] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[55], buf[53]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[55]); | |||
buf[54] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[54], buf[55]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[54]); | |||
buf[50] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[50], buf[54]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[50]); | |||
buf[51] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[51], buf[50]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[51]); | |||
buf[49] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[49], buf[51]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[49]); | |||
buf[48] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[48], buf[49]); | |||
p2[5] = buf[48]; | |||
buf[16] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[16], buf[48]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[16]); | |||
buf[17] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[17], buf[16]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[17]); | |||
buf[19] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[19], buf[17]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[19]); | |||
buf[18] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[18], buf[19]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[18]); | |||
buf[22] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[22], buf[18]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[22]); | |||
buf[23] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[23], buf[22]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[23]); | |||
buf[21] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[21], buf[23]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[21]); | |||
buf[20] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[20], buf[21]); | |||
p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[20]); | |||
buf[28] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[28], buf[20]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[28]); | |||
buf[29] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[29], buf[28]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[29]); | |||
buf[31] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[31], buf[29]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[31]); | |||
buf[30] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[30], buf[31]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[30]); | |||
buf[26] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[26], buf[30]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[26]); | |||
buf[27] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[27], buf[26]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[27]); | |||
buf[25] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[25], buf[27]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[25]); | |||
buf[24] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[24], buf[25]); | |||
p2[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[4], buf[24]); | |||
buf[8] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[8], buf[24]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[8]); | |||
buf[9] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[9], buf[8]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[9]); | |||
buf[11] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[11], buf[9]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[11]); | |||
buf[10] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[10], buf[11]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[10]); | |||
buf[14] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[14], buf[10]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[14]); | |||
buf[15] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[15], buf[14]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[15]); | |||
buf[13] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[13], buf[15]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[13]); | |||
buf[12] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[12], buf[13]); | |||
p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[12]); | |||
buf[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[4], buf[12]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[4]); | |||
buf[5] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[5], buf[4]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[5]); | |||
buf[7] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[7], buf[5]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[7]); | |||
buf[6] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[6], buf[7]); | |||
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[6]); | |||
buf[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[2], buf[6]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[2]); | |||
buf[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[3], buf[2]); | |||
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[3]); | |||
buf[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[1], buf[3]); | |||
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[1]); | |||
buf[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[0], buf[1]); | |||
/* scatter the four lanes of each accumulator to bit positions i .. i+3 */
for (j = 0; j < 6; j++) { | |||
pre[j][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 0); | |||
pre[j][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 1); | |||
pre[j][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 2); | |||
pre[j][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 3); | |||
} | |||
/* buf[0] ends the chain and becomes the first output half */
out64[0][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 0); | |||
out64[0][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 1); | |||
out64[0][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 2); | |||
out64[0][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 3); | |||
} | |||
// | |||
/* second output half: out64[1] = sum over i of pre[i] * beta[i]; each constant beta[i] */
/* is expanded so that tmp[j] is all-ones when bit j of beta[i] is set, else zero */
for (j = 0; j < GFBITS; j++) { | |||
tmp[j] = (beta[0] >> j) & 1; | |||
tmp[j] = -tmp[j]; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec_mul(out64[1], pre[0], tmp); | |||
for (i = 1; i < 6; i++) { | |||
for (j = 0; j < GFBITS; j++) { | |||
tmp[j] = (beta[i] >> j) & 1; | |||
tmp[j] = -tmp[j]; | |||
} | |||
/* NOTE(review): tmp is both the output and the second input here -- this relies on */
/* vec_mul tolerating aliased arguments; confirm against its implementation */
PQCLEAN_MCELIECE348864_AVX_vec_mul(tmp, pre[i], tmp); | |||
PQCLEAN_MCELIECE348864_AVX_vec_add(out64[1], out64[1], tmp); | |||
} | |||
/* pack both halves; only the first GFBITS entries of each out64 row are used */
for (i = 0; i < GFBITS; i++) { | |||
out[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(out64[0][i], out64[1][i]); | |||
} | |||
} | |||
/*
 * Entry point for the transposed FFT.
 * Runs butterflies_tr(out, in) first and then radix_conversions_tr(out)
 * in place on the same output buffer.
 *
 * out: GFBITS vec128 words that receive the result
 * in:  array of vec256[GFBITS] rows, forwarded unchanged to butterflies_tr
 */
void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 out[GFBITS], vec256 in[][ GFBITS ]) { | |||
butterflies_tr(out, in); | |||
radix_conversions_tr(out); | |||
} | |||
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_TR_H | |||
#define PQCLEAN_MCELIECE348864_AVX_FFT_TR_H | |||
/* | |||
This file is for transpose of the Gao-Mateer FFT | |||
*/ | |||
#include "params.h" | |||
#include "vec256.h" | |||
/*
 * Transposed FFT: writes GFBITS vec128 words to `out` from the
 * vec256[GFBITS] rows in `in`.
 * NOTE(review): vec128 is not declared by the headers included here
 * directly; presumably vec256.h pulls it in - confirm.
 */
void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 * /*out*/, vec256 /*in*/[][ GFBITS ]); | |||
#endif | |||
@@ -0,0 +1,169 @@ | |||
/* | |||
This file is for functions for field arithmetic | |||
*/ | |||
#include "gf.h" | |||
#include "params.h" | |||
/*
 * Branch-free zero test.
 * Returns 0x0FFF when a == 0 and 0 for every other input: only a == 0 makes
 * (a - 1) wrap to all-ones in 32 bits, and the shift keeps its top 12 bits.
 */
gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf a) {
    const uint32_t wrapped = (uint32_t)a - 1u;

    return (gf)(wrapped >> 20);
}
/* Field addition: the bitwise XOR of the two operands. */
gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf in0, gf in1) {
    const gf sum = (gf)(in0 ^ in1);

    return sum;
}
/*
 * Field multiplication of in0 and in1.
 * Step 1: carry-less (XOR) schoolbook product over the low GFBITS bits of in1.
 * Step 2: fold the bits at position >= 12 back down by 9 and by 12 places,
 *         i.e. reduce using x^12 = x^3 + 1; the second fold handles the bits
 *         that the first fold pushed back up to positions 12..13.
 * Returns the low GFBITS bits of the reduced product.
 */
gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf in0, gf in1) {
    const uint32_t lhs = in0;
    const uint32_t rhs = in1;
    uint32_t acc = 0;
    uint32_t high;
    int bit;

    for (bit = 0; bit < GFBITS; bit++) {
        acc ^= lhs * (rhs & ((uint32_t)1 << bit));
    }

    high = acc & 0x7FC000;
    acc ^= (high >> 9) ^ (high >> 12);

    high = acc & 0x3000;
    acc ^= (high >> 9) ^ (high >> 12);

    return (gf)(acc & (((uint32_t)1 << GFBITS) - 1));
}
/* input: field element in */ | |||
/* return: in^2 */ | |||
static inline gf gf_sq(gf in) { | |||
const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; | |||
uint32_t x = in; | |||
uint32_t t; | |||
x = (x | (x << 8)) & B[3]; | |||
x = (x | (x << 4)) & B[2]; | |||
x = (x | (x << 2)) & B[1]; | |||
x = (x | (x << 1)) & B[0]; | |||
t = x & 0x7FC000; | |||
x ^= t >> 9; | |||
x ^= t >> 12; | |||
t = x & 0x3000; | |||
x ^= t >> 9; | |||
x ^= t >> 12; | |||
return x & ((1 << GFBITS) - 1); | |||
} | |||
/*
 * Raises `in` to the power 2^12 - 2 (binary exponent 111111111110) with a
 * fixed sequence of squarings and multiplications; the sequence does not
 * depend on the value of `in`.
 * The trailing comments give the exponent reached so far, in binary.
 */
gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf in) {
    gf pow_11;
    gf pow_1111;
    gf acc;
    int k;

    pow_11 = PQCLEAN_MCELIECE348864_AVX_gf_mul(gf_sq(in), in);                 // 11
    pow_1111 = PQCLEAN_MCELIECE348864_AVX_gf_mul(gf_sq(gf_sq(pow_11)), pow_11); // 1111

    acc = pow_1111;
    for (k = 0; k < 4; k++) {
        acc = gf_sq(acc);
    }
    acc = PQCLEAN_MCELIECE348864_AVX_gf_mul(acc, pow_1111);                    // 11111111

    acc = gf_sq(gf_sq(acc));
    acc = PQCLEAN_MCELIECE348864_AVX_gf_mul(acc, pow_11);                      // 1111111111

    acc = PQCLEAN_MCELIECE348864_AVX_gf_mul(gf_sq(acc), in);                   // 11111111111

    return gf_sq(acc);                                                         // 111111111110
}
/* input: field element den, num */ | |||
/* return: (num/den) */ | |||
gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf den, gf num) { | |||
return PQCLEAN_MCELIECE348864_AVX_gf_mul(PQCLEAN_MCELIECE348864_AVX_gf_inv(den), num); | |||
} | |||
/* input: in0, in1 in GF((2^m)^t)*/ | |||
/* output: out = in0*in1 */ | |||
void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { | |||
int i, j; | |||
gf prod[ SYS_T * 2 - 1 ]; | |||
for (i = 0; i < SYS_T * 2 - 1; i++) { | |||
prod[i] = 0; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
for (j = 0; j < SYS_T; j++) { | |||
prod[i + j] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(in0[i], in1[j]); | |||
} | |||
} | |||
// | |||
for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { | |||
prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 877); | |||
prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 2888); | |||
prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 1781); | |||
prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 373); | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
out[i] = prod[i]; | |||
} | |||
} | |||
/* 2 field multiplications */ | |||
uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1) { | |||
int i; | |||
uint64_t tmp = 0; | |||
uint64_t t0; | |||
uint64_t t1; | |||
uint64_t t; | |||
uint64_t mask = 0x0000000100000001; | |||
t0 = a; | |||
t1 = b1; | |||
t1 = (t1 << 32) | b0; | |||
for (i = 0; i < GFBITS; i++) { | |||
tmp ^= t0 * (t1 & mask); | |||
mask += mask; | |||
} | |||
// | |||
t = tmp & 0x007FC000007FC000; | |||
tmp ^= (t >> 9) ^ (t >> 12); | |||
t = tmp & 0x0000300000003000; | |||
tmp ^= (t >> 9) ^ (t >> 12); | |||
return tmp & 0x00000FFF00000FFF; | |||
} | |||
@@ -0,0 +1,26 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_GF_H | |||
#define PQCLEAN_MCELIECE348864_AVX_GF_H | |||
/* | |||
This file is for functions for field arithmetic | |||
*/ | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* A field element, stored in the low GFBITS bits of a 16-bit word. */
typedef uint16_t gf; | |||
/* Returns 0x0FFF if a == 0, otherwise 0 (branch-free). */
gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf /*a*/); | |||
/* Field addition (bitwise XOR). */
gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf /*in0*/, gf /*in1*/); | |||
/* Field multiplication; the result is masked to GFBITS bits. */
gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf /*in0*/, gf /*in1*/); | |||
/* Returns num/den, computed as gf_inv(den) * num. */
gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf /*den*/, gf /*num*/); | |||
/* Returns in^(2^12 - 2), computed with a fixed square-and-multiply chain. */
gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf /*in*/); | |||
/* Multiplies two SYS_T-coefficient elements; out may alias an input. */
void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); | |||
/* 2 field multiplications */ | |||
/* Result packs a*b0 in bits 0..11 and a*b1 in bits 32..43. */
uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1); | |||
#endif | |||
@@ -0,0 +1,9 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H | |||
#define PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/*
 * Sort routine over n int32_t values starting at x.
 * NOTE(review): presumably sorts in place in ascending order; the
 * implementation is not visible from this header - confirm.
 */
void PQCLEAN_MCELIECE348864_AVX_int32_sort(int32_t *x, size_t n); | |||
#endif |
@@ -0,0 +1,136 @@ | |||
#include "api.h" | |||
#include "aes256ctr.h" | |||
#include "controlbits.h" | |||
#include "crypto_hash.h" | |||
#include "decrypt.h" | |||
#include "encrypt.h" | |||
#include "params.h" | |||
#include "pk_gen.h" | |||
#include "randombytes.h" | |||
#include "sk_gen.h" | |||
#include "util.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc( | |||
uint8_t *c, | |||
uint8_t *key, | |||
const uint8_t *pk | |||
) { | |||
uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; | |||
uint8_t *e = two_e + 1; | |||
uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; | |||
PQCLEAN_MCELIECE348864_AVX_encrypt(c, e, pk); | |||
crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); | |||
memcpy(one_ec + 1, e, SYS_N / 8); | |||
memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); | |||
crypto_hash_32b(key, one_ec, sizeof(one_ec)); | |||
return 0; | |||
} | |||
int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec( | |||
uint8_t *key, | |||
const uint8_t *c, | |||
const uint8_t *sk | |||
) { | |||
int i; | |||
uint8_t ret_confirm = 0; | |||
uint8_t ret_decrypt = 0; | |||
uint16_t m; | |||
uint8_t conf[32]; | |||
uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; | |||
uint8_t *e = two_e + 1; | |||
uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; | |||
uint8_t *x = preimage; | |||
// | |||
ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_AVX_decrypt(e, sk + SYS_N / 8, c); | |||
crypto_hash_32b(conf, two_e, sizeof(two_e)); | |||
for (i = 0; i < 32; i++) { | |||
ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; | |||
} | |||
m = ret_decrypt | ret_confirm; | |||
m -= 1; | |||
m >>= 8; | |||
*x++ = (~m & 0) | (m & 1); | |||
for (i = 0; i < SYS_N / 8; i++) { | |||
*x++ = (~m & sk[i]) | (m & e[i]); | |||
} | |||
for (i = 0; i < SYND_BYTES + 32; i++) { | |||
*x++ = c[i]; | |||
} | |||
crypto_hash_32b(key, preimage, sizeof(preimage)); | |||
return 0; | |||
} | |||
int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair | |||
( | |||
uint8_t *pk, | |||
uint8_t *sk | |||
) { | |||
int i; | |||
uint8_t seed[ 32 ]; | |||
uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; | |||
uint8_t nonce[ 16 ] = {0}; | |||
uint8_t *rp; | |||
gf f[ SYS_T ]; // element in GF(2^mt) | |||
gf irr[ SYS_T ]; // Goppa polynomial | |||
uint32_t perm[ 1 << GFBITS ]; // random permutation | |||
randombytes(seed, sizeof(seed)); | |||
while (1) { | |||
rp = r; | |||
PQCLEAN_MCELIECE348864_AVX_aes256ctr(r, sizeof(r), nonce, seed); | |||
memcpy(seed, &r[ sizeof(r) - 32 ], 32); | |||
for (i = 0; i < SYS_T; i++) { | |||
f[i] = PQCLEAN_MCELIECE348864_AVX_load2(rp + i * 2); | |||
} | |||
rp += sizeof(f); | |||
if (PQCLEAN_MCELIECE348864_AVX_genpoly_gen(irr, f)) { | |||
continue; | |||
} | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
perm[i] = PQCLEAN_MCELIECE348864_AVX_load4(rp + i * 4); | |||
} | |||
rp += sizeof(perm); | |||
if (PQCLEAN_MCELIECE348864_AVX_perm_check(perm)) { | |||
continue; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
PQCLEAN_MCELIECE348864_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); | |||
} | |||
if (PQCLEAN_MCELIECE348864_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { | |||
continue; | |||
} | |||
memcpy(sk, rp, SYS_N / 8); | |||
PQCLEAN_MCELIECE348864_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); | |||
break; | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,21 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_PARAMS_H | |||
#define PQCLEAN_MCELIECE348864_AVX_PARAMS_H | |||
/* Base parameters; every other macro below is derived from these three. */
#define GFBITS 12 | |||
#define SYS_N 3488 | |||
#define SYS_T 64 | |||
/* 2^(GFBITS-4) * (2*GFBITS - 1) = 256 * 23 = 5888 */
#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) | |||
/* two bytes per coefficient, SYS_T coefficients = 128 */
#define IRR_BYTES (SYS_T * 2) | |||
/* 64 * 12 = 768 */
#define PK_NROWS (SYS_T*GFBITS) | |||
/* 3488 - 768 = 2720 */
#define PK_NCOLS (SYS_N - PK_NROWS) | |||
/* 2720 bits rounded up to bytes = 340 */
#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) | |||
/* 436 + 128 + 5888 = 6452 */
#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) | |||
/* 768 bits rounded up to bytes = 96 */
#define SYND_BYTES ((PK_NROWS + 7)/8) | |||
/* mask selecting the low GFBITS bits = 0x0FFF */
#define GFMASK ((1 << GFBITS) - 1) | |||
#endif | |||
@@ -0,0 +1,276 @@ | |||
/* | |||
This file is for public-key generation | |||
*/ | |||
#include "pk_gen.h" | |||
#include "benes.h" | |||
#include "controlbits.h" | |||
#include "fft.h" | |||
#include "params.h" | |||
#include "util.h" | |||
#include <stdint.h> | |||
/*
 * Helper for de_bitslicing: shifts each of the 64 words at dst left by one
 * and appends bit r of `bits` as the new lowest bit of dst[r].
 */
static void de_bitslicing_lane(uint64_t *dst, uint64_t bits) {
    int r;

    for (r = 0; r < 64; r++) {
        dst[r] = (dst[r] << 1) | ((bits >> r) & 1);
    }
}

/*
 * Converts 16 bitsliced groups (GFBITS vec256 words each, 256 elements per
 * group) into 1 << GFBITS ordinary integers, one element per output word.
 * Bit planes are consumed from GFBITS-1 down to 0, so plane j ends up as
 * bit j of every output word.
 * The lane index passed to the extract call is kept as a literal constant.
 */
static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) {
    int blk, plane;

    for (blk = 0; blk < (1 << GFBITS); blk++) {
        out[blk] = 0;
    }

    for (blk = 0; blk < 16; blk++) {
        uint64_t *base = out + blk * 256;

        for (plane = GFBITS - 1; plane >= 0; plane--) {
            de_bitslicing_lane(base + 0 * 64, PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[blk][plane], 0));
            de_bitslicing_lane(base + 1 * 64, PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[blk][plane], 1));
            de_bitslicing_lane(base + 2 * 64, PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[blk][plane], 2));
            de_bitslicing_lane(base + 3 * 64, PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[blk][plane], 3));
        }
    }
}
/*
 * Helper for to_bitslicing_2x: collects bit `bit` of src[0..63] into one
 * 64-bit word, with src[r] landing at bit position r.
 */
static uint64_t to_bitslicing_gather(const uint64_t *src, int bit) {
    uint64_t word = 0;
    int r;

    for (r = 63; r >= 0; r--) {
        word = (word << 1) | ((src[r] >> bit) & 1);
    }
    return word;
}

/*
 * Bitslices two GFBITS-wide fields out of each of the 1 << GFBITS input
 * words, 256 words per group:
 *   out1[g][j]              <- bit (j + GFBITS) of the words in group g
 *   out0[g][GFBITS - 1 - j] <- bit j of the words in group g
 * so the low field is stored with its bit planes in reversed order.
 */
static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) {
    int blk, plane, lane;
    uint64_t w[4];

    for (blk = 0; blk < 16; blk++) {
        const uint64_t *src = in + blk * 256;

        for (plane = GFBITS - 1; plane >= 0; plane--) {
            for (lane = 0; lane < 4; lane++) {
                w[lane] = to_bitslicing_gather(src + lane * 64, plane + GFBITS);
            }
            out1[blk][plane] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(w[0], w[1], w[2], w[3]);
        }

        for (plane = GFBITS - 1; plane >= 0; plane--) {
            for (lane = 0; lane < 4; lane++) {
                w[lane] = to_bitslicing_gather(src + lane * 64, plane);
            }
            out0[blk][GFBITS - 1 - plane] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(w[0], w[1], w[2], w[3]);
        }
    }
}
/* Column-block counts. "_H" covers all SYS_N columns, "_I" covers only the
 * leading GFBITS*SYS_T columns; "1" counts 64-bit words, "2" counts 256-bit
 * blocks. With the parameters in params.h: 55, 14, 12 and 3. */
#define NBLOCKS1_H ((SYS_N + 63) / 64) | |||
#define NBLOCKS2_H ((SYS_N + 255) / 256) | |||
#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) | |||
#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) | |||
/*
 * Public-key generation.
 *
 * pk:   output, PK_NROWS rows of PK_ROW_BYTES bytes each
 * perm: in/out; on entry the 32-bit values used as sort keys, on exit
 *       perm[i] is the original index of the i-th element in sorted order
 * sk:   bytes handed to irr_load (the keypair routine passes the polynomial
 *       part of the secret key here)
 *
 * Returns 0 on success and -1 when a pivot is missing, i.e. the leading
 * PK_NROWS x PK_NROWS block cannot be brought to systematic form.
 *
 * Note: mat (768 x 56 words, about 336 KiB) and ops (768 x 12 words, 72 KiB)
 * are automatic arrays, so this function needs a large stack.
 */
int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { | |||
/* first 64-bit word of each row that belongs to the stored (non-identity) part */
const int block_idx = NBLOCKS1_I; | |||
int i, j, k; | |||
int row, c; | |||
/* the matrix being built, one row per output bit, 64 columns per word */
uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; | |||
/* row operations applied during elimination; starts as the identity */
uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; | |||
uint64_t mask; | |||
uint64_t sk_int[ GFBITS ]; | |||
vec256 consts[ 16 ][ GFBITS ]; | |||
vec256 eval[ 16 ][ GFBITS ]; | |||
vec256 prod[ 16 ][ GFBITS ]; | |||
vec256 tmp[ GFBITS ]; | |||
uint64_t list[1 << GFBITS]; | |||
uint64_t one_row[ 128 ]; | |||
// compute the inverses
// (batch inversion: prefix products of eval[], one inversion of the total,
// then walk backwards so that prod[i] ends up as the inverse of eval[i])
PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk); | |||
PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int); | |||
PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], eval[0]); | |||
for (i = 1; i < 16; i++) { | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, prod[15]); | |||
for (i = 14; i >= 0; i--) { | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i + 1], prod[i], tmp); | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]); | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], tmp); | |||
// fill matrix
// Each list entry packs: perm[i] from bit 31 up (the sort key), the inverse
// in bits GFBITS..2*GFBITS-1 and the index i in the low GFBITS bits.
de_bitslicing(list, prod); | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
list[i] <<= GFBITS; | |||
list[i] |= i; | |||
list[i] |= ((uint64_t) perm[i]) << 31; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list); | |||
// after sorting: consts <- low field (index), prod <- next field (inverse)
to_bitslicing_2x(consts, prod, list); | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
perm[i] = list[i] & GFMASK; | |||
} | |||
// rows 0..GFBITS-1 for the leading NBLOCKS2_I column blocks
for (j = 0; j < NBLOCKS2_I; j++) { | |||
for (k = 0; k < GFBITS; k++) { | |||
mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); | |||
mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); | |||
mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); | |||
mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); | |||
} | |||
} | |||
// each further group of GFBITS rows is the previous group times consts
for (i = 1; i < SYS_T; i++) { | |||
for (j = 0; j < NBLOCKS2_I; j++) { | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]); | |||
for (k = 0; k < GFBITS; k++) { | |||
mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); | |||
mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); | |||
mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); | |||
mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); | |||
} | |||
} | |||
} | |||
// gaussian elimination to obtain an upper triangular matrix
// and keep track of the operations in ops
for (i = 0; i < PK_NROWS; i++) { | |||
for (j = 0; j < NBLOCKS1_I; j++) { | |||
ops[ i ][ j ] = 0; | |||
} | |||
} | |||
// ops starts as the identity: bit i of row i
for (i = 0; i < PK_NROWS; i++) { | |||
ops[ i ][ i / 64 ] = 1; | |||
ops[ i ][ i / 64 ] <<= (i % 64); | |||
} | |||
for (row = 0; row < PK_NROWS; row++) { | |||
i = row >> 6; | |||
j = row & 63; | |||
// pivot search without branching: mask is all-ones only while the pivot
// bit of `row` is still 0, in which case row k is added into it
for (k = row + 1; k < PK_NROWS; k++) { | |||
mask = mat[ row ][ i ] >> j; | |||
mask &= 1; | |||
mask -= 1; | |||
for (c = 0; c < NBLOCKS1_I; c++) { | |||
mat[ row ][ c ] ^= mat[ k ][ c ] & mask; | |||
ops[ row ][ c ] ^= ops[ k ][ c ] & mask; | |||
} | |||
} | |||
if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic | |||
return -1; | |||
} | |||
// clear the pivot column in every row below
for (k = row + 1; k < PK_NROWS; k++) { | |||
mask = mat[ k ][ i ] >> j; | |||
mask &= 1; | |||
mask = -mask; | |||
for (c = 0; c < NBLOCKS1_I; c++) { | |||
mat[ k ][ c ] ^= mat[ row ][ c ] & mask; | |||
ops[ k ][ c ] ^= ops[ row ][ c ] & mask; | |||
} | |||
} | |||
} | |||
// compute the linear map required to obtain the systematic form
// (back-substitution; only ops is updated here, mat is left upper triangular)
for (row = PK_NROWS - 1; row >= 0; row--) { | |||
for (k = 0; k < row; k++) { | |||
mask = mat[ k ][ row / 64 ] >> (row & 63); | |||
mask &= 1; | |||
mask = -mask; | |||
for (c = 0; c < NBLOCKS1_I; c++) { | |||
ops[ k ][ c ] ^= ops[ row ][ c ] & mask; | |||
} | |||
} | |||
} | |||
// apply the linear map to the non-systematic part
// First build the remaining column blocks of mat; prod[j] for these j was
// not touched by the earlier fill loops.
for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { | |||
for (k = 0; k < GFBITS; k++) { | |||
mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); | |||
mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); | |||
mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); | |||
mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); | |||
} | |||
} | |||
for (i = 1; i < SYS_T; i++) { | |||
for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { | |||
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]); | |||
for (k = 0; k < GFBITS; k++) { | |||
mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); | |||
mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); | |||
mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); | |||
mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); | |||
} | |||
} | |||
} | |||
// Each output row is the XOR of the mat rows selected by the bits of
// ops[row], restricted to the words from block_idx onwards.
for (row = 0; row < PK_NROWS; row++) { | |||
for (k = 0; k < NBLOCKS1_H; k++) { | |||
one_row[ k ] = 0; | |||
} | |||
for (c = 0; c < PK_NROWS; c++) { | |||
mask = ops[ row ][ c >> 6 ] >> (c & 63); | |||
mask &= 1; | |||
mask = -mask; | |||
for (k = block_idx; k < NBLOCKS1_H; k++) { | |||
one_row[ k ] ^= mat[ c ][ k ] & mask; | |||
} | |||
} | |||
// full 8-byte words first, then the PK_ROW_BYTES % 8 bytes of the last word
for (k = block_idx; k < NBLOCKS1_H - 1; k++) { | |||
PQCLEAN_MCELIECE348864_AVX_store8(pk, one_row[k]); | |||
pk += 8; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); | |||
pk += PK_ROW_BYTES % 8; | |||
} | |||
// | |||
return 0; | |||
} | |||
@@ -0,0 +1,13 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_PK_GEN_H | |||
#define PQCLEAN_MCELIECE348864_AVX_PK_GEN_H | |||
/* | |||
This file is for public-key generation | |||
*/ | |||
#include "gf.h" | |||
/*
 * Writes the public key to pk, rewrites perm in place, and reads the
 * polynomial bytes from sk. Returns 0 on success, -1 if the matrix cannot
 * be brought to systematic form.
 */
int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); | |||
#endif | |||
@@ -0,0 +1,224 @@ | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), | |||
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), | |||
}, |
@@ -0,0 +1,70 @@ | |||
{ | |||
0XF3CFC030FC30F003, | |||
0X3FCF0F003C00C00C, | |||
0X30033CC300C0C03C, | |||
0XCCFF0F3C0F30F0C0, | |||
0X0300C03FF303C3F0, | |||
0X3FFF3C0FF0CCCCC0, | |||
0XF3FFF0C00F3C3CC0, | |||
0X3003333FFFC3C000, | |||
0X0FF30FFFC3FFF300, | |||
0XFFC0F300F0F0CC00, | |||
0XC0CFF3FCCC3CFC00, | |||
0XFC3C03F0F330C000, | |||
}, | |||
{ | |||
0X000F00000000F00F, | |||
0X00000F00F00000F0, | |||
0X0F00000F00000F00, | |||
0XF00F00F00F000000, | |||
0X00F00000000000F0, | |||
0X0000000F00000000, | |||
0XF00000000F00F000, | |||
0X00F00F00000F0000, | |||
0X0000F00000F00F00, | |||
0X000F00F00F00F000, | |||
0X00F00F0000000000, | |||
0X0000000000F00000, | |||
}, | |||
{ | |||
0X0000FF00FF0000FF, | |||
0X0000FF000000FF00, | |||
0XFF0000FF00FF0000, | |||
0XFFFF0000FF000000, | |||
0X00FF00FF00FF0000, | |||
0X0000FFFFFF000000, | |||
0X00FFFF00FF000000, | |||
0XFFFFFF0000FF0000, | |||
0XFFFF00FFFF00FF00, | |||
0X0000FF0000000000, | |||
0XFFFFFF00FF000000, | |||
0X00FF000000000000, | |||
}, | |||
{ | |||
0X000000000000FFFF, | |||
0X00000000FFFF0000, | |||
0X0000000000000000, | |||
0XFFFF000000000000, | |||
0X00000000FFFF0000, | |||
0X0000FFFF00000000, | |||
0X0000000000000000, | |||
0X00000000FFFF0000, | |||
0X0000FFFF00000000, | |||
0X0000000000000000, | |||
0X0000000000000000, | |||
0X0000000000000000, | |||
}, | |||
{ | |||
0X00000000FFFFFFFF, | |||
0XFFFFFFFF00000000, | |||
0XFFFFFFFF00000000, | |||
0X0000000000000000, | |||
0X0000000000000000, | |||
0XFFFFFFFF00000000, | |||
0X0000000000000000, | |||
0X0000000000000000, | |||
0XFFFFFFFF00000000, | |||
0X0000000000000000, | |||
0X0000000000000000, | |||
0X0000000000000000, | |||
} |
@@ -0,0 +1,70 @@ | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000000000000ffff, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), | |||
}, | |||
{ | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), | |||
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), | |||
}, |
@@ -0,0 +1,98 @@ | |||
/* | |||
This file is for secret-key generation | |||
*/ | |||
#include "sk_gen.h" | |||
#include "controlbits.h" | |||
#include "gf.h" | |||
#include "params.h" | |||
#include "util.h" | |||
/* input: f, element in GF((2^m)^t) */ | |||
/* output: out, minimal polynomial of f */ | |||
/* return: 0 for success and -1 for failure */ | |||
int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf *out, gf *f) { | |||
int i, j, k, c; | |||
gf mat[ SYS_T + 1 ][ SYS_T ]; | |||
gf mask, inv, t; | |||
// fill matrix | |||
mat[0][0] = 1; | |||
for (i = 1; i < SYS_T; i++) { | |||
mat[0][i] = 0; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
mat[1][i] = f[i]; | |||
} | |||
for (j = 2; j <= SYS_T; j++) { | |||
PQCLEAN_MCELIECE348864_AVX_GF_mul(mat[j], mat[j - 1], f); | |||
} | |||
// gaussian | |||
for (j = 0; j < SYS_T; j++) { | |||
for (k = j + 1; k < SYS_T; k++) { | |||
mask = PQCLEAN_MCELIECE348864_AVX_gf_iszero(mat[ j ][ j ]); | |||
for (c = j; c < SYS_T + 1; c++) { | |||
mat[ c ][ j ] ^= mat[ c ][ k ] & mask; | |||
} | |||
} | |||
if ( mat[ j ][ j ] == 0 ) { // return if not systematic | |||
return -1; | |||
} | |||
inv = PQCLEAN_MCELIECE348864_AVX_gf_inv(mat[j][j]); | |||
for (c = j; c < SYS_T + 1; c++) { | |||
mat[ c ][ j ] = PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], inv) ; | |||
} | |||
for (k = 0; k < SYS_T; k++) { | |||
if (k != j) { | |||
t = mat[ j ][ k ]; | |||
for (c = j; c < SYS_T + 1; c++) { | |||
mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], t); | |||
} | |||
} | |||
} | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
out[i] = mat[ SYS_T ][ i ]; | |||
} | |||
return 0; | |||
} | |||
/* input: permutation p represented as a list of 32-bit intergers */ | |||
/* output: -1 if some interger repeats in p */ | |||
/* 0 otherwise */ | |||
int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t *p) { | |||
int i; | |||
uint64_t list[1 << GFBITS]; | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
list[i] = p[i]; | |||
} | |||
PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list); | |||
for (i = 1; i < (1 << GFBITS); i++) { | |||
if (list[i - 1] == list[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,16 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_SK_GEN_H | |||
#define PQCLEAN_MCELIECE348864_AVX_SK_GEN_H | |||
/* | |||
This file is for secret-key generation | |||
*/ | |||
#include "gf.h" | |||
#include <stdint.h> | |||
/* input:  f, element in GF((2^m)^t), SYS_T entries are read        */
/* output: out, minimal polynomial of f, SYS_T entries are written  */
/* return: 0 for success and -1 for failure                         */
int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); | |||
/* input:  p, list of (1 << GFBITS) 32-bit integers                 */
/* return: -1 if some integer repeats in p, 0 otherwise             */
int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t * /*p*/); | |||
#endif | |||
@@ -0,0 +1,530 @@ | |||
# qhasm: int64 input_0 | |||
# qhasm: int64 input_1 | |||
# qhasm: int64 input_2 | |||
# qhasm: int64 input_3 | |||
# qhasm: int64 input_4 | |||
# qhasm: int64 input_5 | |||
# qhasm: stack64 input_6 | |||
# qhasm: stack64 input_7 | |||
# qhasm: int64 caller_r11 | |||
# qhasm: int64 caller_r12 | |||
# qhasm: int64 caller_r13 | |||
# qhasm: int64 caller_r14 | |||
# qhasm: int64 caller_r15 | |||
# qhasm: int64 caller_rbx | |||
# qhasm: int64 caller_rbp | |||
# qhasm: int64 b64 | |||
# qhasm: int64 synd | |||
# qhasm: int64 addr | |||
# qhasm: int64 c | |||
# qhasm: int64 c_all | |||
# qhasm: int64 row | |||
# qhasm: int64 p | |||
# qhasm: int64 e | |||
# qhasm: int64 s | |||
# qhasm: reg256 pp | |||
# qhasm: reg256 ee | |||
# qhasm: reg256 ss | |||
# qhasm: int64 buf_ptr | |||
# qhasm: stack256 buf | |||
# qhasm: enter syndrome_asm | |||
.p2align 5 | |||
.global _PQCLEAN_MCELIECE348864_AVX_syndrome_asm | |||
.global PQCLEAN_MCELIECE348864_AVX_syndrome_asm | |||
_PQCLEAN_MCELIECE348864_AVX_syndrome_asm: | |||
PQCLEAN_MCELIECE348864_AVX_syndrome_asm: | |||
mov %rsp,%r11 | |||
and $31,%r11 | |||
add $32,%r11 | |||
sub %r11,%rsp | |||
# qhasm: input_1 += 260780 | |||
# asm 1: add $260780,<input_1=int64#2 | |||
# asm 2: add $260780,<input_1=%rsi | |||
add $260780,%rsi | |||
# qhasm: buf_ptr = &buf | |||
# asm 1: leaq <buf=stack256#1,>buf_ptr=int64#4 | |||
# asm 2: leaq <buf=0(%rsp),>buf_ptr=%rcx | |||
leaq 0(%rsp),%rcx | |||
# qhasm: row = 768 | |||
# asm 1: mov $768,>row=int64#5 | |||
# asm 2: mov $768,>row=%r8 | |||
mov $768,%r8 | |||
# qhasm: loop: | |||
._loop: | |||
# qhasm: row -= 1 | |||
# asm 1: sub $1,<row=int64#5 | |||
# asm 2: sub $1,<row=%r8 | |||
sub $1,%r8 | |||
# qhasm: ss = mem256[ input_1 + 0 ] | |||
# asm 1: vmovupd 0(<input_1=int64#2),>ss=reg256#1 | |||
# asm 2: vmovupd 0(<input_1=%rsi),>ss=%ymm0 | |||
vmovupd 0(%rsi),%ymm0 | |||
# qhasm: ee = mem256[ input_2 + 96 ] | |||
# asm 1: vmovupd 96(<input_2=int64#3),>ee=reg256#2 | |||
# asm 2: vmovupd 96(<input_2=%rdx),>ee=%ymm1 | |||
vmovupd 96(%rdx),%ymm1 | |||
# qhasm: ss &= ee | |||
# asm 1: vpand <ee=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpand <ee=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpand %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 32 ] | |||
# asm 1: vmovupd 32(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 32(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 32(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 128 ] | |||
# asm 1: vmovupd 128(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 128(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 128(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 64 ] | |||
# asm 1: vmovupd 64(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 64(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 64(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 160 ] | |||
# asm 1: vmovupd 160(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 160(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 160(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 96 ] | |||
# asm 1: vmovupd 96(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 96(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 96(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 192 ] | |||
# asm 1: vmovupd 192(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 192(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 192(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 128 ] | |||
# asm 1: vmovupd 128(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 128(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 128(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 224 ] | |||
# asm 1: vmovupd 224(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 224(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 224(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 160 ] | |||
# asm 1: vmovupd 160(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 160(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 160(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 256 ] | |||
# asm 1: vmovupd 256(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 256(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 256(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 192 ] | |||
# asm 1: vmovupd 192(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 192(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 192(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 288 ] | |||
# asm 1: vmovupd 288(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 288(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 288(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 224 ] | |||
# asm 1: vmovupd 224(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 224(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 224(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 320 ] | |||
# asm 1: vmovupd 320(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 320(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 320(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 256 ] | |||
# asm 1: vmovupd 256(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 256(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 256(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 352 ] | |||
# asm 1: vmovupd 352(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 352(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 352(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: pp = mem256[ input_1 + 288 ] | |||
# asm 1: vmovupd 288(<input_1=int64#2),>pp=reg256#2 | |||
# asm 2: vmovupd 288(<input_1=%rsi),>pp=%ymm1 | |||
vmovupd 288(%rsi),%ymm1 | |||
# qhasm: ee = mem256[ input_2 + 384 ] | |||
# asm 1: vmovupd 384(<input_2=int64#3),>ee=reg256#3 | |||
# asm 2: vmovupd 384(<input_2=%rdx),>ee=%ymm2 | |||
vmovupd 384(%rdx),%ymm2 | |||
# qhasm: pp &= ee | |||
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2 | |||
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1 | |||
vpand %ymm2,%ymm1,%ymm1 | |||
# qhasm: ss ^= pp | |||
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: buf = ss | |||
# asm 1: vmovapd <ss=reg256#1,>buf=stack256#1 | |||
# asm 2: vmovapd <ss=%ymm0,>buf=0(%rsp) | |||
vmovapd %ymm0,0(%rsp) | |||
# qhasm: s = *(uint64 *)(input_1 + 320) | |||
# asm 1: movq 320(<input_1=int64#2),>s=int64#6 | |||
# asm 2: movq 320(<input_1=%rsi),>s=%r9 | |||
movq 320(%rsi),%r9 | |||
# qhasm: e = *(uint64 *)(input_2 + 416) | |||
# asm 1: movq 416(<input_2=int64#3),>e=int64#7 | |||
# asm 2: movq 416(<input_2=%rdx),>e=%rax | |||
movq 416(%rdx),%rax | |||
# qhasm: s &= e | |||
# asm 1: and <e=int64#7,<s=int64#6 | |||
# asm 2: and <e=%rax,<s=%r9 | |||
and %rax,%r9 | |||
# qhasm: p = *(uint64 *)(input_1 + 328) | |||
# asm 1: movq 328(<input_1=int64#2),>p=int64#7 | |||
# asm 2: movq 328(<input_1=%rsi),>p=%rax | |||
movq 328(%rsi),%rax | |||
# qhasm: e = *(uint64 *)(input_2 + 424) | |||
# asm 1: movq 424(<input_2=int64#3),>e=int64#8 | |||
# asm 2: movq 424(<input_2=%rdx),>e=%r10 | |||
movq 424(%rdx),%r10 | |||
# qhasm: p &= e | |||
# asm 1: and <e=int64#8,<p=int64#7 | |||
# asm 2: and <e=%r10,<p=%rax | |||
and %r10,%rax | |||
# qhasm: s ^= p | |||
# asm 1: xor <p=int64#7,<s=int64#6 | |||
# asm 2: xor <p=%rax,<s=%r9 | |||
xor %rax,%r9 | |||
# qhasm: p = *(uint32 *)(input_1 + 336) | |||
# asm 1: movl 336(<input_1=int64#2),>p=int64#7d | |||
# asm 2: movl 336(<input_1=%rsi),>p=%eax | |||
movl 336(%rsi),%eax | |||
# qhasm: e = *(uint32 *)(input_2 + 432) | |||
# asm 1: movl 432(<input_2=int64#3),>e=int64#8d | |||
# asm 2: movl 432(<input_2=%rdx),>e=%r10d | |||
movl 432(%rdx),%r10d | |||
# qhasm: p &= e | |||
# asm 1: and <e=int64#8,<p=int64#7 | |||
# asm 2: and <e=%r10,<p=%rax | |||
and %r10,%rax | |||
# qhasm: s ^= p | |||
# asm 1: xor <p=int64#7,<s=int64#6 | |||
# asm 2: xor <p=%rax,<s=%r9 | |||
xor %rax,%r9 | |||
# qhasm: c_all = count(s) | |||
# asm 1: popcnt <s=int64#6, >c_all=int64#6 | |||
# asm 2: popcnt <s=%r9, >c_all=%r9 | |||
popcnt %r9, %r9 | |||
# qhasm: b64 = mem64[ buf_ptr + 0 ] | |||
# asm 1: movq 0(<buf_ptr=int64#4),>b64=int64#7 | |||
# asm 2: movq 0(<buf_ptr=%rcx),>b64=%rax | |||
movq 0(%rcx),%rax | |||
# qhasm: c = count(b64) | |||
# asm 1: popcnt <b64=int64#7, >c=int64#7 | |||
# asm 2: popcnt <b64=%rax, >c=%rax | |||
popcnt %rax, %rax | |||
# qhasm: c_all ^= c | |||
# asm 1: xor <c=int64#7,<c_all=int64#6 | |||
# asm 2: xor <c=%rax,<c_all=%r9 | |||
xor %rax,%r9 | |||
# qhasm: b64 = mem64[ buf_ptr + 8 ] | |||
# asm 1: movq 8(<buf_ptr=int64#4),>b64=int64#7 | |||
# asm 2: movq 8(<buf_ptr=%rcx),>b64=%rax | |||
movq 8(%rcx),%rax | |||
# qhasm: c = count(b64) | |||
# asm 1: popcnt <b64=int64#7, >c=int64#7 | |||
# asm 2: popcnt <b64=%rax, >c=%rax | |||
popcnt %rax, %rax | |||
# qhasm: c_all ^= c | |||
# asm 1: xor <c=int64#7,<c_all=int64#6 | |||
# asm 2: xor <c=%rax,<c_all=%r9 | |||
xor %rax,%r9 | |||
# qhasm: b64 = mem64[ buf_ptr + 16 ] | |||
# asm 1: movq 16(<buf_ptr=int64#4),>b64=int64#7 | |||
# asm 2: movq 16(<buf_ptr=%rcx),>b64=%rax | |||
movq 16(%rcx),%rax | |||
# qhasm: c = count(b64) | |||
# asm 1: popcnt <b64=int64#7, >c=int64#7 | |||
# asm 2: popcnt <b64=%rax, >c=%rax | |||
popcnt %rax, %rax | |||
# qhasm: c_all ^= c | |||
# asm 1: xor <c=int64#7,<c_all=int64#6 | |||
# asm 2: xor <c=%rax,<c_all=%r9 | |||
xor %rax,%r9 | |||
# qhasm: b64 = mem64[ buf_ptr + 24 ] | |||
# asm 1: movq 24(<buf_ptr=int64#4),>b64=int64#7 | |||
# asm 2: movq 24(<buf_ptr=%rcx),>b64=%rax | |||
movq 24(%rcx),%rax | |||
# qhasm: c = count(b64) | |||
# asm 1: popcnt <b64=int64#7, >c=int64#7 | |||
# asm 2: popcnt <b64=%rax, >c=%rax | |||
popcnt %rax, %rax | |||
# qhasm: c_all ^= c | |||
# asm 1: xor <c=int64#7,<c_all=int64#6 | |||
# asm 2: xor <c=%rax,<c_all=%r9 | |||
xor %rax,%r9 | |||
# qhasm: addr = row | |||
# asm 1: mov <row=int64#5,>addr=int64#7 | |||
# asm 2: mov <row=%r8,>addr=%rax | |||
mov %r8,%rax | |||
# qhasm: (uint64) addr >>= 3 | |||
# asm 1: shr $3,<addr=int64#7 | |||
# asm 2: shr $3,<addr=%rax | |||
shr $3,%rax | |||
# qhasm: addr += input_0 | |||
# asm 1: add <input_0=int64#1,<addr=int64#7 | |||
# asm 2: add <input_0=%rdi,<addr=%rax | |||
add %rdi,%rax | |||
# qhasm: synd = *(uint8 *) (addr + 0) | |||
# asm 1: movzbq 0(<addr=int64#7),>synd=int64#8 | |||
# asm 2: movzbq 0(<addr=%rax),>synd=%r10 | |||
movzbq 0(%rax),%r10 | |||
# qhasm: synd <<= 1 | |||
# asm 1: shl $1,<synd=int64#8 | |||
# asm 2: shl $1,<synd=%r10 | |||
shl $1,%r10 | |||
# qhasm: (uint32) c_all &= 1 | |||
# asm 1: and $1,<c_all=int64#6d | |||
# asm 2: and $1,<c_all=%r9d | |||
and $1,%r9d | |||
# qhasm: synd |= c_all | |||
# asm 1: or <c_all=int64#6,<synd=int64#8 | |||
# asm 2: or <c_all=%r9,<synd=%r10 | |||
or %r9,%r10 | |||
# qhasm: *(uint8 *) (addr + 0) = synd | |||
# asm 1: movb <synd=int64#8b,0(<addr=int64#7) | |||
# asm 2: movb <synd=%r10b,0(<addr=%rax) | |||
movb %r10b,0(%rax) | |||
# qhasm: input_1 -= 340 | |||
# asm 1: sub $340,<input_1=int64#2 | |||
# asm 2: sub $340,<input_1=%rsi | |||
sub $340,%rsi | |||
# qhasm: =? row-0 | |||
# asm 1: cmp $0,<row=int64#5 | |||
# asm 2: cmp $0,<row=%r8 | |||
cmp $0,%r8 | |||
# comment:fp stack unchanged by jump | |||
# qhasm: goto loop if != | |||
jne ._loop | |||
# qhasm: ss = mem256[ input_0 + 0 ] | |||
# asm 1: vmovupd 0(<input_0=int64#1),>ss=reg256#1 | |||
# asm 2: vmovupd 0(<input_0=%rdi),>ss=%ymm0 | |||
vmovupd 0(%rdi),%ymm0 | |||
# qhasm: ee = mem256[ input_2 + 0 ] | |||
# asm 1: vmovupd 0(<input_2=int64#3),>ee=reg256#2 | |||
# asm 2: vmovupd 0(<input_2=%rdx),>ee=%ymm1 | |||
vmovupd 0(%rdx),%ymm1 | |||
# qhasm: ss ^= ee | |||
# asm 1: vpxor <ee=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <ee=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: mem256[ input_0 + 0 ] = ss | |||
# asm 1: vmovupd <ss=reg256#1,0(<input_0=int64#1) | |||
# asm 2: vmovupd <ss=%ymm0,0(<input_0=%rdi) | |||
vmovupd %ymm0,0(%rdi) | |||
# qhasm: ss = mem256[ input_0 + 32 ] | |||
# asm 1: vmovupd 32(<input_0=int64#1),>ss=reg256#1 | |||
# asm 2: vmovupd 32(<input_0=%rdi),>ss=%ymm0 | |||
vmovupd 32(%rdi),%ymm0 | |||
# qhasm: ee = mem256[ input_2 + 32 ] | |||
# asm 1: vmovupd 32(<input_2=int64#3),>ee=reg256#2 | |||
# asm 2: vmovupd 32(<input_2=%rdx),>ee=%ymm1 | |||
vmovupd 32(%rdx),%ymm1 | |||
# qhasm: ss ^= ee | |||
# asm 1: vpxor <ee=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <ee=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: mem256[ input_0 + 32 ] = ss | |||
# asm 1: vmovupd <ss=reg256#1,32(<input_0=int64#1) | |||
# asm 2: vmovupd <ss=%ymm0,32(<input_0=%rdi) | |||
vmovupd %ymm0,32(%rdi) | |||
# qhasm: ss = mem256[ input_0 + 64 ] | |||
# asm 1: vmovupd 64(<input_0=int64#1),>ss=reg256#1 | |||
# asm 2: vmovupd 64(<input_0=%rdi),>ss=%ymm0 | |||
vmovupd 64(%rdi),%ymm0 | |||
# qhasm: ee = mem256[ input_2 + 64 ] | |||
# asm 1: vmovupd 64(<input_2=int64#3),>ee=reg256#2 | |||
# asm 2: vmovupd 64(<input_2=%rdx),>ee=%ymm1 | |||
vmovupd 64(%rdx),%ymm1 | |||
# qhasm: ss ^= ee | |||
# asm 1: vpxor <ee=reg256#2,<ss=reg256#1,<ss=reg256#1 | |||
# asm 2: vpxor <ee=%ymm1,<ss=%ymm0,<ss=%ymm0 | |||
vpxor %ymm1,%ymm0,%ymm0 | |||
# qhasm: mem256[ input_0 + 64 ] = ss | |||
# asm 1: vmovupd <ss=reg256#1,64(<input_0=int64#1) | |||
# asm 2: vmovupd <ss=%ymm0,64(<input_0=%rdi) | |||
vmovupd %ymm0,64(%rdi) | |||
# qhasm: return | |||
add %r11,%rsp | |||
ret |
@@ -0,0 +1,17 @@ | |||
#include "transpose.h" | |||
/* | |||
This file is for matrix transposition | |||
*/ | |||
extern void PQCLEAN_MCELIECE348864_AVX_transpose_64x64_asm(uint64_t *); | |||
extern void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm(vec256 *); | |||
/*
 * C entry point for the 64x64 transposition routine: forwards `in`
 * unchanged to PQCLEAN_MCELIECE348864_AVX_transpose_64x64_asm.
 * NOTE(review): the assembly is not visible here; that `in` holds 64 words
 * transposed in place is inferred from the name -- confirm against the .S file.
 */
void PQCLEAN_MCELIECE348864_AVX_transpose_64x64(uint64_t *in) {
    PQCLEAN_MCELIECE348864_AVX_transpose_64x64_asm(in);
}
/*
 * C entry point for the 64x256 transposition routine: forwards `in`
 * unchanged to PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm.
 * NOTE(review): layout and in-place semantics are defined by the assembly,
 * which is not visible here -- confirm against the .S file.
 */
void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(vec256 *in) {
    PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm(in);
}
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_TRANSPOSE_H | |||
#define PQCLEAN_MCELIECE348864_AVX_TRANSPOSE_H | |||
/* | |||
This file is for matrix transposition | |||
*/ | |||
#include "vec256.h" | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_AVX_transpose_64x64(uint64_t *in); | |||
void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(vec256 *in); | |||
#endif | |||
@@ -0,0 +1,18 @@ | |||
#include "uint32_sort.h" | |||
#include "int32_sort.h" | |||
/* can save time by vectorizing xor loops */ | |||
/* can save time by integrating xor loops with int32_sort */ | |||
/*
 * Sorts the n unsigned 32-bit values in x by reusing the signed sorter.
 *
 * Flipping the top bit maps unsigned order onto signed order, so the array
 * is biased, handed to PQCLEAN_MCELIECE348864_AVX_int32_sort, and then
 * un-biased again with the same XOR.
 */
void PQCLEAN_MCELIECE348864_AVX_uint32_sort(uint32_t *x, size_t n) {
    size_t k;

    /* bias: unsigned -> signed ordering */
    for (k = n; k-- > 0;) {
        x[k] ^= 0x80000000;
    }

    PQCLEAN_MCELIECE348864_AVX_int32_sort((int32_t *) x, n);

    /* undo the bias */
    for (k = n; k-- > 0;) {
        x[k] ^= 0x80000000;
    }
}
@@ -0,0 +1,9 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_UINT32_SORT_H | |||
#define PQCLEAN_MCELIECE348864_AVX_UINT32_SORT_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_AVX_uint32_sort(uint32_t *x, size_t n); | |||
#endif |
@@ -0,0 +1,354 @@ | |||
# qhasm: int64 input_0 | |||
# qhasm: int64 input_1 | |||
# qhasm: int64 input_2 | |||
# qhasm: int64 input_3 | |||
# qhasm: int64 input_4 | |||
# qhasm: int64 input_5 | |||
# qhasm: stack64 input_6 | |||
# qhasm: stack64 input_7 | |||
# qhasm: int64 caller_r11 | |||
# qhasm: int64 caller_r12 | |||
# qhasm: int64 caller_r13 | |||
# qhasm: int64 caller_r14 | |||
# qhasm: int64 caller_r15 | |||
# qhasm: int64 caller_rbx | |||
# qhasm: int64 caller_rbp | |||
# qhasm: int64 s0 | |||
# qhasm: int64 s1 | |||
# qhasm: enter update_asm | |||
# update_asm(input_0 = %rdi, input_1 = %rsi, input_2 = %rdx)
#
# Performs 12 identical steps. Each step loads the 64-bit word at (%rdi),
# shifts it right by one bit while pulling the current low bit of %rsi into
# its top bit (shrd), shifts %rsi right by one, stores the word back, and
# advances %rdi by %rdx bytes.
# Clobbers %rcx and %r11; %rsi and %rdi are modified.
.p2align 5
.global _PQCLEAN_MCELIECE348864_AVX_update_asm
.global PQCLEAN_MCELIECE348864_AVX_update_asm
_PQCLEAN_MCELIECE348864_AVX_update_asm:
PQCLEAN_MCELIECE348864_AVX_update_asm:
# qhasm prologue: round %rsp down to a multiple of 32 and remember the
# adjustment in %r11 so the epilogue can undo it (no stack slots are used).
mov %rsp,%r11
and $31,%r11
add $0,%r11
sub %r11,%rsp
# s1 = input_1 (already lives in %rsi; kept to match the generated code)
mov %rsi,%rsi
# The generated code was the following five instructions unrolled 12 times.
.rept 12
# s0 = mem64[ input_0 + 0 ]
movq 0(%rdi),%rcx
# s0 = (s1 s0) >> 1
shrd $1,%rsi,%rcx
# (uint64) s1 >>= 1
shr $1,%rsi
# mem64[ input_0 + 0 ] = s0
movq %rcx,0(%rdi)
# input_0 += input_2
add %rdx,%rdi
.endr
# return: restore the original stack pointer
add %r11,%rsp
ret
@@ -0,0 +1,106 @@ | |||
/* | |||
This file is for loading/storing data in a little-endian fashion | |||
*/ | |||
#include "util.h" | |||
/*
 * Stores the i least-significant bytes of `in` into out[0..i-1],
 * least-significant byte first.
 *
 * out: destination buffer with room for at least i bytes.
 * in:  value to serialize.
 * i:   number of bytes to write; nothing is written when i <= 0.
 *
 * A uint64_t has only 8 bytes and shifting it by 64 or more bits is
 * undefined behavior, so any byte past the 8th is written as zero.
 */
void PQCLEAN_MCELIECE348864_AVX_store_i(unsigned char *out, uint64_t in, int i) {
    int j;

    for (j = 0; j < i; j++) {
        if (j < 8) {
            out[j] = (unsigned char) ((in >> (j * 8)) & 0xFF);
        } else {
            out[j] = 0;
        }
    }
}
/*
 * Writes the field element `a` to dest[0..1], low byte first.
 */
void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a) {
    const unsigned char low = a & 0xFF;
    const unsigned char high = a >> 8;

    dest[0] = low;
    dest[1] = high;
}
uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src) { | |||
uint16_t a; | |||
a = src[1]; | |||
a <<= 8; | |||
a |= src[0]; | |||
return a & GFMASK; | |||
} | |||
/*
 * Reads four bytes from src as a little-endian 32-bit value
 * (src[0] is the least-significant byte).
 */
uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src) {
    uint32_t word = 0;
    int k;

    /* fold in the bytes from most to least significant */
    for (k = 3; k >= 0; k--) {
        word = (word << 8) | src[k];
    }

    return word;
}
void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in) { | |||
int i, j; | |||
uint16_t irr[ SYS_T + 1 ]; | |||
for (i = 0; i < SYS_T; i++) { | |||
irr[i] = PQCLEAN_MCELIECE348864_AVX_load2(in + i * 2); | |||
irr[i] &= GFMASK; | |||
} | |||
irr[ SYS_T ] = 1; | |||
for (i = 0; i < GFBITS; i++) { | |||
out[i] = 0; | |||
} | |||
for (i = SYS_T; i >= 0; i--) { | |||
for (j = 0; j < GFBITS; j++) { | |||
out[j] <<= 1; | |||
out[j] |= (irr[i] >> j) & 1; | |||
} | |||
} | |||
} | |||
/*
 * Writes `in` to out[0..7] in little-endian byte order
 * (out[0] receives the least-significant byte).
 */
void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in) {
    unsigned int k;

    for (k = 0; k < 8; k++) {
        out[k] = (in >> (8 * k)) & 0xFF;
    }
}
/*
 * Reads eight bytes from `in` as a little-endian 64-bit value
 * (in[0] is the least-significant byte).
 */
uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in) {
    uint64_t word = 0;
    int k;

    for (k = 0; k < 8; k++) {
        word |= ((uint64_t) in[k]) << (8 * k);
    }

    return word;
}
/*
 * Reverses the bit order of `a` as a 16-bit word (swap bytes, nibbles,
 * bit pairs, then single bits) and drops the four lowest bits of the
 * result, which leaves the low 12 bits of `a` in reversed order.
 */
gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a) {
    gf r = a;

    r = ((r << 8) & 0xFF00) | ((r >> 8) & 0x00FF); /* swap bytes */
    r = ((r << 4) & 0xF0F0) | ((r >> 4) & 0x0F0F); /* swap nibbles */
    r = ((r << 2) & 0xCCCC) | ((r >> 2) & 0x3333); /* swap bit pairs */
    r = ((r << 1) & 0xAAAA) | ((r >> 1) & 0x5555); /* swap adjacent bits */

    return r >> 4;
}
/*
 * Reads 16 bytes from `in` as two little-endian 64-bit words and packs
 * them into a vec128: in[0..7] becomes the first (a0) argument of
 * vec128_set2x, in[8..15] the second (a1).
 */
vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in) {
    const uint64_t first = PQCLEAN_MCELIECE348864_AVX_load8(in);
    const uint64_t second = PQCLEAN_MCELIECE348864_AVX_load8(in + 8);

    return PQCLEAN_MCELIECE348864_AVX_vec128_set2x(first, second);
}
/*
 * Writes the vec128 `in` to out[0..15]: 64-bit element 0 goes to
 * out[0..7] and element 1 to out[8..15], each via store8.
 */
void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in) {
    /* the extract index must be a literal, hence two explicit calls */
    const uint64_t first = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 0);
    const uint64_t second = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 1);

    PQCLEAN_MCELIECE348864_AVX_store8(out + 0, first);
    PQCLEAN_MCELIECE348864_AVX_store8(out + 8, second);
}
@@ -0,0 +1,33 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_UTIL_H | |||
#define PQCLEAN_MCELIECE348864_AVX_UTIL_H | |||
/* | |||
This file is for loading/storing data in a little-endian fashion | |||
*/ | |||
#include "gf.h" | |||
#include "vec128.h" | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_AVX_store_i(unsigned char *out, uint64_t in, int i); | |||
void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a); | |||
uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src); | |||
uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src); | |||
void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in); | |||
void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in); | |||
uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in); | |||
gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in); | |||
void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in); | |||
#endif | |||
@@ -0,0 +1,25 @@ | |||
#include "vec.h" | |||
#include "params.h" | |||
extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *); | |||
extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(uint64_t *, const uint64_t *, const uint64_t *); | |||
/*
 * C entry point for the assembly multiplication routine: forwards
 * (h, f, g) unchanged to PQCLEAN_MCELIECE348864_AVX_vec_mul_asm.
 * NOTE(review): presumably h = f * g on bitsliced field elements of
 * GFBITS words each -- the assembly is not visible here, confirm.
 */
void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) {
    PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(h, f, g);
}
/*
 * C entry point for the "_sp" assembly multiplication routine: forwards
 * (h, f, g) unchanged to PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm.
 * NOTE(review): how this variant differs from vec_mul is defined by the
 * assembly, which is not visible here -- confirm against the .S file.
 */
void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g) {
    PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(h, f, g);
}
void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { | |||
int b; | |||
for (b = 0; b < GFBITS; b++) { | |||
h[b] = f[b] ^ g[b]; | |||
} | |||
} | |||
@@ -0,0 +1,13 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_VEC_H | |||
#define PQCLEAN_MCELIECE348864_AVX_VEC_H | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); | |||
void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g); | |||
void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); | |||
#endif |
@@ -0,0 +1,83 @@ | |||
#include "vec128.h" | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a) { | |||
return _mm_set1_epi16(a); | |||
} | |||
/*
 * Returns the result of _mm_testz_si128(a, a): 1 when all 128 bits of `a`
 * are zero, 0 otherwise.
 */
int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a) {
    const int all_zero = _mm_testz_si128(a, a);

    return all_zero;
}
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void) { | |||
return _mm_setzero_si128(); | |||
} | |||
/* Returns the bitwise AND of `a` and `b` (_mm_and_si128). */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b) {
    const vec128 r = _mm_and_si128(a, b);

    return r;
}
/* Returns the bitwise XOR of `a` and `b` (_mm_xor_si128). */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b) {
    const vec128 r = _mm_xor_si128(a, b);

    return r;
}
/* Returns the bitwise OR of `a` and `b` (_mm_or_si128). */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b) {
    const vec128 r = _mm_or_si128(a, b);

    return r;
}
/* Shifts each of the two 64-bit lanes of `a` left by `s` bits (_mm_slli_epi64). */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s) {
    const vec128 shifted = _mm_slli_epi64(a, s);

    return shifted;
}
/*
 * Logically shifts each of the two 64-bit lanes of `a` right by `s` bits
 * (_mm_srli_epi64).
 */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s) {
    const vec128 shifted = _mm_srli_epi64(a, s);

    return shifted;
}
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { | |||
return _mm_set_epi64x(a1, a0); | |||
} | |||
/*
 * Returns the vector whose low lane is the low 64 bits of `a` and whose
 * high lane is the low 64 bits of `b` (_mm_unpacklo_epi64).
 */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b) {
    const vec128 lows = _mm_unpacklo_epi64(a, b);

    return lows;
}
/*
 * Returns the vector whose low lane is the high 64 bits of `a` and whose
 * high lane is the high 64 bits of `b` (_mm_unpackhi_epi64).
 */
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b) {
    const vec128 highs = _mm_unpackhi_epi64(a, b);

    return highs;
}
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a) { | |||
return _mm_set1_epi64x(-a); | |||
} | |||
/* Copies GFBITS vec128 elements from `src` to `dest`, in index order. */
void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src) {
    int k = 0;

    while (k < GFBITS) {
        dest[k] = src[k];
        k++;
    }
}
/*
 * Element-wise XOR of two GFBITS-element vec128 arrays:
 * c[k] = a[k] ^ b[k] for k in [0, GFBITS).
 */
void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) {
    int k = 0;

    while (k < GFBITS) {
        c[k] = PQCLEAN_MCELIECE348864_AVX_vec128_xor(a[k], b[k]);
        k++;
    }
}
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a) { | |||
int i; | |||
vec128 ret; | |||
ret = a[0]; | |||
for (i = 1; i < GFBITS; i++) { | |||
ret = PQCLEAN_MCELIECE348864_AVX_vec128_or(ret, a[i]); | |||
} | |||
return ret; | |||
} | |||
/* bitsliced field multiplications */ | |||
/*
 * C entry point for the assembly multiplication routine: forwards
 * (h, f, g) to PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm with a fourth
 * argument of 16.
 * NOTE(review): 16 is presumably the byte stride between consecutive
 * vec128 elements -- the assembly is not visible here, confirm.
 */
void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) {
    PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(h, f, g, 16);
}
@@ -0,0 +1,41 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_VEC128_H | |||
#define PQCLEAN_MCELIECE348864_AVX_VEC128_H | |||
/* | |||
This file is for functions related to 128-bit vectors | |||
including functions for bitsliced field operations | |||
*/ | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
typedef __m128i vec128; | |||
// this needs to be a macro, because | |||
// _mm_extract_epi64 requires a literal int argument. | |||
#define PQCLEAN_MCELIECE348864_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) | |||
int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a); | |||
void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src); | |||
void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); | |||
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a); | |||
extern void PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); | |||
/* bitsliced field multiplications */ | |||
void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); | |||
#endif |
@@ -0,0 +1,137 @@ | |||
/* | |||
This file is for functions related to 256-bit vectors | |||
including functions for bitsliced field operations | |||
*/ | |||
#include "vec256.h" | |||
extern void PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm(vec256 *, vec256 *, const vec256 *); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(uint16_t a) { | |||
return _mm256_set1_epi16(a); | |||
} | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_setzero(void) { | |||
return _mm256_setzero_si256(); | |||
} | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3) { | |||
return _mm256_set_epi64x(a3, a2, a1, a0); | |||
} | |||
/*
 * Returns the result of _mm256_testz_si256(a, a): 1 when all 256 bits of
 * `a` are zero, 0 otherwise.
 */
int PQCLEAN_MCELIECE348864_AVX_vec256_testz(vec256 a) {
    const int all_zero = _mm256_testz_si256(a, a);

    return all_zero;
}
/* Returns the bitwise AND of `a` and `b` (_mm256_and_si256). */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_and(vec256 a, vec256 b) {
    const vec256 r = _mm256_and_si256(a, b);

    return r;
}
/* Returns the bitwise XOR of `a` and `b` (_mm256_xor_si256). */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_xor(vec256 a, vec256 b) {
    const vec256 r = _mm256_xor_si256(a, b);

    return r;
}
/* Returns the bitwise OR of `a` and `b` (_mm256_or_si256). */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or(vec256 a, vec256 b) {
    const vec256 r = _mm256_or_si256(a, b);

    return r;
}
/* Shifts each of the four 64-bit lanes of `a` left by `s` bits (_mm256_slli_epi64). */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_sll_4x(vec256 a, int s) {
    const vec256 shifted = _mm256_slli_epi64(a, s);

    return shifted;
}
/*
 * Logically shifts each of the four 64-bit lanes of `a` right by `s` bits
 * (_mm256_srli_epi64).
 */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_srl_4x(vec256 a, int s) {
    const vec256 shifted = _mm256_srli_epi64(a, s);

    return shifted;
}
/*
 * Returns the vector whose low 128 bits are the low half of `a` and whose
 * high 128 bits are the low half of `b`
 * (_mm256_permute2x128_si256 with selector 0x20).
 */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(vec256 a, vec256 b) {
    const vec256 lows = _mm256_permute2x128_si256 (a, b, 0x20);

    return lows;
}
/*
 * Returns the vector whose low 128 bits are the high half of `a` and whose
 * high 128 bits are the high half of `b`
 * (_mm256_permute2x128_si256 with selector 0x31).
 */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(vec256 a, vec256 b) {
    const vec256 highs = _mm256_permute2x128_si256 (a, b, 0x31);

    return highs;
}
/*
 * Within each 128-bit half, pairs the low 64-bit element of `a` with the
 * low 64-bit element of `b` (_mm256_unpacklo_epi64).
 */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(vec256 a, vec256 b) {
    const vec256 lows = _mm256_unpacklo_epi64 (a, b);

    return lows;
}
/*
 * Within each 128-bit half, pairs the high 64-bit element of `a` with the
 * high 64-bit element of `b` (_mm256_unpackhi_epi64).
 */
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(vec256 a, vec256 b) {
    const vec256 highs = _mm256_unpackhi_epi64 (a, b);

    return highs;
}
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(const vec256 *a) { | |||
int i; | |||
vec256 ret; | |||
ret = a[0]; | |||
for (i = 1; i < GFBITS; i++) { | |||
ret = PQCLEAN_MCELIECE348864_AVX_vec256_or(ret, a[i]); | |||
} | |||
return ret; | |||
} | |||
/* Copies GFBITS vec256 elements from `src` to `dest`, in index order. */
void PQCLEAN_MCELIECE348864_AVX_vec256_copy(vec256 *dest, const vec256 *src) {
    int k = 0;

    while (k < GFBITS) {
        dest[k] = src[k];
        k++;
    }
}
/*
 * C entry point for the assembly multiplication routine: forwards
 * (h, f, g) unchanged to PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm.
 * vec256_inv in this file calls it with h == f, so the assembly is
 * expected to tolerate that aliasing.
 * NOTE(review): presumably h = f * g on bitsliced field elements of
 * GFBITS vectors each -- the assembly is not visible here, confirm.
 */
void PQCLEAN_MCELIECE348864_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g) {
    PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm(h, f, g);
}
/* bitsliced field squarings */ | |||
/*
 * Bitsliced field squaring: out = in^2, where `in` and `out` each hold
 * GFBITS vec256 bit-planes.
 *
 * Every output plane is a fixed XOR combination of input planes. The
 * result is assembled in a temporary array and copied out at the end, so
 * `out` may be the same array as `in` (vec256_inv relies on this).
 *
 * The XORs go through PQCLEAN_MCELIECE348864_AVX_vec256_xor rather than the
 * `^` operator: applying `^` to __m256i is a GNU vector extension and does
 * not compile where __m256i is not a native vector type.
 */
void PQCLEAN_MCELIECE348864_AVX_vec256_sq(vec256 *out, const vec256 *in) {
    int i;
    vec256 result[GFBITS];

    result[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(in[0], in[6]);
    result[1] = in[11];
    result[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(in[1], in[7]);
    result[3] = in[6];
    result[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(
                    PQCLEAN_MCELIECE348864_AVX_vec256_xor(in[2], in[11]), in[8]);
    result[5] = in[7];
    result[6] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(in[3], in[9]);
    result[7] = in[8];
    result[8] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(in[4], in[10]);
    result[9] = in[9];
    result[10] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(in[5], in[11]);
    result[11] = in[10];

    for (i = 0; i < GFBITS; i++) {
        out[i] = result[i];
    }
}
/* bitsliced field inverses */ | |||
/*
 * Bitsliced field inversion: raises `in` to the power 111111111110b
 * (2^12 - 2) with a fixed chain of squarings and multiplications.
 * The trailing comments give the exponent reached so far, in binary.
 */
void PQCLEAN_MCELIECE348864_AVX_vec256_inv(vec256 *out, const vec256 *in) {
    vec256 pow_11[ GFBITS ];   /* in ^ 11b   */
    vec256 pow_1111[ GFBITS ]; /* in ^ 1111b */
    int k;

    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, in);                 // ^10
    PQCLEAN_MCELIECE348864_AVX_vec256_mul(pow_11, out, in);        // ^11

    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, pow_11);             // ^110
    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);                // ^1100
    PQCLEAN_MCELIECE348864_AVX_vec256_mul(pow_1111, out, pow_11);  // ^1111

    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, pow_1111);           // ^11110
    for (k = 0; k < 3; k++) {
        PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
    }                                                              // ^11110000
    PQCLEAN_MCELIECE348864_AVX_vec256_mul(out, out, pow_1111);     // ^11111111

    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);                // ^1111111100
    PQCLEAN_MCELIECE348864_AVX_vec256_mul(out, out, pow_11);       // ^1111111111

    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);                // ^11111111110
    PQCLEAN_MCELIECE348864_AVX_vec256_mul(out, out, in);           // ^11111111111

    PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);                // ^111111111110
}
@@ -0,0 +1,45 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_AVX_VEC256_H | |||
#define PQCLEAN_MCELIECE348864_AVX_VEC256_H | |||
/* | |||
This file is for functions related to 256-bit vectors | |||
including functions for bitsliced field operations | |||
*/ | |||
#include "vec128.h" | |||
#include <immintrin.h> | |||
typedef __m256i vec256; | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(uint16_t a); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_setzero(void); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); | |||
// Extract requires a literal argument so need to be macros | |||
#define PQCLEAN_MCELIECE348864_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) | |||
#define PQCLEAN_MCELIECE348864_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) | |||
int PQCLEAN_MCELIECE348864_AVX_vec256_testz(vec256 a); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_and(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_xor(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_sll_4x(vec256 a, int s); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_srl_4x(vec256 a, int s); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); | |||
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(const vec256 *a); | |||
void PQCLEAN_MCELIECE348864_AVX_vec256_copy(vec256 *dest, const vec256 *src); | |||
/* bitsliced field multiplications */ | |||
void PQCLEAN_MCELIECE348864_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); | |||
void PQCLEAN_MCELIECE348864_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); | |||
void PQCLEAN_MCELIECE348864_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); | |||
extern void PQCLEAN_MCELIECE348864_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); | |||
extern void PQCLEAN_MCELIECE348864_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); | |||
#endif | |||
@@ -0,0 +1,356 @@ | |||
# vec_reduce_asm: fold twelve 64-bit words into a 12-bit parity word.
#
# Reads the 64-bit words at input_0 + 88, + 80, ..., + 0 (input_0 arrives in
# %rdi). For each word, popcnt counts its set bits; the low bit of that count
# (the parity of the word) is shifted into r (%rax) from the right. The word at
# offset 88 is handled first and therefore ends up in bit 11 of the result, the
# word at offset 0 ends up in bit 0. The result is returned in %rax.
#
# Registers written: %rax, %rsi, %rdi, %r11 and the flags; %rsp is adjusted and
# restored. No memory is written. The routine uses the popcnt instruction.
#
# The lines below are qhasm variable declarations; of these only input_0, t, c
# and r are used by this routine.
# qhasm: int64 input_0 | |||
# qhasm: int64 input_1 | |||
# qhasm: int64 input_2 | |||
# qhasm: int64 input_3 | |||
# qhasm: int64 input_4 | |||
# qhasm: int64 input_5 | |||
# qhasm: stack64 input_6 | |||
# qhasm: stack64 input_7 | |||
# qhasm: int64 caller_r11 | |||
# qhasm: int64 caller_r12 | |||
# qhasm: int64 caller_r13 | |||
# qhasm: int64 caller_r14 | |||
# qhasm: int64 caller_r15 | |||
# qhasm: int64 caller_rbx | |||
# qhasm: int64 caller_rbp | |||
# qhasm: int64 t | |||
# qhasm: int64 c | |||
# qhasm: int64 r | |||
# qhasm: enter vec_reduce_asm | |||
.p2align 5 | |||
# The entry point is exported both with and without a leading underscore
# (presumably so the same file links on platforms that prefix C symbols with
# an underscore, such as macOS -- confirm against the build setup).
.global _PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm | |||
.global PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm | |||
_PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm: | |||
PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm: | |||
# Prologue: round %rsp down to a multiple of 32 and keep the adjustment in
# %r11 so the epilogue can undo it. No extra stack space is reserved (add $0).
mov %rsp,%r11 | |||
and $31,%r11 | |||
add $0,%r11 | |||
sub %r11,%rsp | |||
# qhasm: r = 0 | |||
# asm 1: mov $0,>r=int64#7 | |||
# asm 2: mov $0,>r=%rax | |||
mov $0,%rax | |||
# Word at offset 88: its parity becomes bit 11 of the result.
# qhasm: t = mem64[ input_0 + 88 ] | |||
# asm 1: movq 88(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 88(<input_0=%rdi),>t=%rsi | |||
movq 88(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 80 (same load / popcnt / mask / shift / or pattern).
# qhasm: t = mem64[ input_0 + 80 ] | |||
# asm 1: movq 80(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 80(<input_0=%rdi),>t=%rsi | |||
movq 80(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 72.
# qhasm: t = mem64[ input_0 + 72 ] | |||
# asm 1: movq 72(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 72(<input_0=%rdi),>t=%rsi | |||
movq 72(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 64.
# qhasm: t = mem64[ input_0 + 64 ] | |||
# asm 1: movq 64(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 64(<input_0=%rdi),>t=%rsi | |||
movq 64(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 56.
# qhasm: t = mem64[ input_0 + 56 ] | |||
# asm 1: movq 56(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 56(<input_0=%rdi),>t=%rsi | |||
movq 56(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 48.
# qhasm: t = mem64[ input_0 + 48 ] | |||
# asm 1: movq 48(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 48(<input_0=%rdi),>t=%rsi | |||
movq 48(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 40.
# qhasm: t = mem64[ input_0 + 40 ] | |||
# asm 1: movq 40(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 40(<input_0=%rdi),>t=%rsi | |||
movq 40(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 32.
# qhasm: t = mem64[ input_0 + 32 ] | |||
# asm 1: movq 32(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 32(<input_0=%rdi),>t=%rsi | |||
movq 32(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 24.
# qhasm: t = mem64[ input_0 + 24 ] | |||
# asm 1: movq 24(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 24(<input_0=%rdi),>t=%rsi | |||
movq 24(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 16.
# qhasm: t = mem64[ input_0 + 16 ] | |||
# asm 1: movq 16(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 16(<input_0=%rdi),>t=%rsi | |||
movq 16(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Word at offset 8.
# qhasm: t = mem64[ input_0 + 8 ] | |||
# asm 1: movq 8(<input_0=int64#1),>t=int64#2 | |||
# asm 2: movq 8(<input_0=%rdi),>t=%rsi | |||
movq 8(%rdi),%rsi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#2, >c=int64#2 | |||
# asm 2: popcnt <t=%rsi, >c=%rsi | |||
popcnt %rsi, %rsi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#2d | |||
# asm 2: and $1,<c=%esi | |||
and $1,%esi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#2,<r=int64#7 | |||
# asm 2: or <c=%rsi,<r=%rax | |||
or %rsi,%rax | |||
# Last word (offset 0): input_0 is not needed afterwards, so %rdi itself is
# reused for t and c. Its parity becomes bit 0 of the result.
# qhasm: t = mem64[ input_0 + 0 ] | |||
# asm 1: movq 0(<input_0=int64#1),>t=int64#1 | |||
# asm 2: movq 0(<input_0=%rdi),>t=%rdi | |||
movq 0(%rdi),%rdi | |||
# qhasm: c = count(t) | |||
# asm 1: popcnt <t=int64#1, >c=int64#1 | |||
# asm 2: popcnt <t=%rdi, >c=%rdi | |||
popcnt %rdi, %rdi | |||
# qhasm: (uint32) c &= 1 | |||
# asm 1: and $1,<c=int64#1d | |||
# asm 2: and $1,<c=%edi | |||
and $1,%edi | |||
# qhasm: r <<= 1 | |||
# asm 1: shl $1,<r=int64#7 | |||
# asm 2: shl $1,<r=%rax | |||
shl $1,%rax | |||
# qhasm: r |= c | |||
# asm 1: or <c=int64#1,<r=int64#7 | |||
# asm 2: or <c=%rdi,<r=%rax | |||
or %rdi,%rax | |||
# Epilogue: undo the stack alignment; the result r is already in %rax.
# qhasm: return r | |||
add %r11,%rsp | |||
ret |
@@ -0,0 +1,16 @@ | |||
Public Domain. | |||
Authors of Classic McEliece in alphabetical order: | |||
Daniel J. Bernstein, University of Illinois at Chicago | |||
Tung Chou, Osaka University | |||
Tanja Lange, Technische Universiteit Eindhoven | |||
Ingo von Maurich, self | |||
Rafael Misoczki, Intel Corporation | |||
Ruben Niederhagen, Fraunhofer SIT | |||
Edoardo Persichetti, Florida Atlantic University | |||
Christiane Peters, self | |||
Peter Schwabe, Radboud University | |||
Nicolas Sendrier, Inria | |||
Jakub Szefer, Yale University | |||
Wen Wang, Yale University |
@@ -0,0 +1,27 @@ | |||
# This Makefile can be used with GNU Make or BSD Make

# Static library produced by this directory.
LIB = libmceliece348864_clean.a

# Translation units that make up the library, one per line.
SOURCES = \
	aes256ctr.c \
	benes.c \
	bm.c \
	controlbits.c \
	decrypt.c \
	encrypt.c \
	gf.c \
	operations.c \
	pk_gen.c \
	root.c \
	sk_gen.c \
	synd.c \
	transpose.c \
	util.c

# Local headers; every object is rebuilt when any of them changes.
HEADERS = \
	aes256ctr.h \
	api.h \
	benes.h \
	bm.h \
	controlbits.h \
	crypto_hash.h \
	decrypt.h \
	encrypt.h \
	gf.h \
	params.h \
	pk_gen.h \
	root.h \
	sk_gen.h \
	synd.h \
	transpose.h \
	util.h

# One object per source file (same list as SOURCES with .c replaced by .o).
OBJECTS = $(SOURCES:.c=.o)

# Strict warnings are treated as errors; EXTRAFLAGS lets the caller append
# further options.
CFLAGS = -O3 -std=c99 -Wall -Wextra -pedantic -Werror -Wpedantic \
	-Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \
	-I../../../common/ $(EXTRAFLAGS)

all: $(LIB)

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)
@@ -0,0 +1,24 @@ | |||
# This Makefile can be used with Microsoft Visual Studio's nmake
# Static library produced by this directory.
LIBRARY = libmceliece348864_clean.lib

# Objects that go into the library, one per line.
OBJECTS = \
	aes256ctr.obj \
	benes.obj \
	bm.obj \
	controlbits.obj \
	decrypt.obj \
	encrypt.obj \
	gf.obj \
	operations.obj \
	pk_gen.obj \
	root.obj \
	sk_gen.obj \
	synd.obj \
	transpose.obj \
	util.obj

# C4146 fires when unary minus is applied to an unsigned value. That is
# well-defined, portable C and is used on purpose throughout this code
# (notably for constant-time masks), so the warning is switched off while
# all other level-4 warnings stay fatal (/WX).
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /wd4146 /WX

all: $(LIBRARY)

# Rebuild every object whenever any header changes.
$(OBJECTS): *.h

# $** expands to all dependents of the target, i.e. the object files.
$(LIBRARY): $(OBJECTS)
	LIB.EXE /NOLOGO /WX /OUT:$@ $**

# The leading '-' makes nmake ignore errors from DEL (e.g. missing files).
clean:
	-DEL $(OBJECTS)
	-DEL $(LIBRARY)
@@ -0,0 +1,13 @@ | |||
#include "aes256ctr.h" | |||
void PQCLEAN_MCELIECE348864_CLEAN_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]) { | |||
aes256ctx state; | |||
aes256_keyexp(&state, key); | |||
aes256_ctr(out, outlen, nonce, &state); | |||
aes256_ctx_release(&state); | |||
} |
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_AES256CTR_H
#define PQCLEAN_MCELIECE348864_CLEAN_AES256CTR_H

#include <stddef.h>
#include <stdint.h>

#include "aes.h"

/*
 * One-shot AES-256-CTR helper: `nonce` is AESCTR_NONCEBYTES bytes long and
 * `key` is AES256_KEYBYTES bytes long; `out` / `outlen` describe the output
 * buffer.
 */
void PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(uint8_t *out,
        size_t outlen,
        const uint8_t nonce[AESCTR_NONCEBYTES],
        const uint8_t key[AES256_KEYBYTES]);

#endif
@@ -0,0 +1,32 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_API_H
#define PQCLEAN_MCELIECE348864_CLEAN_API_H

#include <stdint.h>

/* Human-readable name of the scheme. */
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_ALGNAME "Classic McEliece 348864"

/* Lengths, in bytes, of the public key, secret key, ciphertext and shared secret. */
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_PUBLICKEYBYTES 261120
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_SECRETKEYBYTES 6452
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_CIPHERTEXTBYTES 128
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_BYTES 32

/*
 * KEM entry points. The definitions are not part of this header; buffer sizes
 * are presumably the ones given by the macros above -- confirm against the
 * implementation.
 */
int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *key, const uint8_t *pk);

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec(uint8_t *key, const uint8_t *c, const uint8_t *sk);

#endif
@@ -0,0 +1,139 @@ | |||
/* | |||
This file is for Benes network related functions | |||
*/ | |||
#include "benes.h" | |||
#include "params.h" | |||
#include "transpose.h" | |||
#include "util.h" | |||
/*
 * One layer of the Benes network.
 *
 * The 64 words of `data` are paired up as (k, k + 2^lgs). For each of the 32
 * pairs the next word of `bits` is consumed as a mask: bit positions where
 * the mask is 1 are exchanged between the two words, the others stay put.
 */
static void layer(uint64_t *data, uint64_t *bits, int lgs) {
    const int stride = 1 << lgs;
    const uint64_t *mask = bits;
    int base, k;

    for (base = 0; base < 64; base += 2 * stride) {
        for (k = 0; k < stride; k++) {
            uint64_t *lo = &data[base + k];
            uint64_t *hi = lo + stride;
            /* XOR-swap restricted to the bit positions selected by the mask */
            const uint64_t diff = (*lo ^ *hi) & *mask++;

            *lo ^= diff;
            *hi ^= diff;
        }
    }
}
/* input: r, sequence of bits to be permuted */ | |||
/* bits, condition bits of the Benes network */ | |||
/* rev, 0 for normal application; !0 for inverse */ | |||
/* output: r, permuted bits */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { | |||
int i; | |||
const unsigned char *cond_ptr; | |||
int inc, low; | |||
uint64_t bs[64]; | |||
uint64_t cond[64]; | |||
// | |||
for (i = 0; i < 64; i++) { | |||
bs[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(r + i * 8); | |||
} | |||
if (rev == 0) { | |||
inc = 256; | |||
cond_ptr = bits; | |||
} else { | |||
inc = -256; | |||
cond_ptr = bits + (2 * GFBITS - 2) * 256; | |||
} | |||
// | |||
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); | |||
for (low = 0; low <= 5; low++) { | |||
for (i = 0; i < 64; i++) { | |||
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4); | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond); | |||
layer(bs, cond, low); | |||
cond_ptr += inc; | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); | |||
for (low = 0; low <= 5; low++) { | |||
for (i = 0; i < 32; i++) { | |||
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8); | |||
} | |||
layer(bs, cond, low); | |||
cond_ptr += inc; | |||
} | |||
for (low = 4; low >= 0; low--) { | |||
for (i = 0; i < 32; i++) { | |||
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8); | |||
} | |||
layer(bs, cond, low); | |||
cond_ptr += inc; | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); | |||
for (low = 5; low >= 0; low--) { | |||
for (i = 0; i < 64; i++) { | |||
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4); | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond); | |||
layer(bs, cond, low); | |||
cond_ptr += inc; | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); | |||
for (i = 0; i < 64; i++) { | |||
PQCLEAN_MCELIECE348864_CLEAN_store8(r + i * 8, bs[i]); | |||
} | |||
} | |||
/* input: condition bits c */ | |||
/* output: support s */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf *s, const unsigned char *c) { | |||
gf a; | |||
int i, j; | |||
unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; | |||
for (i = 0; i < GFBITS; i++) { | |||
for (j = 0; j < (1 << GFBITS) / 8; j++) { | |||
L[i][j] = 0; | |||
} | |||
} | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
a = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf) i); | |||
for (j = 0; j < GFBITS; j++) { | |||
L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); | |||
} | |||
} | |||
for (j = 0; j < GFBITS; j++) { | |||
PQCLEAN_MCELIECE348864_CLEAN_apply_benes(L[j], c, 0); | |||
} | |||
for (i = 0; i < SYS_N; i++) { | |||
s[i] = 0; | |||
for (j = GFBITS - 1; j >= 0; j--) { | |||
s[i] <<= 1; | |||
s[i] |= (L[j][i / 8] >> (i % 8)) & 1; | |||
} | |||
} | |||
} | |||
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_BENES_H
#define PQCLEAN_MCELIECE348864_CLEAN_BENES_H
/*
  Declarations for the Benes network related functions
*/

#include "gf.h"

/* Permutes the bits in r in place using the condition bits; rev != 0 applies the inverse. */
void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev);

/* Derives the support s from the condition bits c. */
void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf *s, const unsigned char *c);

#endif
@@ -0,0 +1,83 @@ | |||
/* | |||
This file is for the Berlekamp-Massey algorithm | |||
see http://crypto.stanford.edu/~mironov/cs359/massey.pdf | |||
*/ | |||
#include "bm.h" | |||
#include "params.h" | |||
#define min(a, b) (((a) < (b)) ? (a) : (b)) | |||
/* the Berlekamp-Massey algorithm */ | |||
/* input: s, sequence of field elements */ | |||
/* output: out, minimal polynomial of s */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_bm(gf *out, gf *s) { | |||
int i; | |||
uint16_t N = 0; | |||
uint16_t L = 0; | |||
uint16_t mle; | |||
uint16_t mne; | |||
gf T[ SYS_T + 1 ]; | |||
gf C[ SYS_T + 1 ]; | |||
gf B[ SYS_T + 1 ]; | |||
gf b = 1, d, f; | |||
// | |||
for (i = 0; i < SYS_T + 1; i++) { | |||
C[i] = B[i] = 0; | |||
} | |||
B[1] = C[0] = 1; | |||
// | |||
for (N = 0; N < 2 * SYS_T; N++) { | |||
d = 0; | |||
for (i = 0; i <= min(N, SYS_T); i++) { | |||
d ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(C[i], s[ N - i]); | |||
} | |||
mne = d; | |||
mne -= 1; | |||
mne >>= 15; | |||
mne -= 1; | |||
mle = N; | |||
mle -= 2 * L; | |||
mle >>= 15; | |||
mle -= 1; | |||
mle &= mne; | |||
for (i = 0; i <= SYS_T; i++) { | |||
T[i] = C[i]; | |||
} | |||
f = PQCLEAN_MCELIECE348864_CLEAN_gf_frac(b, d); | |||
for (i = 0; i <= SYS_T; i++) { | |||
C[i] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(f, B[i]) & mne; | |||
} | |||
L = (L & ~mle) | ((N + 1 - L) & mle); | |||
for (i = 0; i <= SYS_T; i++) { | |||
B[i] = (B[i] & ~mle) | (T[i] & mle); | |||
} | |||
b = (b & ~mle) | (d & mle); | |||
for (i = SYS_T; i >= 1; i--) { | |||
B[i] = B[i - 1]; | |||
} | |||
B[0] = 0; | |||
} | |||
for (i = 0; i <= SYS_T; i++) { | |||
out[i] = C[ SYS_T - i ]; | |||
} | |||
} | |||
@@ -0,0 +1,13 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_BM_H
#define PQCLEAN_MCELIECE348864_CLEAN_BM_H
/*
  Declaration of the Berlekamp-Massey algorithm
  see http://crypto.stanford.edu/~mironov/cs359/massey.pdf
*/

#include "gf.h"

/* Writes to out the minimal polynomial of the sequence s. */
void PQCLEAN_MCELIECE348864_CLEAN_bm(gf *out, gf *s);

#endif
@@ -0,0 +1,274 @@ | |||
/* | |||
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation | |||
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf | |||
*/ | |||
#include "controlbits.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
typedef uint8_t bit;

#define N (1 << GFBITS)

/*
 * Branch-free "a < b": returns the top bit of (a - b) mod 2^32. This is the
 * expected comparison result only while the operands differ by less than
 * 2^31 (e.g. when both are below 2^31).
 */
static bit is_smaller(uint32_t a, uint32_t b) {
    uint32_t diff = a - b;

    return (bit)(diff >> 31);
}

/* 64-bit counterpart of is_smaller: top bit of (a - b) mod 2^64. */
static bit is_smaller_63b(uint64_t a, uint64_t b) {
    uint64_t diff = a - b;

    return (bit)(diff >> 63);
}

/* Exchanges *x and *y when swap is 1, leaves them alone when it is 0; no branch. */
static void cswap(uint32_t *x, uint32_t *y, bit swap) {
    const uint32_t all = 0 - (uint32_t)swap;   /* 0 or 0xFFFFFFFF */
    const uint32_t delta = (*x ^ *y) & all;

    *x ^= delta;
    *y ^= delta;
}

/* 64-bit counterpart of cswap. */
static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) {
    const uint64_t all = 0 - (uint64_t)swap;   /* 0 or all-ones */
    const uint64_t delta = (*x ^ *y) & all;

    *x ^= delta;
    *y ^= delta;
}

/* output x = min(input x,input y) */
/* output y = max(input x,input y) */
static void minmax(uint32_t *x, uint32_t *y) {
    cswap(x, y, is_smaller(*y, *x));
}

/* 64-bit counterpart of minmax. */
static void minmax_63b(uint64_t *x, uint64_t *y) {
    cswap_63b(x, y, is_smaller_63b(*y, *x));
}

/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */
/* requires n to be a power of 2 */
static void merge(int n, uint32_t *x, int step) {
    int k;

    if (n == 1) {
        minmax(&x[0], &x[step]);
        return;
    }

    /* merge the even- and odd-indexed subsequences, then fix up neighbours */
    merge(n / 2, x, step * 2);
    merge(n / 2, x + step, step * 2);
    for (k = 1; k < 2 * n - 1; k += 2) {
        minmax(&x[k * step], &x[(k + 1) * step]);
    }
}

/* 64-bit counterpart of merge. */
static void merge_63b(int n, uint64_t *x, int step) {
    int k;

    if (n == 1) {
        minmax_63b(&x[0], &x[step]);
        return;
    }

    merge_63b(n / 2, x, step * 2);
    merge_63b(n / 2, x + step, step * 2);
    for (k = 1; k < 2 * n - 1; k += 2) {
        minmax_63b(&x[k * step], &x[(k + 1) * step]);
    }
}

/* sort x[0],x[1],...,x[n-1] in place */
/* requires n to be a power of 2 */
static void sort(int n, uint32_t *x) {
    const int half = n / 2;

    if (n > 1) {
        sort(half, x);
        sort(half, x + half);
        merge(half, x, 1);
    }
}

/* 64-bit counterpart of sort; same requirement on n. */
void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x) {
    const int half = n / 2;

    if (n > 1) {
        PQCLEAN_MCELIECE348864_CLEAN_sort_63b(half, x);
        PQCLEAN_MCELIECE348864_CLEAN_sort_63b(half, x + half);
        merge_63b(half, x, 1);
    }
}
/* y[pi[i]] = x[i] */ | |||
/* requires n = 2^w */ | |||
/* requires pi to be a permutation */ | |||
static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC | |||
int i; | |||
uint32_t t[2 * N]; | |||
for (i = 0; i < n; ++i) { | |||
t[i] = x[i] | (pi[i] << 16); | |||
} | |||
sort(n, t); | |||
for (i = 0; i < n; ++i) { | |||
y[i] = t[i] & 0xFFFF; | |||
} | |||
} | |||
/* ip = inverse of pi, i.e. ip[pi[i]] = i */
/* requires n = 2^w */
/* requires pi to be a permutation */
static void invert(int n, uint32_t *ip, const uint32_t *pi) {
    int k;

    /* start from the identity and let composeinv scatter it through pi */
    for (k = 0; k < n; k++) {
        ip[k] = k;
    }

    composeinv(n, ip, ip, pi);
}
static void flow(int w, uint32_t *x, const uint32_t *y, int t) { | |||
bit m0; | |||
bit m1; | |||
uint32_t b; | |||
uint32_t y_copy = *y; | |||
m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); | |||
m1 = is_smaller(0, t); | |||
cswap(x, &y_copy, m0); | |||
b = m0 & m1; | |||
*x ^= b << w; | |||
} | |||
/* input: permutation pi */ | |||
/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ | |||
/* requires n = 2^w */ | |||
static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { | |||
int i; | |||
int j; | |||
int k; | |||
int t; | |||
uint32_t ip[N] = {0}; | |||
uint32_t I[2 * N] = {0}; | |||
uint32_t P[2 * N] = {0}; | |||
uint32_t PI[2 * N] = {0}; | |||
uint32_t T[2 * N] = {0}; | |||
uint32_t piflip[N] = {0}; | |||
uint32_t subpi[2][N / 2] = {{0}}; | |||
if (w == 1) { | |||
c[ off / 8 ] |= (pi[0] & 1) << (off % 8); | |||
} | |||
if (w <= 1) { | |||
return; | |||
} | |||
invert(n, ip, pi); | |||
for (i = 0; i < n; ++i) { | |||
I[i] = ip[i] | (1 << w); | |||
I[n + i] = pi[i]; | |||
} | |||
for (i = 0; i < 2 * n; ++i) { | |||
P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); | |||
} | |||
for (t = 0; t < w; ++t) { | |||
composeinv(2 * n, PI, P, I); | |||
for (i = 0; i < 2 * n; ++i) { | |||
flow(w, &P[i], &PI[i], t); | |||
} | |||
for (i = 0; i < 2 * n; ++i) { | |||
T[i] = I[i ^ 1]; | |||
} | |||
composeinv(2 * n, I, I, T); | |||
for (i = 0; i < 2 * n; ++i) { | |||
T[i] = P[i ^ 1]; | |||
} | |||
for (i = 0; i < 2 * n; ++i) { | |||
flow(w, &P[i], &T[i], 1); | |||
} | |||
} | |||
for (i = 0; i < n; ++i) { | |||
for (j = 0; j < w; ++j) { | |||
piflip[i] = pi[i]; | |||
} | |||
} | |||
for (i = 0; i < n / 2; ++i) { | |||
c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); | |||
} | |||
for (i = 0; i < n / 2; ++i) { | |||
c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); | |||
} | |||
for (i = 0; i < n / 2; ++i) { | |||
cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); | |||
} | |||
for (k = 0; k < 2; ++k) { | |||
for (i = 0; i < n / 2; ++i) { | |||
subpi[k][i] = piflip[i * 2 + k] >> 1; | |||
} | |||
} | |||
for (k = 0; k < 2; ++k) { | |||
controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); | |||
} | |||
} | |||
/* input: pi, a permutation*/ | |||
/* output: out, control bits w.r.t. pi */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { | |||
unsigned int i; | |||
unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; | |||
for (i = 0; i < sizeof(c); i++) { | |||
c[i] = 0; | |||
} | |||
controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); | |||
for (i = 0; i < sizeof(c); i++) { | |||
out[i] = c[i]; | |||
} | |||
} | |||
@@ -0,0 +1,15 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H
#define PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H
/*
  Declarations of the functions that generate the control bits of the Benes
  network w.r.t. a random permutation
  see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
*/

#include <stdint.h>

/* Sorts x[0..n-1] in place. */
void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x);

/* Writes to out the control bits for the permutation pi. */
void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi);

#endif
@@ -0,0 +1,7 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H

#include "fips202.h"

/* 32-byte hash of in[0..inlen-1] into out, implemented with shake256. */
#define crypto_hash_32b(out,in,inlen) shake256((out), 32, (in), (inlen))

#endif
@@ -0,0 +1,90 @@ | |||
/* | |||
This file is for Niederreiter decryption | |||
*/ | |||
#include "decrypt.h" | |||
#include "benes.h" | |||
#include "bm.h" | |||
#include "gf.h" | |||
#include "params.h" | |||
#include "root.h" | |||
#include "synd.h" | |||
#include "util.h" | |||
/* Niederreiter decryption with the Berlekamp decoder */ | |||
/* intput: sk, secret key */ | |||
/* c, ciphertext */ | |||
/* output: e, error vector */ | |||
/* return: 0 for success; 1 for failure */ | |||
int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { | |||
int i, w = 0; | |||
uint16_t check; | |||
unsigned char r[ SYS_N / 8 ]; | |||
gf g[ SYS_T + 1 ]; | |||
gf L[ SYS_N ]; | |||
gf s[ SYS_T * 2 ]; | |||
gf s_cmp[ SYS_T * 2 ]; | |||
gf locator[ SYS_T + 1 ]; | |||
gf images[ SYS_N ]; | |||
gf t; | |||
// | |||
for (i = 0; i < SYND_BYTES; i++) { | |||
r[i] = c[i]; | |||
} | |||
for (i = SYND_BYTES; i < SYS_N / 8; i++) { | |||
r[i] = 0; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk); | |||
g[i] &= GFMASK; | |||
sk += 2; | |||
} | |||
g[ SYS_T ] = 1; | |||
PQCLEAN_MCELIECE348864_CLEAN_support_gen(L, sk); | |||
PQCLEAN_MCELIECE348864_CLEAN_synd(s, g, L, r); | |||
PQCLEAN_MCELIECE348864_CLEAN_bm(locator, s); | |||
PQCLEAN_MCELIECE348864_CLEAN_root(images, locator, L); | |||
// | |||
for (i = 0; i < SYS_N / 8; i++) { | |||
e[i] = 0; | |||
} | |||
for (i = 0; i < SYS_N; i++) { | |||
t = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(images[i]) & 1; | |||
e[ i / 8 ] |= t << (i % 8); | |||
w += t; | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_synd(s_cmp, g, L, e); | |||
// | |||
check = (uint16_t)w; | |||
check ^= SYS_T; | |||
for (i = 0; i < SYS_T * 2; i++) { | |||
check |= s[i] ^ s_cmp[i]; | |||
} | |||
check -= 1; | |||
check >>= 15; | |||
return check ^ 1; | |||
} | |||
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H
#define PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H
/*
  Declaration of the Niederreiter decryption function
*/

/* Recovers the error vector e from ciphertext c with secret key sk. */
int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c);

#endif
@@ -0,0 +1,138 @@ | |||
/* | |||
This file is for Niederreiter encryption | |||
*/ | |||
#include "encrypt.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "util.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
#include "gf.h" | |||
/* Branch-free equality test: returns 0xFF when x == y and 0x00 otherwise. */
static inline uint8_t same_mask(uint16_t x, uint16_t y) {
    const uint32_t diff = (uint32_t)(x ^ y);
    /* diff - 1 wraps to 0xFFFFFFFF only for diff == 0, so bit 31 flags equality */
    const uint32_t equal = (diff - 1) >> 31;

    return (uint8_t)(0 - equal);
}
/* output: e, an error vector of weight t */ | |||
static void gen_e(unsigned char *e) { | |||
size_t i, j; | |||
int eq, count; | |||
uint16_t ind_[ SYS_T * 2 ]; | |||
uint8_t *ind_8 = (uint8_t *)ind_; | |||
uint16_t ind[ SYS_T * 2 ]; | |||
uint8_t mask; | |||
unsigned char val[ SYS_T ]; | |||
while (1) { | |||
randombytes(ind_8, sizeof(ind_)); | |||
// Copy to uint16_t ind_ in a little-endian way | |||
for (i = 0; i < sizeof(ind_); i += 2) { | |||
ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; | |||
} | |||
for (i = 0; i < SYS_T * 2; i++) { | |||
ind_[i] &= GFMASK; | |||
} | |||
// moving and counting indices in the correct range | |||
count = 0; | |||
for (i = 0; i < SYS_T * 2; i++) { | |||
if (ind_[i] < SYS_N) { | |||
ind[ count++ ] = ind_[i]; | |||
} | |||
} | |||
if (count < SYS_T) { | |||
continue; | |||
} | |||
// check for repetition | |||
eq = 0; | |||
for (i = 1; i < SYS_T; i++) { | |||
for (j = 0; j < i; j++) { | |||
if (ind[i] == ind[j]) { | |||
eq = 1; | |||
} | |||
} | |||
} | |||
if (eq == 0) { | |||
break; | |||
} | |||
} | |||
for (j = 0; j < SYS_T; j++) { | |||
val[j] = 1 << (ind[j] & 7); | |||
} | |||
for (i = 0; i < SYS_N / 8; i++) { | |||
e[i] = 0; | |||
for (j = 0; j < SYS_T; j++) { | |||
mask = same_mask((uint16_t)i, (ind[j] >> 3)); | |||
e[i] |= val[j] & mask; | |||
} | |||
} | |||
} | |||
/* input: public key pk, error vector e */ | |||
/* output: syndrome s */ | |||
static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { | |||
unsigned char b, row[SYS_N / 8]; | |||
const unsigned char *pk_ptr = pk; | |||
int i, j; | |||
for (i = 0; i < SYND_BYTES; i++) { | |||
s[i] = 0; | |||
} | |||
for (i = 0; i < PK_NROWS; i++) { | |||
for (j = 0; j < SYS_N / 8; j++) { | |||
row[j] = 0; | |||
} | |||
for (j = 0; j < PK_ROW_BYTES; j++) { | |||
row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; | |||
} | |||
row[i / 8] |= 1 << (i % 8); | |||
b = 0; | |||
for (j = 0; j < SYS_N / 8; j++) { | |||
b ^= row[j] & e[j]; | |||
} | |||
b ^= b >> 4; | |||
b ^= b >> 2; | |||
b ^= b >> 1; | |||
b &= 1; | |||
s[ i / 8 ] |= (b << (i % 8)); | |||
pk_ptr += PK_ROW_BYTES; | |||
} | |||
} | |||
/* input: public key pk */
/* output: e, a freshly generated error vector; s, the syndrome of e under pk */
void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) {
    /* e has to exist before its syndrome can be computed */
    gen_e(e);

    syndrome(s, pk, e);
}
@@ -0,0 +1,11 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H
#define PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H
/*
  Declaration of the Niederreiter encryption function
*/

/* Generates an error vector e and writes its syndrome under pk to s. */
void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk);

#endif
@@ -0,0 +1,139 @@ | |||
/* | |||
This file is for functions for field arithmetic | |||
*/ | |||
#include "gf.h" | |||
#include "params.h" | |||
/* Branch-free zero test. */
/* return: 0x1FFF when a == 0 (0 - 1 wraps to all-ones in 32 bits), and 0 for */
/*         every non-zero a up to 2^19 (gf is presumably a 16-bit type --     */
/*         confirm in gf.h)                                                   */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a) {
    const uint32_t wide = a;

    return (gf) ((wide - 1) >> 19);
}
/* input: field elements in0, in1 */
/* return: in0 + in1, which is a bitwise XOR */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1) {
    gf sum = in0;

    sum ^= in1;
    return sum;
}
/* input: field elements in0, in1 */
/* return: in0 * in1 */
/*                                                                          */
/* Carry-less schoolbook multiplication followed by a two-step reduction:   */
/* each bit at position k >= 12 is folded onto positions k - 9 and k - 12,  */
/* i.e. x^12 is replaced by x^3 + 1.                                        */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1) {
    const uint32_t a = in0;
    const uint32_t b = in1;
    uint32_t acc = 0;
    uint32_t high;
    int k;

    /* a times every single bit of b; XOR takes the place of addition */
    for (k = 0; k < GFBITS; k++) {
        acc ^= a * (b & ((uint32_t)1 << k));
    }

    /* fold bits 14..22 down */
    high = acc & 0x7FC000;
    acc ^= (high >> 9) ^ (high >> 12);

    /* fold bits 12..13 down */
    high = acc & 0x3000;
    acc ^= (high >> 9) ^ (high >> 12);

    return (gf) (acc & ((1 << GFBITS) - 1));
}
/* input: field element in */ | |||
/* return: in^2 */ | |||
static inline gf gf_sq(gf in) { | |||
const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; | |||
uint32_t x = in; | |||
uint32_t t; | |||
x = (x | (x << 8)) & B[3]; | |||
x = (x | (x << 4)) & B[2]; | |||
x = (x | (x << 2)) & B[1]; | |||
x = (x | (x << 1)) & B[0]; | |||
t = x & 0x7FC000; | |||
x ^= t >> 9; | |||
x ^= t >> 12; | |||
t = x & 0x3000; | |||
x ^= t >> 9; | |||
x ^= t >> 12; | |||
return x & ((1 << GFBITS) - 1); | |||
} | |||
/* input: field element in */
/* return: in^(2^12 - 2), built with a fixed square-and-multiply chain */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in) {
    gf pow_11;    /* exponent 11 (binary) */
    gf pow_1111;  /* exponent 1111 (binary) */
    gf acc;
    int k;

    pow_11 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf_sq(in), in);

    acc = gf_sq(gf_sq(pow_11));
    pow_1111 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(acc, pow_11);

    acc = pow_1111;
    for (k = 0; k < 4; k++) {
        acc = gf_sq(acc);
    }
    acc = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(acc, pow_1111); /* 11111111 */

    acc = gf_sq(gf_sq(acc));
    acc = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(acc, pow_11);   /* 1111111111 */

    acc = gf_sq(acc);
    acc = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(acc, in);       /* 11111111111 */

    return gf_sq(acc);                                        /* 111111111110 */
}
/* input: field element den, num */ | |||
/* return: (num/den) */ | |||
gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num) { | |||
return PQCLEAN_MCELIECE348864_CLEAN_gf_mul(PQCLEAN_MCELIECE348864_CLEAN_gf_inv(den), num); | |||
} | |||
/* input: in0, in1 in GF((2^m)^t)*/ | |||
/* output: out = in0*in1 */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { | |||
int i, j; | |||
gf prod[ SYS_T * 2 - 1 ]; | |||
for (i = 0; i < SYS_T * 2 - 1; i++) { | |||
prod[i] = 0; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
for (j = 0; j < SYS_T; j++) { | |||
prod[i + j] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(in0[i], in1[j]); | |||
} | |||
} | |||
// | |||
for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { | |||
prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 877); | |||
prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 2888); | |||
prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 1781); | |||
prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 373); | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
out[i] = prod[i]; | |||
} | |||
} | |||
@@ -0,0 +1,22 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_GF_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_GF_H | |||
/* | |||
This file is for functions for field arithmetic | |||
*/ | |||
#include <stdint.h> | |||
/* A field element, stored in the low bits of a 16-bit word. */
typedef uint16_t gf; | |||
/* Returns a non-zero mask when a == 0 and 0 otherwise. */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a); | |||
/* Field addition (XOR of the two operands). */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1); | |||
/* Field multiplication. */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1); | |||
/* Returns num/den; note the denominator is the FIRST argument. */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num); | |||
/* Field inversion by exponentiation. */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in); | |||
/* NOTE(review): gf.c in this change does not define gf_mul2 -- confirm it is */
/* defined elsewhere, otherwise this prototype is stale. */
uint64_t PQCLEAN_MCELIECE348864_CLEAN_gf_mul2(gf a, gf b0, gf b1); | |||
/* Multiplication in the extension field; operands are arrays of SYS_T coefficients. */
void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); | |||
#endif | |||
@@ -0,0 +1,136 @@ | |||
#include "api.h" | |||
#include "aes256ctr.h" | |||
#include "controlbits.h" | |||
#include "crypto_hash.h" | |||
#include "decrypt.h" | |||
#include "encrypt.h" | |||
#include "params.h" | |||
#include "pk_gen.h" | |||
#include "randombytes.h" | |||
#include "sk_gen.h" | |||
#include "util.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc( | |||
uint8_t *c, | |||
uint8_t *key, | |||
const uint8_t *pk | |||
) { | |||
uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; | |||
uint8_t *e = two_e + 1; | |||
uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; | |||
PQCLEAN_MCELIECE348864_CLEAN_encrypt(c, e, pk); | |||
crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); | |||
memcpy(one_ec + 1, e, SYS_N / 8); | |||
memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); | |||
crypto_hash_32b(key, one_ec, sizeof(one_ec)); | |||
return 0; | |||
} | |||
int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec( | |||
uint8_t *key, | |||
const uint8_t *c, | |||
const uint8_t *sk | |||
) { | |||
int i; | |||
uint8_t ret_confirm = 0; | |||
uint8_t ret_decrypt = 0; | |||
uint16_t m; | |||
uint8_t conf[32]; | |||
uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; | |||
uint8_t *e = two_e + 1; | |||
uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; | |||
uint8_t *x = preimage; | |||
// | |||
ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_CLEAN_decrypt(e, sk + SYS_N / 8, c); | |||
crypto_hash_32b(conf, two_e, sizeof(two_e)); | |||
for (i = 0; i < 32; i++) { | |||
ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; | |||
} | |||
m = ret_decrypt | ret_confirm; | |||
m -= 1; | |||
m >>= 8; | |||
*x++ = (~m & 0) | (m & 1); | |||
for (i = 0; i < SYS_N / 8; i++) { | |||
*x++ = (~m & sk[i]) | (m & e[i]); | |||
} | |||
for (i = 0; i < SYND_BYTES + 32; i++) { | |||
*x++ = c[i]; | |||
} | |||
crypto_hash_32b(key, preimage, sizeof(preimage)); | |||
return 0; | |||
} | |||
int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair | |||
( | |||
uint8_t *pk, | |||
uint8_t *sk | |||
) { | |||
int i; | |||
uint8_t seed[ 32 ]; | |||
uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; | |||
uint8_t nonce[ 16 ] = {0}; | |||
uint8_t *rp; | |||
gf f[ SYS_T ]; // element in GF(2^mt) | |||
gf irr[ SYS_T ]; // Goppa polynomial | |||
uint32_t perm[ 1 << GFBITS ]; // random permutation | |||
randombytes(seed, sizeof(seed)); | |||
while (1) { | |||
rp = r; | |||
PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); | |||
memcpy(seed, &r[ sizeof(r) - 32 ], 32); | |||
for (i = 0; i < SYS_T; i++) { | |||
f[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(rp + i * 2); | |||
} | |||
rp += sizeof(f); | |||
if (PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(irr, f)) { | |||
continue; | |||
} | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
perm[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(rp + i * 4); | |||
} | |||
rp += sizeof(perm); | |||
if (PQCLEAN_MCELIECE348864_CLEAN_perm_check(perm)) { | |||
continue; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
PQCLEAN_MCELIECE348864_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); | |||
} | |||
if (PQCLEAN_MCELIECE348864_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { | |||
continue; | |||
} | |||
memcpy(sk, rp, SYS_N / 8); | |||
PQCLEAN_MCELIECE348864_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); | |||
break; | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,21 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H | |||
/* Compile-time parameters shared by the sources of this implementation. */
/* Number of bits of a field element (see GFMASK). */
#define GFBITS 12 | |||
/* Length in bits of the error vector and of a full matrix row (SYS_N/8 = 436 bytes). */
#define SYS_N 3488 | |||
/* Number of stored coefficients of the Goppa polynomial. */
#define SYS_T 64 | |||
/* 256 * 23 = 5888; the last component of SK_BYTES. */
#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) | |||
/* 128: SYS_T coefficients of two bytes each. */
#define IRR_BYTES (SYS_T * 2) | |||
/* 768 rows in the public-key matrix. */
#define PK_NROWS (SYS_T*GFBITS) | |||
/* 2720 columns kept per row of the public key. */
#define PK_NCOLS (SYS_N - PK_NROWS) | |||
/* 340 bytes per public-key row; PK_NROWS * PK_ROW_BYTES = 261120 bytes in total. */
#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) | |||
/* 436 + 128 + 5888 = 6452 bytes. */
#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) | |||
/* 96 bytes: one bit per public-key row. */
#define SYND_BYTES ((PK_NROWS + 7)/8) | |||
/* 0x0FFF: keeps the low GFBITS bits of a word. */
#define GFMASK ((1 << GFBITS) - 1) | |||
#endif | |||
@@ -0,0 +1,144 @@ | |||
/* | |||
This file is for public-key generation | |||
*/ | |||
#include <string.h> | |||
#include "benes.h" | |||
#include "controlbits.h" | |||
#include "gf.h" | |||
#include "params.h" | |||
#include "pk_gen.h" | |||
#include "root.h" | |||
#include "util.h" | |||
/* input: secret key sk */ | |||
/* output: public key pk */ | |||
int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { | |||
int i, j, k; | |||
int row, c; | |||
uint64_t buf[ 1 << GFBITS ]; | |||
uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; | |||
uint8_t mask; | |||
uint8_t b; | |||
gf g[ SYS_T + 1 ]; // Goppa polynomial | |||
gf L[ SYS_N ]; // support | |||
gf inv[ SYS_N ]; | |||
// | |||
g[ SYS_T ] = 1; | |||
for (i = 0; i < SYS_T; i++) { | |||
g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk); | |||
g[i] &= GFMASK; | |||
sk += 2; | |||
} | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
buf[i] = perm[i]; | |||
buf[i] <<= 31; | |||
buf[i] |= i; | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, buf); | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
perm[i] = buf[i] & GFMASK; | |||
} | |||
for (i = 0; i < SYS_N; i++) { | |||
L[i] = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf)perm[i]); | |||
} | |||
// filling the matrix | |||
PQCLEAN_MCELIECE348864_CLEAN_root(inv, g, L); | |||
for (i = 0; i < SYS_N; i++) { | |||
inv[i] = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(inv[i]); | |||
} | |||
for (i = 0; i < PK_NROWS; i++) { | |||
for (j = 0; j < SYS_N / 8; j++) { | |||
mat[i][j] = 0; | |||
} | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
for (j = 0; j < SYS_N; j += 8) { | |||
for (k = 0; k < GFBITS; k++) { | |||
b = (inv[j + 7] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 6] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 5] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 4] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 3] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 2] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 1] >> k) & 1; | |||
b <<= 1; | |||
b |= (inv[j + 0] >> k) & 1; | |||
mat[ i * GFBITS + k ][ j / 8 ] = b; | |||
} | |||
} | |||
for (j = 0; j < SYS_N; j++) { | |||
inv[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(inv[j], L[j]); | |||
} | |||
} | |||
// gaussian elimination | |||
for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
row = i * 8 + j; | |||
if (row >= GFBITS * SYS_T) { | |||
break; | |||
} | |||
for (k = row + 1; k < GFBITS * SYS_T; k++) { | |||
mask = mat[ row ][ i ] ^ mat[ k ][ i ]; | |||
mask >>= j; | |||
mask &= 1; | |||
mask = -mask; | |||
for (c = 0; c < SYS_N / 8; c++) { | |||
mat[ row ][ c ] ^= mat[ k ][ c ] & mask; | |||
} | |||
} | |||
if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic | |||
return -1; | |||
} | |||
for (k = 0; k < GFBITS * SYS_T; k++) { | |||
if (k != row) { | |||
mask = mat[ k ][ i ] >> j; | |||
mask &= 1; | |||
mask = -mask; | |||
for (c = 0; c < SYS_N / 8; c++) { | |||
mat[ k ][ c ] ^= mat[ row ][ c ] & mask; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
for (i = 0; i < PK_NROWS; i++) { | |||
memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES); | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,13 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H | |||
/* | |||
This file is for public-key generation | |||
*/ | |||
#include <stdint.h> | |||
int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); | |||
#endif | |||
@@ -0,0 +1,33 @@ | |||
/* | |||
This file is for evaluating a polynomial at one or more field elements | |||
*/ | |||
#include "root.h" | |||
#include "params.h" | |||
/* input: polynomial f and field element a */ | |||
/* return f(a) */ | |||
gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf *f, gf a) { | |||
int i; | |||
gf r; | |||
r = f[ SYS_T ]; | |||
for (i = SYS_T - 1; i >= 0; i--) { | |||
r = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(r, a); | |||
r = PQCLEAN_MCELIECE348864_CLEAN_gf_add(r, f[i]); | |||
} | |||
return r; | |||
} | |||
/* input: polynomial f and list of field elements L */ | |||
/* output: out = [ f(a) for a in L ] */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_root(gf *out, gf *f, gf *L) { | |||
int i; | |||
for (i = 0; i < SYS_N; i++) { | |||
out[i] = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]); | |||
} | |||
} | |||
@@ -0,0 +1,14 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_ROOT_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_ROOT_H | |||
/* | |||
This file is for evaluating a polynomial at one or more field elements | |||
*/ | |||
#include "gf.h" | |||
gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf * /*f*/, gf /*a*/); | |||
void PQCLEAN_MCELIECE348864_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); | |||
#endif | |||
@@ -0,0 +1,98 @@ | |||
/* | |||
This file is for secret-key generation | |||
*/ | |||
#include "sk_gen.h" | |||
#include "controlbits.h" | |||
#include "gf.h" | |||
#include "params.h" | |||
#include "util.h" | |||
/* input: f, element in GF((2^m)^t) */ | |||
/* output: out, minimal polynomial of f */ | |||
/* return: 0 for success and -1 for failure */ | |||
int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf *out, gf *f) { | |||
int i, j, k, c; | |||
gf mat[ SYS_T + 1 ][ SYS_T ]; | |||
gf mask, inv, t; | |||
// fill matrix | |||
mat[0][0] = 1; | |||
for (i = 1; i < SYS_T; i++) { | |||
mat[0][i] = 0; | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
mat[1][i] = f[i]; | |||
} | |||
for (j = 2; j <= SYS_T; j++) { | |||
PQCLEAN_MCELIECE348864_CLEAN_GF_mul(mat[j], mat[j - 1], f); | |||
} | |||
// gaussian | |||
for (j = 0; j < SYS_T; j++) { | |||
for (k = j + 1; k < SYS_T; k++) { | |||
mask = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(mat[ j ][ j ]); | |||
for (c = j; c < SYS_T + 1; c++) { | |||
mat[ c ][ j ] ^= mat[ c ][ k ] & mask; | |||
} | |||
} | |||
if ( mat[ j ][ j ] == 0 ) { // return if not systematic | |||
return -1; | |||
} | |||
inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(mat[j][j]); | |||
for (c = j; c < SYS_T + 1; c++) { | |||
mat[ c ][ j ] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], inv) ; | |||
} | |||
for (k = 0; k < SYS_T; k++) { | |||
if (k != j) { | |||
t = mat[ j ][ k ]; | |||
for (c = j; c < SYS_T + 1; c++) { | |||
mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], t); | |||
} | |||
} | |||
} | |||
} | |||
for (i = 0; i < SYS_T; i++) { | |||
out[i] = mat[ SYS_T ][ i ]; | |||
} | |||
return 0; | |||
} | |||
/* input: permutation p represented as a list of 32-bit intergers */ | |||
/* output: -1 if some interger repeats in p */ | |||
/* 0 otherwise */ | |||
int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t *p) { | |||
int i; | |||
uint64_t list[1 << GFBITS]; | |||
for (i = 0; i < (1 << GFBITS); i++) { | |||
list[i] = p[i]; | |||
} | |||
PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, list); | |||
for (i = 1; i < (1 << GFBITS); i++) { | |||
if (list[i - 1] == list[i]) { | |||
return -1; | |||
} | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,16 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H | |||
/* | |||
This file is for secret-key generation | |||
*/ | |||
#include "gf.h" | |||
#include <stdint.h> | |||
int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); | |||
int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t * /*p*/); | |||
#endif | |||
@@ -0,0 +1,33 @@ | |||
/* | |||
This file is for syndrome computation | |||
*/ | |||
#include "synd.h" | |||
#include "params.h" | |||
#include "root.h" | |||
/* input: Goppa polynomial f, support L, received word r */ | |||
/* output: out, the syndrome of length 2t */ | |||
void PQCLEAN_MCELIECE348864_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { | |||
int i, j; | |||
gf e, e_inv, c; | |||
for (j = 0; j < 2 * SYS_T; j++) { | |||
out[j] = 0; | |||
} | |||
for (i = 0; i < SYS_N; i++) { | |||
c = (r[i / 8] >> (i % 8)) & 1; | |||
e = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]); | |||
e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e, e)); | |||
for (j = 0; j < 2 * SYS_T; j++) { | |||
out[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, c)); | |||
e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, L[i]); | |||
} | |||
} | |||
} | |||
@@ -0,0 +1,12 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_SYND_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_SYND_H | |||
/* | |||
This file is for syndrome computation | |||
*/ | |||
#include "gf.h" | |||
void PQCLEAN_MCELIECE348864_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); | |||
#endif | |||
@@ -0,0 +1,42 @@ | |||
/* | |||
This file is for matrix transposition | |||
*/ | |||
#include "transpose.h" | |||
#include <stdint.h> | |||
/* input: in, a 64x64 matrix over GF(2), one row per 64-bit word */
/* output: out, transpose of in (bit j of out[i] equals bit i of in[j]) */
void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) {
    /* low-half selector for each block size; the high-half selector is its complement */
    static const uint64_t low_half[6] = {
        0x5555555555555555ULL,
        0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL,
        0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL,
        0x00000000FFFFFFFFULL
    };
    int level, base, row;

    for (row = 0; row < 64; row++) {
        out[row] = in[row];
    }

    /* swap off-diagonal blocks of size 32, 16, ..., 1 */
    for (level = 5; level >= 0; level--) {
        const int span = 1 << level;
        const uint64_t lo = low_half[level];
        const uint64_t hi = ~lo;

        for (base = 0; base < 64; base += 2 * span) {
            for (row = base; row < base + span; row++) {
                const uint64_t upper = out[row];
                const uint64_t lower = out[row + span];

                out[row] = (upper & lo) | ((lower & lo) << span);
                out[row + span] = ((upper & hi) >> span) | (lower & hi);
            }
        }
    }
}
@@ -0,0 +1,13 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H | |||
/* | |||
This file is for matrix transposition | |||
*/ | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); | |||
#endif | |||
@@ -0,0 +1,67 @@ | |||
/* | |||
This file is for loading/storing data in a little-endian fashion | |||
*/ | |||
#include "util.h" | |||
#include "params.h" | |||
/* Writes the 16-bit value a to dest[0..1], least significant byte first. */
void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char *dest, gf a) {
    const unsigned char low = (unsigned char) (a & 0xFF);
    const unsigned char high = (unsigned char) (a >> 8);

    dest[0] = low;
    dest[1] = high;
}
uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char *src) { | |||
uint16_t a; | |||
a = src[1]; | |||
a <<= 8; | |||
a |= src[0]; | |||
return a & GFMASK; | |||
} | |||
/* Reads a little-endian 32-bit value from in[0..3]. */
uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char *in) {
    return ((uint32_t) in[0])
           | ((uint32_t) in[1] << 8)
           | ((uint32_t) in[2] << 16)
           | ((uint32_t) in[3] << 24);
}
/* Writes the 64-bit value in to out[0..7], least significant byte first. */
void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char *out, uint64_t in) {
    int k;

    for (k = 0; k < 8; k++) {
        out[k] = (unsigned char) ((in >> (8 * k)) & 0xFF);
    }
}
/* Reads a little-endian 64-bit value from in[0..7]. */
uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char *in) {
    uint64_t value = 0;
    int k;

    for (k = 0; k < 8; k++) {
        value |= (uint64_t) in[k] << (8 * k);
    }

    return value;
}
/* Reverses the 16 bits of a and returns the result shifted right by 4. */
gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf a) {
    /* the swap stages are independent of each other, so they may run in any order */
    a = (gf) (((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1)); /* neighbouring bits */
    a = (gf) (((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2)); /* bit pairs */
    a = (gf) (((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4)); /* nibbles */
    a = (gf) (((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8)); /* bytes */

    return (gf) (a >> 4);
}
@@ -0,0 +1,22 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_CLEAN_UTIL_H | |||
#define PQCLEAN_MCELIECE348864_CLEAN_UTIL_H | |||
/* | |||
This file is for loading/storing data in a little-endian fashion | |||
*/ | |||
#include "gf.h" | |||
#include <stdint.h> | |||
void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); | |||
uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char * /*src*/); | |||
uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char * /*in*/); | |||
void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); | |||
uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char * /*in*/); | |||
gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf /*a*/); | |||
#endif | |||
@@ -0,0 +1,16 @@ | |||
Public Domain. | |||
Authors of Classic McEliece in alphabetical order: | |||
Daniel J. Bernstein, University of Illinois at Chicago | |||
Tung Chou, Osaka University | |||
Tanja Lange, Technische Universiteit Eindhoven | |||
Ingo von Maurich, self | |||
Rafael Misoczki, Intel Corporation | |||
Ruben Niederhagen, Fraunhofer SIT | |||
Edoardo Persichetti, Florida Atlantic University | |||
Christiane Peters, self | |||
Peter Schwabe, Radboud University | |||
Nicolas Sendrier, Inria | |||
Jakub Szefer, Yale University | |||
Wen Wang, Yale University |
@@ -0,0 +1,41 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
# Name of the static library produced by the default target.
LIB = libmceliece348864_sse.a | |||
# C sources followed by the assembly (.S) sources.
SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \ | |||
fft_tr.c gf.c operations.c pk_gen.c sk_gen.c transpose.c util.c \ | |||
vec.c vec128.c \ | |||
consts.S syndrome_asm.S transpose_64x128_sp_asm.S \ | |||
transpose_64x64_asm.S update_asm.S vec128_mul_asm.S \ | |||
vec_mul_asm.S vec_reduce_asm.S | |||
# Headers and .inc files; every C object depends on all of them (see the %.c rule).
HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \ | |||
decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \ | |||
pk_gen.h sk_gen.h transpose.h util.h vec128.h vec.h \ | |||
consts.inc powers.inc scalars_2x.inc scalars.inc | |||
# One object per entry in SOURCES; this is what goes into the archive.
OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \ | |||
fft_tr.o gf.o operations.o pk_gen.o transpose.o sk_gen.o util.o \ | |||
vec.o vec128.o \ | |||
consts.o syndrome_asm.o transpose_64x128_sp_asm.o \ | |||
transpose_64x64_asm.o update_asm.o vec128_mul_asm.o \ | |||
vec_mul_asm.o vec_reduce_asm.o | |||
# Warnings are errors; POPCNT, BMI and SSE4.1 code generation is enabled.
# EXTRAFLAGS lets the caller append further options.
CFLAGS = -O3 -std=c99 -mpopcnt -mbmi -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \ | |||
-Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \ | |||
-I../../../common/ $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
# Assembly sources are built without CFLAGS.
%.o: %.S | |||
$(CC) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
# Removes the objects and the library.
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) |
@@ -0,0 +1,13 @@ | |||
#include "aes256ctr.h" | |||
void PQCLEAN_MCELIECE348864_SSE_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]) { | |||
aes256ctx state; | |||
aes256_keyexp(&state, key); | |||
aes256_ctr(out, outlen, nonce, &state); | |||
aes256_ctx_release(&state); | |||
} |
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_SSE_AES256CTR_H | |||
#define PQCLEAN_MCELIECE348864_SSE_AES256CTR_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "aes.h" | |||
void PQCLEAN_MCELIECE348864_SSE_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES] | |||
); | |||
#endif |
@@ -0,0 +1,32 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_SSE_API_H | |||
#define PQCLEAN_MCELIECE348864_SSE_API_H | |||
#include <stdint.h> | |||
/* Public KEM interface of the SSE implementation; all sizes are in bytes. */
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_ALGNAME "Classic McEliece 348864" | |||
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_PUBLICKEYBYTES 261120 | |||
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_SECRETKEYBYTES 6452 | |||
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_CIPHERTEXTBYTES 128 | |||
/* Length of the shared secret written to key. */
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_BYTES 32 | |||
/* Encapsulation: reads pk, writes the ciphertext c and the shared secret key. */
int PQCLEAN_MCELIECE348864_SSE_crypto_kem_enc( | |||
uint8_t *c, | |||
uint8_t *key, | |||
const uint8_t *pk | |||
); | |||
/* Decapsulation: reads c and sk, writes the shared secret key. */
int PQCLEAN_MCELIECE348864_SSE_crypto_kem_dec( | |||
uint8_t *key, | |||
const uint8_t *c, | |||
const uint8_t *sk | |||
); | |||
/* Key generation: writes a fresh public key pk and secret key sk. */
int PQCLEAN_MCELIECE348864_SSE_crypto_kem_keypair | |||
( | |||
uint8_t *pk, | |||
uint8_t *sk | |||
); | |||
#endif | |||
@@ -0,0 +1,287 @@ | |||
/* | |||
This file is for Benes network related functions | |||
*/ | |||
#include "benes.h" | |||
#include "params.h" | |||
#include "transpose.h" | |||
#include "util.h" | |||
/* Conditionally swaps rows at distance 1: for each of the 32 pairs */
/* (bs[2p], bs[2p+1]), the bit positions set in cond[p] are exchanged. */
static void layer_0(uint64_t *bs, const uint64_t *cond) {
    int pair;

    for (pair = 0; pair < 32; pair++) {
        uint64_t *rows = &bs[2 * pair];
        const uint64_t swap = (rows[0] ^ rows[1]) & cond[pair];

        rows[0] ^= swap;
        rows[1] ^= swap;
    }
}
/* Conditionally swaps rows at distance 2; cond supplies one 64-bit swap */
/* mask per row pair, consumed in order (32 masks in total). */
static void layer_1(uint64_t *bs, const uint64_t *cond) {
    int base, off;
    int next = 0;

    for (base = 0; base < 64; base += 4) {
        for (off = 0; off < 2; off++) {
            uint64_t *upper = &bs[base + off];
            uint64_t *lower = &bs[base + off + 2];
            const uint64_t swap = (*upper ^ *lower) & cond[next++];

            *upper ^= swap;
            *lower ^= swap;
        }
    }
}
/* Conditionally swaps rows at distance 4; cond supplies one 64-bit swap */
/* mask per row pair, consumed in order (32 masks in total). */
static void layer_2(uint64_t *bs, const uint64_t *cond) {
    int base, off;
    int next = 0;

    for (base = 0; base < 64; base += 8) {
        for (off = 0; off < 4; off++) {
            uint64_t *upper = &bs[base + off];
            uint64_t *lower = &bs[base + off + 4];
            const uint64_t swap = (*upper ^ *lower) & cond[next++];

            *upper ^= swap;
            *lower ^= swap;
        }
    }
}
/* Conditionally swaps rows at distance 8; cond supplies one 64-bit swap */
/* mask per row pair, consumed in order (32 masks in total). */
static void layer_3(uint64_t *bs, const uint64_t *cond) {
    int base, off;
    int next = 0;

    for (base = 0; base < 64; base += 16) {
        for (off = 0; off < 8; off++) {
            uint64_t *upper = &bs[base + off];
            uint64_t *lower = &bs[base + off + 8];
            const uint64_t swap = (*upper ^ *lower) & cond[next++];

            *upper ^= swap;
            *lower ^= swap;
        }
    }
}
/* Conditionally swaps rows at distance 16; cond supplies one 64-bit swap */
/* mask per row pair, consumed in order (32 masks in total). */
static void layer_4(uint64_t *bs, const uint64_t *cond) {
    int base, off;
    int next = 0;

    for (base = 0; base < 64; base += 32) {
        for (off = 0; off < 16; off++) {
            uint64_t *upper = &bs[base + off];
            uint64_t *lower = &bs[base + off + 16];
            const uint64_t swap = (*upper ^ *lower) & cond[next++];

            *upper ^= swap;
            *lower ^= swap;
        }
    }
}
/* Conditionally swaps rows at distance 32: row k is paired with row k + 32 */
/* and the bit positions set in cond[k] are exchanged (k = 0..31). */
static void layer_5(uint64_t *bs, const uint64_t *cond) {
    int k;

    for (k = 0; k < 32; k++) {
        uint64_t *upper = &bs[k];
        uint64_t *lower = &bs[k + 32];
        const uint64_t swap = (*upper ^ *lower) & cond[k];

        *upper ^= swap;
        *lower ^= swap;
    }
}
/* input: bits, control bits as array of bytes (23 blocks of 256 bytes) */
/* output: out, control bits as 23 arrays of 32 64-bit words */
void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t out[][32], const unsigned char *bits) {
    uint64_t cond[64];
    int block, i;

    /* 6 + 6 + 5 + 6 blocks: the first six and the last six are read as */
    /* 32-bit words and transposed, the eleven in between are read directly */
    for (block = 0; block < 23; block++) {
        const unsigned char *src = bits + block * 256;

        if (block < 6 || block >= 17) {
            for (i = 0; i < 64; i++) {
                cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(src + i * 4);
            }

            PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond);

            /* only the first 32 transposed words are kept */
            for (i = 0; i < 32; i++) {
                out[ block ][i] = cond[i];
            }
        } else {
            for (i = 0; i < 32; i++) {
                out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(src + i * 8);
            }
        }
    }
}
/* input: r, sequence of bits to be permuted */ | |||
/* cond, control bits as array of 128-bit vectors */ | |||
/* rev, 0 for normal application; !0 for inverse */ | |||
/* output: r, permuted bits */ | |||
void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t *r, uint64_t cond[][32], int rev) { | |||
int block, inc; | |||
uint64_t *bs = r; | |||
// | |||
if (rev == 0) { | |||
block = 0; | |||
inc = 1; | |||
} else { | |||
block = 22; | |||
inc = -1; | |||
} | |||
PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); | |||
layer_0(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_5(bs, cond[ block ]); | |||
block += inc; | |||
PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); | |||
layer_0(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_5(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_0(bs, cond[ block ]); | |||
block += inc; | |||
PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); | |||
layer_5(bs, cond[ block ]); | |||
block += inc; | |||
layer_4(bs, cond[ block ]); | |||
block += inc; | |||
layer_3(bs, cond[ block ]); | |||
block += inc; | |||
layer_2(bs, cond[ block ]); | |||
block += inc; | |||
layer_1(bs, cond[ block ]); | |||
block += inc; | |||
layer_0(bs, cond[ block ]); | |||
//block += inc; | |||
PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); | |||
} | |||
@@ -0,0 +1,15 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_SSE_BENES_H | |||
#define PQCLEAN_MCELIECE348864_SSE_BENES_H | |||
/* | |||
This file is for Benes network related functions | |||
*/ | |||
#include "gf.h" | |||
#include "vec128.h" | |||
void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); | |||
void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); | |||
#endif | |||
@@ -0,0 +1,220 @@ | |||
/* | |||
This file is for the inversion-free Berlekamp-Massey algorithm | |||
see https://ieeexplore.ieee.org/document/87857 | |||
*/ | |||
#include "bm.h" | |||
#include "gf.h" | |||
#include "util.h" | |||
#include "vec.h" | |||
#include "vec128.h" | |||
#include <assert.h> | |||
#include <stdint.h> | |||
extern void PQCLEAN_MCELIECE348864_SSE_update_asm(void *, gf, int); | |||
extern gf PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(uint64_t *); | |||
/* return: all-ones if a != 0, zero if a == 0 (computed without branching) */
static inline uint64_t mask_nonzero(gf a) {
    /* a - 1 wraps around, setting the top bit, only for a == 0 */
    const uint64_t is_zero = ((uint64_t) a - 1) >> 63;

    return is_zero - 1;
}
/* return: all-ones if a <= b, zero otherwise (computed without branching) */
static inline uint64_t mask_leq(uint16_t a, uint16_t b) {
    /* the 64-bit difference b - a wraps around, setting the top bit, only for a > b */
    const uint64_t borrow = ((uint64_t) b - (uint64_t) a) >> 63;

    return borrow - 1;
}
static void vec_cmov(uint64_t out[][2], uint64_t mask) { | |||
int i; | |||
for (i = 0; i < GFBITS; i++) { | |||
out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); | |||
} | |||
} | |||
/* Exchanges bit groups between in[idx0] and in[idx1]: */
/*   in[idx0] <- (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << 2^b) */
/*   in[idx1] <- ((in[idx0] & mask[1]) >> 2^b) | (in[idx1] & mask[1]) */
/* where both right-hand sides use the values from before the call. */
static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) {
    const int shift = 1 << b;

    const vec128 first = in[idx0];
    const vec128 second = in[idx1];

    const vec128 first_keep = PQCLEAN_MCELIECE348864_SSE_vec128_and(first, mask[0]);
    const vec128 second_up = PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(
                                 PQCLEAN_MCELIECE348864_SSE_vec128_and(second, mask[0]), shift);

    const vec128 first_down = PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(
                                  PQCLEAN_MCELIECE348864_SSE_vec128_and(first, mask[1]), shift);
    const vec128 second_keep = PQCLEAN_MCELIECE348864_SSE_vec128_and(second, mask[1]);

    in[idx0] = PQCLEAN_MCELIECE348864_SSE_vec128_or(first_keep, second_up);
    in[idx1] = PQCLEAN_MCELIECE348864_SSE_vec128_or(first_down, second_keep);
}
/* input: in, field elements in bitsliced form */ | |||
/* output: out, field elements in non-bitsliced form */ | |||
static inline void get_coefs(gf *out, vec128 *in) { | |||
int i, k; | |||
vec128 mask[4][2]; | |||
vec128 buf[16]; | |||
for (i = 0; i < GFBITS; i++) { | |||
buf[i] = in[i]; | |||
} | |||
for (i = GFBITS; i < 16; i++) { | |||
buf[i] = PQCLEAN_MCELIECE348864_SSE_vec128_setzero(); | |||
} | |||
mask[0][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x5555); | |||
mask[0][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xAAAA); | |||
mask[1][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x3333); | |||
mask[1][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xCCCC); | |||
mask[2][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x0F0F); | |||
mask[2][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xF0F0); | |||
mask[3][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x00FF); | |||
mask[3][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xFF00); | |||
interleave(buf, 0, 8, mask[3], 3); | |||
interleave(buf, 1, 9, mask[3], 3); | |||
interleave(buf, 2, 10, mask[3], 3); | |||
interleave(buf, 3, 11, mask[3], 3); | |||
interleave(buf, 4, 12, mask[3], 3); | |||
interleave(buf, 5, 13, mask[3], 3); | |||
interleave(buf, 6, 14, mask[3], 3); | |||
interleave(buf, 7, 15, mask[3], 3); | |||
interleave(buf, 0, 4, mask[2], 2); | |||
interleave(buf, 1, 5, mask[2], 2); | |||
interleave(buf, 2, 6, mask[2], 2); | |||
interleave(buf, 3, 7, mask[2], 2); | |||
interleave(buf, 8, 12, mask[2], 2); | |||
interleave(buf, 9, 13, mask[2], 2); | |||
interleave(buf, 10, 14, mask[2], 2); | |||
interleave(buf, 11, 15, mask[2], 2); | |||
interleave(buf, 0, 2, mask[1], 1); | |||
interleave(buf, 1, 3, mask[1], 1); | |||
interleave(buf, 4, 6, mask[1], 1); | |||
interleave(buf, 5, 7, mask[1], 1); | |||
interleave(buf, 8, 10, mask[1], 1); | |||
interleave(buf, 9, 11, mask[1], 1); | |||
interleave(buf, 12, 14, mask[1], 1); | |||
interleave(buf, 13, 15, mask[1], 1); | |||
interleave(buf, 0, 1, mask[0], 0); | |||
interleave(buf, 2, 3, mask[0], 0); | |||
interleave(buf, 4, 5, mask[0], 0); | |||
interleave(buf, 6, 7, mask[0], 0); | |||
interleave(buf, 8, 9, mask[0], 0); | |||
interleave(buf, 10, 11, mask[0], 0); | |||
interleave(buf, 12, 13, mask[0], 0); | |||
interleave(buf, 14, 15, mask[0], 0); | |||
for (i = 0; i < 16; i++) { | |||
for (k = 0; k < 4; k++) { | |||
out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; | |||
out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; | |||
} | |||
} | |||
} | |||
/* input: in, sequence of field elements */ | |||
/* output: out, minimal polynomial of in */ | |||
void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { | |||
uint16_t i; | |||
uint16_t N, L; | |||
uint64_t prod[ GFBITS ]; | |||
uint64_t in_tmp[ GFBITS ]; | |||
uint64_t db[ GFBITS ][ 2 ]; | |||
uint64_t BC_tmp[ GFBITS ][ 2 ]; | |||
uint64_t BC[ GFBITS ][ 2 ]; | |||
uint64_t mask, t; | |||
gf d, b, c0 = 1; | |||
gf coefs[SYS_T * 2]; | |||
// init | |||
BC[0][1] = 0; | |||
BC[0][0] = 1; | |||
BC[0][0] <<= 63; | |||
for (i = 1; i < GFBITS; i++) { | |||
BC[i][0] = BC[i][1] = 0; | |||
} | |||
b = 1; | |||
L = 0; | |||
// | |||
get_coefs(coefs, in); | |||
for (i = 0; i < GFBITS; i++) { | |||
in_tmp[i] = 0; | |||
} | |||
for (N = 0; N < SYS_T * 2; N++) { | |||
// computing d | |||
PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(prod, in_tmp, &BC[0][1], 16); | |||
PQCLEAN_MCELIECE348864_SSE_update_asm(in_tmp, coefs[N], 8); | |||
d = PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(prod); | |||
t = PQCLEAN_MCELIECE348864_SSE_gf_mul2(c0, coefs[N], b); | |||
d ^= t & 0xFFFFFFFF; | |||
// 3 cases | |||
mask = mask_nonzero(d) & mask_leq(L * 2, N); | |||
for (i = 0; i < GFBITS; i++) { | |||
db[i][0] = (d >> i) & 1; | |||
db[i][0] = -db[i][0]; | |||
db[i][1] = (b >> i) & 1; | |||
db[i][1] = -db[i][1]; | |||
} | |||
PQCLEAN_MCELIECE348864_SSE_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); | |||
vec_cmov(BC, mask); | |||
PQCLEAN_MCELIECE348864_SSE_update_asm(BC, mask & c0, 16); | |||
for (i = 0; i < GFBITS; i++) { | |||
BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; | |||
} | |||
c0 = t >> 32; | |||
b = (d & mask) | (b & ~mask); | |||
L = ((N + 1 - L) & mask) | (L & ~mask); | |||
} | |||
c0 = PQCLEAN_MCELIECE348864_SSE_gf_inv(c0); | |||
for (i = 0; i < GFBITS; i++) { | |||
out[i] = (c0 >> i) & 1; | |||
out[i] = -out[i]; | |||
} | |||
PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(out, out, &BC[0][1], 16); | |||
} | |||
@@ -0,0 +1,17 @@ | |||
#ifndef PQCLEAN_MCELIECE348864_SSE_BM_H | |||
#define PQCLEAN_MCELIECE348864_SSE_BM_H | |||
/* | |||
This file is for the inversion-free Berlekamp-Massey algorithm | |||
see https://ieeexplore.ieee.org/document/87857 | |||
*/ | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include "vec128.h" | |||
/*
  Inversion-free Berlekamp-Massey.
  input:  in, sequence of field elements in bitsliced form (the definition
          declares it as vec128 in[GFBITS])
  output: out, minimal polynomial of in (the definition declares it as
          uint64_t out[GFBITS])
*/
void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t *out, vec128 *in); | |||
#endif | |||