Classic McEliece (#259)

* Add McEliece reference implementations

* Add Vec implementations of McEliece

* Add sse implementations

* Add AVX2 implementations

* Get rid of stuff not supported by Mac ABI

* restrict to two cores

* Ditch .data files

* Remove .hidden from all .S files

* speed up duplicate consistency tests by batching

* make cpuinfo more robust

* Hope to stabilize macos cpuinfo without ccache

* Revert "Hope to stabilize macos cpuinfo without ccache"

This reverts commit 6129c3cabe.

* Just hardcode what's available at travis

* Fixed-size types in api.h

* namespace all header files in mceliece

* Ditch operations.h

* Get rid of static inline functions

* fixup! Ditch operations.h
Branch: kyber
Thom Wiggers authored 4 years ago, committed by Kris Kwiatkowski
Commit ac2c20045c
100 changed files with 30649 additions and 2 deletions
  1. +2 -2 .circleci/config.yml
  2. +48 -0 crypto_kem/mceliece348864/META.yml
  3. +16 -0 crypto_kem/mceliece348864/avx/LICENSE
  4. +42 -0 crypto_kem/mceliece348864/avx/Makefile
  5. +13 -0 crypto_kem/mceliece348864/avx/aes256ctr.c
  6. +17 -0 crypto_kem/mceliece348864/avx/aes256ctr.h
  7. +32 -0 crypto_kem/mceliece348864/avx/api.h
  8. +287 -0 crypto_kem/mceliece348864/avx/benes.c
  9. +15 -0 crypto_kem/mceliece348864/avx/benes.h
  10. +219 -0 crypto_kem/mceliece348864/avx/bm.c
  11. +14 -0 crypto_kem/mceliece348864/avx/bm.h
  12. +33 -0 crypto_kem/mceliece348864/avx/consts.S
  13. +238 -0 crypto_kem/mceliece348864/avx/consts.inc
  14. +274 -0 crypto_kem/mceliece348864/avx/controlbits.c
  15. +15 -0 crypto_kem/mceliece348864/avx/controlbits.h
  16. +7 -0 crypto_kem/mceliece348864/avx/crypto_hash.h
  17. +234 -0 crypto_kem/mceliece348864/avx/decrypt.c
  18. +10 -0 crypto_kem/mceliece348864/avx/decrypt.h
  19. +99 -0 crypto_kem/mceliece348864/avx/encrypt.c
  20. +11 -0 crypto_kem/mceliece348864/avx/encrypt.h
  21. +172 -0 crypto_kem/mceliece348864/avx/fft.c
  22. +18 -0 crypto_kem/mceliece348864/avx/fft.h
  23. +355 -0 crypto_kem/mceliece348864/avx/fft_tr.c
  24. +14 -0 crypto_kem/mceliece348864/avx/fft_tr.h
  25. +169 -0 crypto_kem/mceliece348864/avx/gf.c
  26. +26 -0 crypto_kem/mceliece348864/avx/gf.h
  27. +1208 -0 crypto_kem/mceliece348864/avx/int32_sort.c
  28. +9 -0 crypto_kem/mceliece348864/avx/int32_sort.h
  29. +136 -0 crypto_kem/mceliece348864/avx/operations.c
  30. +21 -0 crypto_kem/mceliece348864/avx/params.h
  31. +276 -0 crypto_kem/mceliece348864/avx/pk_gen.c
  32. +13 -0 crypto_kem/mceliece348864/avx/pk_gen.h
  33. +224 -0 crypto_kem/mceliece348864/avx/powers.inc
  34. +70 -0 crypto_kem/mceliece348864/avx/scalars.inc
  35. +70 -0 crypto_kem/mceliece348864/avx/scalars_2x.inc
  36. +98 -0 crypto_kem/mceliece348864/avx/sk_gen.c
  37. +16 -0 crypto_kem/mceliece348864/avx/sk_gen.h
  38. +530 -0 crypto_kem/mceliece348864/avx/syndrome_asm.S
  39. +17 -0 crypto_kem/mceliece348864/avx/transpose.c
  40. +17 -0 crypto_kem/mceliece348864/avx/transpose.h
  41. +8145 -0 crypto_kem/mceliece348864/avx/transpose_64x256_sp_asm.S
  42. +8467 -0 crypto_kem/mceliece348864/avx/transpose_64x64_asm.S
  43. +18 -0 crypto_kem/mceliece348864/avx/uint32_sort.c
  44. +9 -0 crypto_kem/mceliece348864/avx/uint32_sort.h
  45. +354 -0 crypto_kem/mceliece348864/avx/update_asm.S
  46. +106 -0 crypto_kem/mceliece348864/avx/util.c
  47. +33 -0 crypto_kem/mceliece348864/avx/util.h
  48. +25 -0 crypto_kem/mceliece348864/avx/vec.c
  49. +13 -0 crypto_kem/mceliece348864/avx/vec.h
  50. +83 -0 crypto_kem/mceliece348864/avx/vec128.c
  51. +41 -0 crypto_kem/mceliece348864/avx/vec128.h
  52. +1369 -0 crypto_kem/mceliece348864/avx/vec128_mul_asm.S
  53. +137 -0 crypto_kem/mceliece348864/avx/vec256.c
  54. +45 -0 crypto_kem/mceliece348864/avx/vec256.h
  55. +1736 -0 crypto_kem/mceliece348864/avx/vec256_mul_asm.S
  56. +1106 -0 crypto_kem/mceliece348864/avx/vec_mul_asm.S
  57. +1115 -0 crypto_kem/mceliece348864/avx/vec_mul_sp_asm.S
  58. +356 -0 crypto_kem/mceliece348864/avx/vec_reduce_asm.S
  59. +16 -0 crypto_kem/mceliece348864/clean/LICENSE
  60. +27 -0 crypto_kem/mceliece348864/clean/Makefile
  61. +24 -0 crypto_kem/mceliece348864/clean/Makefile.Microsoft_nmake
  62. +13 -0 crypto_kem/mceliece348864/clean/aes256ctr.c
  63. +17 -0 crypto_kem/mceliece348864/clean/aes256ctr.h
  64. +32 -0 crypto_kem/mceliece348864/clean/api.h
  65. +139 -0 crypto_kem/mceliece348864/clean/benes.c
  66. +14 -0 crypto_kem/mceliece348864/clean/benes.h
  67. +83 -0 crypto_kem/mceliece348864/clean/bm.c
  68. +13 -0 crypto_kem/mceliece348864/clean/bm.h
  69. +274 -0 crypto_kem/mceliece348864/clean/controlbits.c
  70. +15 -0 crypto_kem/mceliece348864/clean/controlbits.h
  71. +7 -0 crypto_kem/mceliece348864/clean/crypto_hash.h
  72. +90 -0 crypto_kem/mceliece348864/clean/decrypt.c
  73. +10 -0 crypto_kem/mceliece348864/clean/decrypt.h
  74. +138 -0 crypto_kem/mceliece348864/clean/encrypt.c
  75. +11 -0 crypto_kem/mceliece348864/clean/encrypt.h
  76. +139 -0 crypto_kem/mceliece348864/clean/gf.c
  77. +22 -0 crypto_kem/mceliece348864/clean/gf.h
  78. +136 -0 crypto_kem/mceliece348864/clean/operations.c
  79. +21 -0 crypto_kem/mceliece348864/clean/params.h
  80. +144 -0 crypto_kem/mceliece348864/clean/pk_gen.c
  81. +13 -0 crypto_kem/mceliece348864/clean/pk_gen.h
  82. +33 -0 crypto_kem/mceliece348864/clean/root.c
  83. +14 -0 crypto_kem/mceliece348864/clean/root.h
  84. +98 -0 crypto_kem/mceliece348864/clean/sk_gen.c
  85. +16 -0 crypto_kem/mceliece348864/clean/sk_gen.h
  86. +33 -0 crypto_kem/mceliece348864/clean/synd.c
  87. +12 -0 crypto_kem/mceliece348864/clean/synd.h
  88. +42 -0 crypto_kem/mceliece348864/clean/transpose.c
  89. +13 -0 crypto_kem/mceliece348864/clean/transpose.h
  90. +67 -0 crypto_kem/mceliece348864/clean/util.c
  91. +22 -0 crypto_kem/mceliece348864/clean/util.h
  92. +16 -0 crypto_kem/mceliece348864/sse/LICENSE
  93. +41 -0 crypto_kem/mceliece348864/sse/Makefile
  94. +13 -0 crypto_kem/mceliece348864/sse/aes256ctr.c
  95. +17 -0 crypto_kem/mceliece348864/sse/aes256ctr.h
  96. +32 -0 crypto_kem/mceliece348864/sse/api.h
  97. +287 -0 crypto_kem/mceliece348864/sse/benes.c
  98. +15 -0 crypto_kem/mceliece348864/sse/benes.h
  99. +220 -0 crypto_kem/mceliece348864/sse/bm.c
  100. +17 -0 crypto_kem/mceliece348864/sse/bm.h

+2 -2 .circleci/config.yml

@@ -28,7 +28,7 @@ version: 2.1
export CC=\"ccache ${CC}\" &&
pip3 install -r requirements.txt &&
mkdir test-results &&
- cd test && python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=auto"
+ cd test && python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=2"
no_output_timeout: 2h
- save_cache:
key: v1-ccache-{{ .Environment.CIRCLE_JOB }}
@@ -59,7 +59,7 @@ version: 2.1
pip3 install -r requirements.txt
mkdir test-results
cd test
- python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=auto
+ python3 -m pytest --verbose --junitxml=test-results/pytest/results.xml --numprocesses=2
no_output_timeout: 2h
- store_test_results:
path: test/test-results


+48 -0 crypto_kem/mceliece348864/META.yml

@@ -0,0 +1,48 @@
name: Classic McEliece 348864
type: kem
claimed-nist-level: 1
claimed-security: IND-CCA2
length-public-key: 261120
length-secret-key: 6452
length-ciphertext: 128
length-shared-secret: 32
nistkat-sha256: f0a166a9115a0c8481c85aee3fe901729a21a8a84a5d2b871fb99fc50223046b
principal-submitters:
- Daniel J. Bernstein
- Tung Chou
- Tanja Lange
- Ingo von Maurich
- Rafael Misoczki
- Ruben Niederhagen
- Edoardo Persichetti
- Christiane Peters
- Peter Schwabe
- Nicolas Sendrier
- Jakub Szefer
- Wen Wang
auxiliary-submitters: []
implementations:
- name: clean
version: SUPERCOP-20191221
- name: vec
version: SUPERCOP-20191221
- name: sse
version: SUPERCOP-20191221
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- sse4_1
- popcnt
- name: avx
version: SUPERCOP-20191221
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- avx2
- popcnt

+16 -0 crypto_kem/mceliece348864/avx/LICENSE

@@ -0,0 +1,16 @@
Public Domain.

Authors of Classic McEliece in alphabetical order:

Daniel J. Bernstein, University of Illinois at Chicago
Tung Chou, Osaka University
Tanja Lange, Technische Universiteit Eindhoven
Ingo von Maurich, self
Rafael Misoczki, Intel Corporation
Ruben Niederhagen, Fraunhofer SIT
Edoardo Persichetti, Florida Atlantic University
Christiane Peters, self
Peter Schwabe, Radboud University
Nicolas Sendrier, Inria
Jakub Szefer, Yale University
Wen Wang, Yale University

+42 -0 crypto_kem/mceliece348864/avx/Makefile

@@ -0,0 +1,42 @@
# This Makefile can be used with GNU Make or BSD Make

LIB = libmceliece348864_avx.a

SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \
fft_tr.c gf.c int32_sort.c operations.c pk_gen.c sk_gen.c transpose.c \
util.c uint32_sort.c vec.c vec128.c vec256.c \
consts.S syndrome_asm.S transpose_64x256_sp_asm.S \
transpose_64x64_asm.S update_asm.S vec128_mul_asm.S vec256_mul_asm.S \
vec_mul_asm.S vec_mul_sp_asm.S vec_reduce_asm.S

HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \
decrypt.h encrypt.h fft.h fft_tr.h gf.h int32_sort.h \
params.h pk_gen.h sk_gen.h transpose.h uint32_sort.h util.h \
vec128.h vec256.h vec.h \
consts.inc powers.inc scalars_2x.inc scalars.inc

OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \
fft_tr.o gf.o int32_sort.o operations.o pk_gen.o sk_gen.o transpose.o \
util.o uint32_sort.o vec.o vec128.o vec256.o \
consts.o syndrome_asm.o transpose_64x256_sp_asm.o \
transpose_64x64_asm.o update_asm.o vec128_mul_asm.o vec256_mul_asm.o \
vec_mul_asm.o vec_mul_sp_asm.o vec_reduce_asm.o

CFLAGS = -O3 -std=c99 -mpopcnt -mavx2 -Wall -Wextra -pedantic -Werror -Wpedantic \
-Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \
-I../../../common/ $(EXTRAFLAGS)

all: $(LIB)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.S
$(CC) -c -o $@ $<

$(LIB): $(OBJECTS)
$(AR) -r $@ $(OBJECTS)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)

+13 -0 crypto_kem/mceliece348864/avx/aes256ctr.c

@@ -0,0 +1,13 @@
#include "aes256ctr.h"

void PQCLEAN_MCELIECE348864_AVX_aes256ctr(
uint8_t *out,
size_t outlen,
const uint8_t nonce[AESCTR_NONCEBYTES],
const uint8_t key[AES256_KEYBYTES]) {

aes256ctx state;
aes256_keyexp(&state, key);
aes256_ctr(out, outlen, nonce, &state);
aes256_ctx_release(&state);
}
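
This wrapper simply forwards to the shared AES code that aes.h pulls in from common/. A rough usage sketch, not part of the commit: the all-zero key and nonce are placeholders, and the AES256_KEYBYTES / AESCTR_NONCEBYTES macros are assumed to come from the included aes.h, as the header below suggests.

#include <stddef.h>
#include <stdint.h>
#include "aes256ctr.h"

/* Fill buf with AES-256-CTR keystream using the wrapper above.
   Key and nonce are all-zero placeholders for illustration only. */
static void keystream_example(uint8_t *buf, size_t buflen) {
    const uint8_t key[AES256_KEYBYTES] = {0};
    const uint8_t nonce[AESCTR_NONCEBYTES] = {0};

    PQCLEAN_MCELIECE348864_AVX_aes256ctr(buf, buflen, nonce, key);
}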

+17 -0 crypto_kem/mceliece348864/avx/aes256ctr.h

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_AES256CTR_H
#define PQCLEAN_MCELIECE348864_AVX_AES256CTR_H

#include <stddef.h>
#include <stdint.h>

#include "aes.h"


void PQCLEAN_MCELIECE348864_AVX_aes256ctr(
uint8_t *out,
size_t outlen,
const uint8_t nonce[AESCTR_NONCEBYTES],
const uint8_t key[AES256_KEYBYTES]
);

#endif

+32 -0 crypto_kem/mceliece348864/avx/api.h

@@ -0,0 +1,32 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_API_H
#define PQCLEAN_MCELIECE348864_AVX_API_H

#include <stdint.h>

#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_ALGNAME "Classic McEliece 348864"
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_PUBLICKEYBYTES 261120
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_SECRETKEYBYTES 6452
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_CIPHERTEXTBYTES 128
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_BYTES 32


int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(
uint8_t *c,
uint8_t *key,
const uint8_t *pk
);

int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(
uint8_t *key,
const uint8_t *c,
const uint8_t *sk
);

int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair
(
uint8_t *pk,
uint8_t *sk
);

#endif
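
These three functions are the entire public API of the AVX implementation. A minimal round-trip sketch using only the macros and prototypes declared above (illustrative only; the real caller is PQClean's test harness, and error handling is reduced to return codes):

#include <stdint.h>
#include <string.h>
#include "api.h"

/* keypair -> enc -> dec; returns 0 iff both sides derive the same
   32-byte shared secret. pk/sk are static because the public key is
   261120 bytes and would not fit comfortably on the stack. */
static int kem_roundtrip(void) {
    static uint8_t pk[PQCLEAN_MCELIECE348864_AVX_CRYPTO_PUBLICKEYBYTES];
    static uint8_t sk[PQCLEAN_MCELIECE348864_AVX_CRYPTO_SECRETKEYBYTES];
    uint8_t c[PQCLEAN_MCELIECE348864_AVX_CRYPTO_CIPHERTEXTBYTES];
    uint8_t key_a[PQCLEAN_MCELIECE348864_AVX_CRYPTO_BYTES];
    uint8_t key_b[PQCLEAN_MCELIECE348864_AVX_CRYPTO_BYTES];

    if (PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair(pk, sk) != 0) {
        return -1;
    }
    if (PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(c, key_a, pk) != 0) {
        return -1;
    }
    if (PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(key_b, c, sk) != 0) {
        return -1;
    }
    return memcmp(key_a, key_b, sizeof key_a);
}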


+287 -0 crypto_kem/mceliece348864/avx/benes.c

@@ -0,0 +1,287 @@
/*
This file is for Benes network related functions
*/
#include "benes.h"

#include "params.h"
#include "transpose.h"
#include "util.h"

static void layer_0(uint64_t *bs, const uint64_t *cond) {
int x;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 2) {
diff = bs[ x ] ^ bs[ x + 1 ];
diff &= *cond++;
bs[ x ] ^= diff;
bs[ x + 1 ] ^= diff;
}
}

static void layer_1(uint64_t *bs, const uint64_t *cond) {
int x;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 4) {
diff = bs[ x + 0 ] ^ bs[ x + 2 ];
diff &= cond[0];
bs[ x + 0 ] ^= diff;
bs[ x + 2 ] ^= diff;

diff = bs[ x + 1 ] ^ bs[ x + 3 ];
diff &= cond[1];
bs[ x + 1 ] ^= diff;
bs[ x + 3 ] ^= diff;

cond += 2;
}
}

static void layer_2(uint64_t *bs, const uint64_t *cond) {
int x;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 8) {
diff = bs[ x + 0 ] ^ bs[ x + 4 ];
diff &= cond[0];
bs[ x + 0 ] ^= diff;
bs[ x + 4 ] ^= diff;

diff = bs[ x + 1 ] ^ bs[ x + 5 ];
diff &= cond[1];
bs[ x + 1 ] ^= diff;
bs[ x + 5 ] ^= diff;

diff = bs[ x + 2 ] ^ bs[ x + 6 ];
diff &= cond[2];
bs[ x + 2 ] ^= diff;
bs[ x + 6 ] ^= diff;

diff = bs[ x + 3 ] ^ bs[ x + 7 ];
diff &= cond[3];
bs[ x + 3 ] ^= diff;
bs[ x + 7 ] ^= diff;

cond += 4;
}
}

static void layer_3(uint64_t *bs, const uint64_t *cond) {
int x, s;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 16) {
for (s = x; s < x + 8; s += 4) {
diff = bs[ s + 0 ] ^ bs[ s + 8 ];
diff &= cond[0];
bs[ s + 0 ] ^= diff;
bs[ s + 8 ] ^= diff;

diff = bs[ s + 1 ] ^ bs[ s + 9 ];
diff &= cond[1];
bs[ s + 1 ] ^= diff;
bs[ s + 9 ] ^= diff;

diff = bs[ s + 2 ] ^ bs[ s + 10 ];
diff &= cond[2];
bs[ s + 2 ] ^= diff;
bs[ s + 10 ] ^= diff;

diff = bs[ s + 3 ] ^ bs[ s + 11 ];
diff &= cond[3];
bs[ s + 3 ] ^= diff;
bs[ s + 11 ] ^= diff;

cond += 4;
}
}
}

static void layer_4(uint64_t *bs, const uint64_t *cond) {
int x, s;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 32) {
for (s = x; s < x + 16; s += 4) {
diff = bs[ s + 0 ] ^ bs[ s + 16 ];
diff &= cond[0];
bs[ s + 0 ] ^= diff;
bs[ s + 16 ] ^= diff;

diff = bs[ s + 1 ] ^ bs[ s + 17 ];
diff &= cond[1];
bs[ s + 1 ] ^= diff;
bs[ s + 17 ] ^= diff;

diff = bs[ s + 2 ] ^ bs[ s + 18 ];
diff &= cond[2];
bs[ s + 2 ] ^= diff;
bs[ s + 18 ] ^= diff;

diff = bs[ s + 3 ] ^ bs[ s + 19 ];
diff &= cond[3];
bs[ s + 3 ] ^= diff;
bs[ s + 19 ] ^= diff;

cond += 4;
}
}
}

static void layer_5(uint64_t *bs, const uint64_t *cond) {
int x, s;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 64) {
for (s = x; s < x + 32; s += 4) {
diff = bs[ s + 0 ] ^ bs[ s + 32 ];
diff &= cond[0];
bs[ s + 0 ] ^= diff;
bs[ s + 32 ] ^= diff;

diff = bs[ s + 1 ] ^ bs[ s + 33 ];
diff &= cond[1];
bs[ s + 1 ] ^= diff;
bs[ s + 33 ] ^= diff;

diff = bs[ s + 2 ] ^ bs[ s + 34 ];
diff &= cond[2];
bs[ s + 2 ] ^= diff;
bs[ s + 34 ] ^= diff;

diff = bs[ s + 3 ] ^ bs[ s + 35 ];
diff &= cond[3];
bs[ s + 3 ] ^= diff;
bs[ s + 35 ] ^= diff;

cond += 4;
}
}
}

/* input: bits, control bits as array of bytes */
/* output: out, control bits as array of 64-bit words */
void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t out[][32], const unsigned char *bits) {
int i, low, block = 0;

uint64_t cond[64];

//

for (low = 0; low <= 5; low++) {
for (i = 0; i < 64; i++) {
cond[i] = PQCLEAN_MCELIECE348864_AVX_load4(bits + block * 256 + i * 4);
}
PQCLEAN_MCELIECE348864_AVX_transpose_64x64(cond);

for (i = 0; i < 32; i++) {
out[ block ][i] = cond[i];
}
block++;
}

for (low = 0; low <= 5; low++) {
for (i = 0; i < 32; i++) {
out[ block ][i] = PQCLEAN_MCELIECE348864_AVX_load8(bits + block * 256 + i * 8);
}
block++;
}

for (low = 4; low >= 0; low--) {
for (i = 0; i < 32; i++) {
out[ block ][i] = PQCLEAN_MCELIECE348864_AVX_load8(bits + block * 256 + i * 8);
}
block++;
}

for (low = 5; low >= 0; low--) {
for (i = 0; i < 64; i++) {
cond[i] = PQCLEAN_MCELIECE348864_AVX_load4(bits + block * 256 + i * 4);
}
PQCLEAN_MCELIECE348864_AVX_transpose_64x64(cond);

for (i = 0; i < 32; i++) {
out[ block ][i] = cond[i];
}
block++;
}
}

/* input: r, sequence of bits to be permuted */
/* cond, control bits as array of 64-bit words */
/* rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev) {
int block, inc;

uint64_t *bs = r;

//

if (rev == 0) {
block = 0;
inc = 1;
} else {
block = 22;
inc = -1;
}

PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs);

layer_0(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_5(bs, cond[ block ]);
block += inc;

PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs);

layer_0(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_5(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_0(bs, cond[ block ]);
block += inc;

PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs);

layer_5(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_0(bs, cond[ block ]);
//block += inc;

PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs);
}
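
All of the layer_* routines above are instances of one constant-time idiom: the XOR difference of a pair of words is masked by the condition bits and folded back into both words, so a swap happens exactly where the control bits are set and no branch depends on secret data. A standalone scalar sketch of that step (illustrative, not part of the commit):

#include <stdint.h>

/* Swap the bits of *a and *b wherever cond has a 1 bit; leave them
   unchanged where cond has a 0 bit. No secret-dependent branches. */
static void masked_bit_swap(uint64_t *a, uint64_t *b, uint64_t cond) {
    uint64_t diff = (*a ^ *b) & cond;
    *a ^= diff;
    *b ^= diff;
}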


+15 -0 crypto_kem/mceliece348864/avx/benes.h

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_BENES_H
#define PQCLEAN_MCELIECE348864_AVX_BENES_H
/*
This file is for Benes network related functions
*/


#include "gf.h"
#include "vec128.h"

void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/);
void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/);

#endif


+219 -0 crypto_kem/mceliece348864/avx/bm.c

@@ -0,0 +1,219 @@
/*
This file is for the inversion-free Berlekamp-Massey algorithm
see https://ieeexplore.ieee.org/document/87857
*/

#include "bm.h"

#include "gf.h"
#include "util.h"
#include "vec.h"
#include "vec128.h"

#include <stdint.h>

extern void PQCLEAN_MCELIECE348864_AVX_update_asm(void *, gf, int);
extern gf PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(uint64_t *);

static inline uint64_t mask_nonzero(gf a) {
uint64_t ret = a;

ret -= 1;
ret >>= 63;
ret -= 1;

return ret;
}

static inline uint64_t mask_leq(uint16_t a, uint16_t b) {
uint64_t a_tmp = a;
uint64_t b_tmp = b;
uint64_t ret = b_tmp - a_tmp;

ret >>= 63;
ret -= 1;

return ret;
}

static inline void vec_cmov(uint64_t out[][2], uint64_t mask) {
int i;

for (i = 0; i < GFBITS; i++) {
out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask);
}
}

static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) {
int s = 1 << b;

vec128 x, y;

x = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[0]),
PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[0]), s));

y = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[1]), s),
PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[1]));

in[idx0] = x;
in[idx1] = y;
}

/* input: in, field elements in bitsliced form */
/* output: out, field elements in non-bitsliced form */
static inline void get_coefs(gf *out, vec128 *in) {
int i, k;

vec128 mask[4][2];
vec128 buf[16];

for (i = 0; i < GFBITS; i++) {
buf[i] = in[i];
}
for (i = GFBITS; i < 16; i++) {
buf[i] = PQCLEAN_MCELIECE348864_AVX_vec128_setzero();
}

mask[0][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x5555);
mask[0][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xAAAA);
mask[1][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x3333);
mask[1][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xCCCC);
mask[2][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x0F0F);
mask[2][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xF0F0);
mask[3][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x00FF);
mask[3][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xFF00);

interleave(buf, 0, 8, mask[3], 3);
interleave(buf, 1, 9, mask[3], 3);
interleave(buf, 2, 10, mask[3], 3);
interleave(buf, 3, 11, mask[3], 3);
interleave(buf, 4, 12, mask[3], 3);
interleave(buf, 5, 13, mask[3], 3);
interleave(buf, 6, 14, mask[3], 3);
interleave(buf, 7, 15, mask[3], 3);

interleave(buf, 0, 4, mask[2], 2);
interleave(buf, 1, 5, mask[2], 2);
interleave(buf, 2, 6, mask[2], 2);
interleave(buf, 3, 7, mask[2], 2);
interleave(buf, 8, 12, mask[2], 2);
interleave(buf, 9, 13, mask[2], 2);
interleave(buf, 10, 14, mask[2], 2);
interleave(buf, 11, 15, mask[2], 2);

interleave(buf, 0, 2, mask[1], 1);
interleave(buf, 1, 3, mask[1], 1);
interleave(buf, 4, 6, mask[1], 1);
interleave(buf, 5, 7, mask[1], 1);
interleave(buf, 8, 10, mask[1], 1);
interleave(buf, 9, 11, mask[1], 1);
interleave(buf, 12, 14, mask[1], 1);
interleave(buf, 13, 15, mask[1], 1);

interleave(buf, 0, 1, mask[0], 0);
interleave(buf, 2, 3, mask[0], 0);
interleave(buf, 4, 5, mask[0], 0);
interleave(buf, 6, 7, mask[0], 0);
interleave(buf, 8, 9, mask[0], 0);
interleave(buf, 10, 11, mask[0], 0);
interleave(buf, 12, 13, mask[0], 0);
interleave(buf, 14, 15, mask[0], 0);

for (i = 0; i < 16; i++) {
for (k = 0; k < 4; k++) {
out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK;
out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK;
}
}
}

/* input: in, field elements in bitsliced form */
/* output: out, field elements in non-bitsliced form */
void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) {
uint16_t i;
uint16_t N, L;

uint64_t prod[ GFBITS ];
uint64_t in_tmp[ GFBITS ];

uint64_t db[ GFBITS ][ 2 ];
uint64_t BC_tmp[ GFBITS ][ 2 ];
uint64_t BC[ GFBITS ][ 2 ];

uint64_t mask, t;

gf d, b, c0 = 1;

gf coefs[SYS_T * 2];

// init

BC[0][1] = 0;
BC[0][0] = 1;
BC[0][0] <<= 63;

for (i = 1; i < GFBITS; i++) {
BC[i][0] = BC[i][1] = 0;
}

b = 1;
L = 0;

//

get_coefs(coefs, in);

for (i = 0; i < GFBITS; i++) {
in_tmp[i] = 0;
}

for (N = 0; N < SYS_T * 2; N++) {
// computing d

PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]);

PQCLEAN_MCELIECE348864_AVX_update_asm(in_tmp, coefs[N], 8);

d = PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(prod);

t = PQCLEAN_MCELIECE348864_AVX_gf_mul2(c0, coefs[N], b);

d ^= t & 0xFFFFFFFF;

// 3 cases

mask = mask_nonzero(d) & mask_leq(L * 2, N);

for (i = 0; i < GFBITS; i++) {
db[i][0] = (d >> i) & 1;
db[i][0] = -db[i][0];
db[i][1] = (b >> i) & 1;
db[i][1] = -db[i][1];
}

PQCLEAN_MCELIECE348864_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC);

vec_cmov(BC, mask);

PQCLEAN_MCELIECE348864_AVX_update_asm(BC, mask & c0, 16);

for (i = 0; i < GFBITS; i++) {
BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1];
}

c0 = t >> 32;
b = (d & mask) | (b & ~mask);
L = ((N + 1 - L) & mask) | (L & ~mask);

}

c0 = PQCLEAN_MCELIECE348864_AVX_gf_inv(c0);

for (i = 0; i < GFBITS; i++) {
out[i] = (c0 >> i) & 1;
out[i] = -out[i];
}

PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(out, out, &BC[0][0]);
}
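
mask_nonzero and mask_leq above turn comparisons into all-zero or all-one 64-bit masks, and the main loop then uses those masks to pick between the Berlekamp-Massey cases without branching (see vec_cmov and the updates of b and L). A tiny sketch of the selection idiom those masks enable (illustrative only):

#include <stdint.h>

/* Return x where mask is all-ones and y where mask is all-zeros.
   With mask = mask_nonzero(d) & mask_leq(2 * L, N) this is the same
   pattern used above in b = (d & mask) | (b & ~mask). */
static uint64_t ct_select(uint64_t x, uint64_t y, uint64_t mask) {
    return (x & mask) | (y & ~mask);
}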


+14 -0 crypto_kem/mceliece348864/avx/bm.h

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_BM_H
#define PQCLEAN_MCELIECE348864_AVX_BM_H
/*
This file is for the inversion-free Berlekamp-Massey algorithm
see https://ieeexplore.ieee.org/document/87857
*/


#include "vec128.h"

void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t * /*out*/, vec128 * /*in*/);

#endif


+33 -0 crypto_kem/mceliece348864/avx/consts.S

@@ -0,0 +1,33 @@
.data

# not supported on MacOS
#.section .rodata

.globl PQCLEAN_MCELIECE348864_AVX_MASK0_0
.globl PQCLEAN_MCELIECE348864_AVX_MASK0_1
.globl PQCLEAN_MCELIECE348864_AVX_MASK1_0
.globl PQCLEAN_MCELIECE348864_AVX_MASK1_1
.globl PQCLEAN_MCELIECE348864_AVX_MASK2_0
.globl PQCLEAN_MCELIECE348864_AVX_MASK2_1
.globl PQCLEAN_MCELIECE348864_AVX_MASK3_0
.globl PQCLEAN_MCELIECE348864_AVX_MASK3_1
.globl PQCLEAN_MCELIECE348864_AVX_MASK4_0
.globl PQCLEAN_MCELIECE348864_AVX_MASK4_1
.globl PQCLEAN_MCELIECE348864_AVX_MASK5_0
.globl PQCLEAN_MCELIECE348864_AVX_MASK5_1

.p2align 5

PQCLEAN_MCELIECE348864_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555
PQCLEAN_MCELIECE348864_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA
PQCLEAN_MCELIECE348864_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333
PQCLEAN_MCELIECE348864_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC
PQCLEAN_MCELIECE348864_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
PQCLEAN_MCELIECE348864_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0
PQCLEAN_MCELIECE348864_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF
PQCLEAN_MCELIECE348864_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00
PQCLEAN_MCELIECE348864_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF
PQCLEAN_MCELIECE348864_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000
PQCLEAN_MCELIECE348864_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF
PQCLEAN_MCELIECE348864_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000
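
Each MASKk_0 / MASKk_1 pair selects the low and high halves of bit groups of width 2^(k+1). A scalar 64-bit sketch of the masked shift-and-swap step such masks support, under the assumption that the transpose assembly follows the standard bit-matrix-transpose idiom (the real code works on 256-bit registers):

#include <stdint.h>

/* One transpose butterfly with shift s and low-half mask m
   (e.g. s = 1, m = 0x5555555555555555 matches MASK0_0 above):
   exchanges the high bits of x's groups with the low bits of y's. */
static void delta_swap_pair(uint64_t *x, uint64_t *y, int s, uint64_t m) {
    uint64_t t = ((*x >> s) ^ *y) & m;
    *y ^= t;
    *x ^= t << s;
}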


+238 -0 crypto_kem/mceliece348864/avx/consts.inc

@@ -0,0 +1,238 @@
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF, 0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55, 0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00, 0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3, 0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F, 0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333, 0x3333CCCCCCCC3333, 0xCCCC33333333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0, 0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},

+274 -0 crypto_kem/mceliece348864/avx/controlbits.c

@@ -0,0 +1,274 @@
/*
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
*/

#include "controlbits.h"

#include "params.h"

#include <stdint.h>

typedef uint8_t bit;

#define N (1 << GFBITS)

static bit is_smaller(uint32_t a, uint32_t b) {
uint32_t ret = 0;

ret = a - b;
ret >>= 31;

return (bit)ret;
}

static bit is_smaller_63b(uint64_t a, uint64_t b) {
uint64_t ret = 0;

ret = a - b;
ret >>= 63;

return (bit)ret;
}

static void cswap(uint32_t *x, uint32_t *y, bit swap) {
uint32_t m;
uint32_t d;

m = swap;
m = 0 - m;

d = (*x ^ *y);
d &= m;
*x ^= d;
*y ^= d;
}

static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) {
uint64_t m;
uint64_t d;

m = swap;
m = 0 - m;

d = (*x ^ *y);
d &= m;
*x ^= d;
*y ^= d;
}

/* output x = min(input x,input y) */
/* output y = max(input x,input y) */

static void minmax(uint32_t *x, uint32_t *y) {
bit m;

m = is_smaller(*y, *x);
cswap(x, y, m);
}

static void minmax_63b(uint64_t *x, uint64_t *y) {
bit m;

m = is_smaller_63b(*y, *x);
cswap_63b(x, y, m);
}

/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */
/* requires n to be a power of 2 */

static void merge(int n, uint32_t *x, int step) {
int i;
if (n == 1) {
minmax(&x[0], &x[step]);
} else {
merge(n / 2, x, step * 2);
merge(n / 2, x + step, step * 2);
for (i = 1; i < 2 * n - 1; i += 2) {
minmax(&x[i * step], &x[(i + 1) * step]);
}
}
}

static void merge_63b(int n, uint64_t *x, int step) {
int i;
if (n == 1) {
minmax_63b(&x[0], &x[step]);
} else {
merge_63b(n / 2, x, step * 2);
merge_63b(n / 2, x + step, step * 2);
for (i = 1; i < 2 * n - 1; i += 2) {
minmax_63b(&x[i * step], &x[(i + 1) * step]);
}
}
}

/* sort x[0],x[1],...,x[n-1] in place */
/* requires n to be a power of 2 */

static void sort(int n, uint32_t *x) {
if (n <= 1) {
return;
}
sort(n / 2, x);
sort(n / 2, x + n / 2);
merge(n / 2, x, 1);
}

void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x) {
if (n <= 1) {
return;
}
PQCLEAN_MCELIECE348864_AVX_sort_63b(n / 2, x);
PQCLEAN_MCELIECE348864_AVX_sort_63b(n / 2, x + n / 2);
merge_63b(n / 2, x, 1);
}

/* y[pi[i]] = x[i] */
/* requires n = 2^w */
/* requires pi to be a permutation */
static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC
int i;
uint32_t t[2 * N];

for (i = 0; i < n; ++i) {
t[i] = x[i] | (pi[i] << 16);
}

sort(n, t);

for (i = 0; i < n; ++i) {
y[i] = t[i] & 0xFFFF;
}
}

/* ip[i] = j iff pi[i] = j */
/* requires n = 2^w */
/* requires pi to be a permutation */
static void invert(int n, uint32_t *ip, const uint32_t *pi) {
int i;

for (i = 0; i < n; i++) {
ip[i] = i;
}

composeinv(n, ip, ip, pi);
}


static void flow(int w, uint32_t *x, const uint32_t *y, int t) {
bit m0;
bit m1;

uint32_t b;
uint32_t y_copy = *y;

m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1));
m1 = is_smaller(0, t);

cswap(x, &y_copy, m0);
b = m0 & m1;
*x ^= b << w;
}

/* input: permutation pi */
/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */
/* requires n = 2^w */
static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) {
int i;
int j;
int k;
int t;
uint32_t ip[N] = {0};
uint32_t I[2 * N] = {0};
uint32_t P[2 * N] = {0};
uint32_t PI[2 * N] = {0};
uint32_t T[2 * N] = {0};
uint32_t piflip[N] = {0};
uint32_t subpi[2][N / 2] = {{0}};

if (w == 1) {
c[ off / 8 ] |= (pi[0] & 1) << (off % 8);
}
if (w <= 1) {
return;
}

invert(n, ip, pi);

for (i = 0; i < n; ++i) {
I[i] = ip[i] | (1 << w);
I[n + i] = pi[i];
}

for (i = 0; i < 2 * n; ++i) {
P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w);
}

for (t = 0; t < w; ++t) {
composeinv(2 * n, PI, P, I);

for (i = 0; i < 2 * n; ++i) {
flow(w, &P[i], &PI[i], t);
}

for (i = 0; i < 2 * n; ++i) {
T[i] = I[i ^ 1];
}

composeinv(2 * n, I, I, T);

for (i = 0; i < 2 * n; ++i) {
T[i] = P[i ^ 1];
}

for (i = 0; i < 2 * n; ++i) {
flow(w, &P[i], &T[i], 1);
}
}

for (i = 0; i < n; ++i) {
for (j = 0; j < w; ++j) {
piflip[i] = pi[i];
}
}

for (i = 0; i < n / 2; ++i) {
c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8);
}
for (i = 0; i < n / 2; ++i) {
c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8);
}

for (i = 0; i < n / 2; ++i) {
cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1);
}

for (k = 0; k < 2; ++k) {
for (i = 0; i < n / 2; ++i) {
subpi[k][i] = piflip[i * 2 + k] >> 1;
}
}

for (k = 0; k < 2; ++k) {
controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]);
}
}

/* input: pi, a permutation*/
/* output: out, control bits w.r.t. pi */
void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi) {
unsigned int i;
unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ];

for (i = 0; i < sizeof(c); i++) {
c[i] = 0;
}

controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi);

for (i = 0; i < sizeof(c); i++) {
out[i] = c[i];
}
}
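
composeinv above applies a permutation by packing each value together with its 16-bit sort key, sorting the packed words, and masking the key off again; the sort-based approach avoids indexing memory with secret values. A tiny standalone illustration of the pack/sort/unpack idea (hypothetical 4-element helper; it uses a plain insertion sort for brevity, whereas the code above uses the constant-time merge sort):

#include <stdint.h>

/* y[pi[i]] = x[i] for n = 4, the same idea as composeinv() above.
   Values live in the low 16 bits, the sort key pi[i] in the high bits. */
static void apply_perm4(uint32_t y[4], const uint32_t x[4], const uint32_t pi[4]) {
    uint32_t t[4];
    int i, j;

    for (i = 0; i < 4; i++) {
        t[i] = x[i] | (pi[i] << 16);
    }
    for (i = 1; i < 4; i++) {            /* sort by the packed key */
        uint32_t v = t[i];
        for (j = i - 1; j >= 0 && t[j] > v; j--) {
            t[j + 1] = t[j];
        }
        t[j + 1] = v;
    }
    for (i = 0; i < 4; i++) {
        y[i] = t[i] & 0xFFFF;            /* drop the key, keep the value */
    }
}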


+15 -0 crypto_kem/mceliece348864/avx/controlbits.h

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H
#define PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H
/*
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
*/


#include <stdint.h>

void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x);
void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi);

#endif


+ 7
- 0
crypto_kem/mceliece348864/avx/crypto_hash.h

@@ -0,0 +1,7 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H
#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H
#include "fips202.h"

#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen)

#endif
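/*
  Illustrative sketch: the KEM's only hash is SHAKE256 with a 32-byte output,
  wrapped by the macro above (fips202.h is taken from the PQClean common
  directory, which provides shake256 and the fixed-width types).
*/
static void example_hash_32b(uint8_t *out32, const uint8_t *in, size_t inlen) {
    crypto_hash_32b(out32, in, inlen); /* expands to shake256(out32, 32, in, inlen) */
}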

+ 234
- 0
crypto_kem/mceliece348864/avx/decrypt.c

@@ -0,0 +1,234 @@
/*
This file is for Niederreiter decryption
*/

#include "decrypt.h"

#include "benes.h"
#include "bm.h"
#include "fft.h"
#include "fft_tr.h"
#include "params.h"
#include "util.h"

#include <stdio.h>

static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) {
int i, j;

uint64_t sk_int[ GFBITS ];
vec256 eval[16][ GFBITS ];
vec256 tmp[ GFBITS ];

// computing inverses

PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk);

PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int);

for (i = 0; i < 16; i++) {
PQCLEAN_MCELIECE348864_AVX_vec256_sq(eval[i], eval[i]);
}

PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], eval[0]);

for (i = 1; i < 16; i++) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]);
}

PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, inv[15]);

for (i = 14; i >= 0; i--) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[i + 1], tmp, inv[i]);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]);
}

PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], tmp);

//

for (i = 0; i < 16; i++) {
for (j = 0; j < GFBITS; j++) {
out[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[i][j], recv[i]);
}
}
}

static void preprocess(vec128 *recv, const unsigned char *s) {
int i;
unsigned char r[ 512 ];

for (i = 0; i < SYND_BYTES; i++) {
r[i] = s[i];
}

for (i = SYND_BYTES; i < 512; i++) {
r[i] = 0;
}

for (i = 0; i < 32; i++) {
recv[i] = PQCLEAN_MCELIECE348864_AVX_load16(r + i * 16);
}
}

static void postprocess(unsigned char *e, vec128 *err) {
int i;
unsigned char error8[ (1 << GFBITS) / 8 ];
uint64_t v[2];

for (i = 0; i < 32; i++) {
v[0] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[i], 0);
v[1] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[i], 1);

PQCLEAN_MCELIECE348864_AVX_store8(error8 + i * 16 + 0, v[0]);
PQCLEAN_MCELIECE348864_AVX_store8(error8 + i * 16 + 8, v[1]);
}

for (i = 0; i < SYS_N / 8; i++) {
e[i] = error8[i];
}
}

static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) {
int i, j;

for (i = 0; i < 16; i++) {
for (j = 0; j < GFBITS; j++) {
out[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[i][j], recv[i]);
}
}
}

static uint16_t weight_check(unsigned char *e, vec128 *error) {
int i;
uint16_t w0 = 0;
uint16_t w1 = 0;
uint16_t check;

for (i = 0; i < 32; i++) {
w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 0) );
w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 1) );
}

for (i = 0; i < SYS_N / 8; i++) {
w1 += _mm_popcnt_u64( e[i] );
}

check = (w0 ^ SYS_T) | (w1 ^ SYS_T);
check -= 1;
check >>= 15;

return check;
}
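/*
  The branch-free check above in isolation (illustrative helper): returns 1
  iff both weights equal the target t.  It relies on all operands being well
  below 2^15, so the subtraction wraps only when (w0 ^ t) | (w1 ^ t) is 0.
*/
static uint16_t example_both_equal(uint16_t w0, uint16_t w1, uint16_t t) {
    uint16_t check = (w0 ^ t) | (w1 ^ t); /* 0 iff w0 == t and w1 == t */

    check -= 1;   /* wraps to 0xFFFF exactly when check was 0 */
    check >>= 15; /* keep only the wrap-around bit */

    return check; /* 1 on success, 0 otherwise */
}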

static uint16_t synd_cmp(vec128 *s0, vec128 *s1) {
int i;
vec128 diff;

diff = PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[0], s1[0]);

for (i = 1; i < GFBITS; i++) {
diff = PQCLEAN_MCELIECE348864_AVX_vec128_or(diff, PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[i], s1[i]));
}

return (uint16_t)PQCLEAN_MCELIECE348864_AVX_vec128_testz(diff);
}

static void reformat_128to256(vec256 *out, vec128 *in) {
int i;
uint64_t v[4];

for (i = 0; i < 16; i++) {
v[0] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 0], 0);
v[1] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 0], 1);
v[2] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 1], 0);
v[3] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 1], 1);

out[i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
}
}

static void reformat_256to128(vec128 *out, vec256 *in) {
int i;
uint64_t v[4];

for (i = 0; i < 16; i++) {
v[0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 0);
v[1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 1);
v[2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 2);
v[3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 3);

out[2 * i + 0] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v[0], v[1]);
out[2 * i + 1] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v[2], v[3]);
}
}

/* Niederreiter decryption with the Berlekamp decoder */
/* input: sk, secret key */
/* c, ciphertext (syndrome) */
/* output: e, error vector */
/* return: 0 for success; 1 for failure */
int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
int i;

uint16_t check_synd;
uint16_t check_weight;

vec256 inv[ 16 ][ GFBITS ];
vec256 scaled[ 16 ][ GFBITS ];
vec256 eval[16][ GFBITS ];

vec128 error128[ 32 ];
vec256 error256[ 16 ];

vec128 s_priv[ GFBITS ];
vec128 s_priv_cmp[ GFBITS ];
uint64_t locator[ GFBITS ];

vec128 recv128[ 32 ];
vec256 recv256[ 16 ];
vec256 allone;

uint64_t bits_int[23][32];

// Berlekamp decoder

preprocess(recv128, c);

PQCLEAN_MCELIECE348864_AVX_load_bits(bits_int, sk + IRR_BYTES);
PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) recv128, bits_int, 1);

reformat_128to256(recv256, recv128);

scaling(scaled, inv, sk, recv256);
PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv, scaled);
PQCLEAN_MCELIECE348864_AVX_bm(locator, s_priv);

PQCLEAN_MCELIECE348864_AVX_fft(eval, locator);

// reencryption and weight check

allone = PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(0xFFFF);

for (i = 0; i < 16; i++) {
error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(eval[i]);
error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(error256[i], allone);
}

scaling_inv(scaled, inv, error256);
PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv_cmp, scaled);

check_synd = synd_cmp(s_priv, s_priv_cmp);

//

reformat_256to128(error128, error256);
PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) error128, bits_int, 0);

postprocess(e, error128);

check_weight = weight_check(e, error128);

return 1 - (check_synd & check_weight);
}


+ 10
- 0
crypto_kem/mceliece348864/avx/decrypt.h

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_DECRYPT_H
#define PQCLEAN_MCELIECE348864_AVX_DECRYPT_H
/*
This file is for Niederreiter decryption
*/

int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);

#endif


+ 99
- 0
crypto_kem/mceliece348864/avx/encrypt.c

@@ -0,0 +1,99 @@
/*
This file is for Niederreiter encryption
*/

#include "encrypt.h"

#include "gf.h"
#include "int32_sort.h"
#include "params.h"
#include "randombytes.h"
#include "util.h"

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* input: public key pk, error vector e */
/* output: syndrome s */
extern void PQCLEAN_MCELIECE348864_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e);

/* output: e, an error vector of weight t */
static void gen_e(unsigned char *e) {
int i, j, eq, count;

uint16_t ind[ SYS_T * 2 ];
int32_t ind32[ SYS_T * 2 ];
uint64_t e_int[ (SYS_N + 63) / 64 ];
uint64_t one = 1;
uint64_t mask;
uint64_t val[ SYS_T ];

while (1) {
randombytes((uint8_t *) ind, sizeof(ind));

for (i = 0; i < SYS_T * 2; i++) {
ind[i] &= GFMASK;
}

//

count = 0;
for (i = 0; i < SYS_T * 2; i++) {
if (ind[i] < SYS_N) {
ind32[ count++ ] = ind[i];
}
}

if (count < SYS_T) {
continue;
}

// check for repetition

PQCLEAN_MCELIECE348864_AVX_int32_sort(ind32, SYS_T);

eq = 0;
for (i = 1; i < SYS_T; i++) {
if (ind32[i - 1] == ind32[i]) {
eq = 1;
}
}

if (eq == 0) {
break;
}
}

for (j = 0; j < SYS_T; j++) {
val[j] = one << (ind32[j] & 63);
}

for (i = 0; i < (SYS_N + 63) / 64; i++) {
e_int[i] = 0;

for (j = 0; j < SYS_T; j++) {
mask = i ^ (ind32[j] >> 6);
mask -= 1;
mask >>= 63;
mask = -mask;

e_int[i] |= val[j] & mask;
}
}

for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) {
PQCLEAN_MCELIECE348864_AVX_store8(e, e_int[i]);
e += 8;
}

for (j = 0; j < (SYS_N % 64); j += 8) {
e[ j / 8 ] = (e_int[i] >> j) & 0xFF;
}
}
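/*
  The word-selection mask used in the loop above, in isolation (illustrative):
  it becomes all ones exactly when the two indices are equal, with no
  data-dependent branch.  It assumes both indices are far below 2^63, as the
  block indices here are.
*/
static uint64_t example_select_mask(uint64_t i, uint64_t j) {
    uint64_t mask = i ^ j; /* 0 iff i == j */

    mask -= 1;    /* wraps to all ones exactly when i == j */
    mask >>= 63;  /* 1 iff i == j, 0 otherwise */
    mask = -mask; /* stretch that bit into a full-width mask */

    return mask;
}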

void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) {
gen_e(e);
PQCLEAN_MCELIECE348864_AVX_syndrome_asm(s, pk, e);
}


+ 11
- 0
crypto_kem/mceliece348864/avx/encrypt.h

@@ -0,0 +1,11 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H
#define PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H
/*
This file is for Niederreiter encryption
*/


void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/);

#endif


+ 172
- 0
crypto_kem/mceliece348864/avx/fft.c

@@ -0,0 +1,172 @@
/*
This file is for the Gao-Mateer FFT
see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
*/

#include "fft.h"

#include "vec.h"

/* input: in, polynomial in bitsliced form */
/* output: in, result of applying the radix conversions on in */
static void radix_conversions(uint64_t *in) {
int i, j, k;

const uint64_t mask[5][2] = {
{0x8888888888888888, 0x4444444444444444},
{0xC0C0C0C0C0C0C0C0, 0x3030303030303030},
{0xF000F000F000F000, 0x0F000F000F000F00},
{0xFF000000FF000000, 0x00FF000000FF0000},
{0xFFFF000000000000, 0x0000FFFF00000000}
};

const uint64_t s[5][GFBITS] = {
#include "scalars.inc"
};

//

for (j = 0; j <= 4; j++) {
for (i = 0; i < GFBITS; i++) {
for (k = 4; k >= j; k--) {
in[i] ^= (in[i] & mask[k][0]) >> (1 << k);
in[i] ^= (in[i] & mask[k][1]) >> (1 << k);
}
}

PQCLEAN_MCELIECE348864_AVX_vec_mul(in, in, s[j]); // scaling
}
}

/* input: in, result of applying the radix conversions to the input polynomial */
/* output: out, evaluation results (by applying the FFT butterflies) */
static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) {
int i, j, k, s, b;

uint64_t t0, t1, t2, t3;

const vec256 consts[ 17 ][ GFBITS ] = {
#include "consts.inc"
};

uint64_t consts_ptr = 0;

const unsigned char reversal[64] = {
0, 32, 16, 48, 8, 40, 24, 56,
4, 36, 20, 52, 12, 44, 28, 60,
2, 34, 18, 50, 10, 42, 26, 58,
6, 38, 22, 54, 14, 46, 30, 62,
1, 33, 17, 49, 9, 41, 25, 57,
5, 37, 21, 53, 13, 45, 29, 61,
3, 35, 19, 51, 11, 43, 27, 59,
7, 39, 23, 55, 15, 47, 31, 63
};

// broadcast

vec256 tmp256[ GFBITS ];
vec256 x[ GFBITS ], y[ GFBITS ];

for (j = 0; j < 64; j += 8) {
for (i = 0; i < GFBITS; i++) {
t0 = (in[i] >> reversal[j + 0]) & 1;
t0 = -t0;
t1 = (in[i] >> reversal[j + 2]) & 1;
t1 = -t1;
t2 = (in[i] >> reversal[j + 4]) & 1;
t2 = -t2;
t3 = (in[i] >> reversal[j + 6]) & 1;
t3 = -t3;

out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3);

t0 = (in[i] >> reversal[j + 1]) & 1;
t0 = -t0;
t1 = (in[i] >> reversal[j + 3]) & 1;
t1 = -t1;
t2 = (in[i] >> reversal[j + 5]) & 1;
t2 = -t2;
t3 = (in[i] >> reversal[j + 7]) & 1;
t3 = -t3;

out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3);
}
}

//

for (i = 0; i < 16; i += 2) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]);

for (b = 0; b < GFBITS; b++) {
out[i + 0][b] ^= tmp256[b];
}
for (b = 0; b < GFBITS; b++) {
out[i + 1][b] ^= out[i + 0][b];
}
}

for (i = 0; i < 16; i += 2) {
for (b = 0; b < GFBITS; b++) {
x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]);
}
for (b = 0; b < GFBITS; b++) {
y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]);
}

PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, y, consts[ 1 ]);

for (b = 0; b < GFBITS; b++) {
x[b] ^= tmp256[b];
}
for (b = 0; b < GFBITS; b++) {
y[b] ^= x[b];
}

for (b = 0; b < GFBITS; b++) {
out[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(x[b], y[b]);
}
for (b = 0; b < GFBITS; b++) {
out[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(x[b], y[b]);
}
}

consts_ptr = 2;

for (i = 0; i <= 3; i++) {
s = 1 << i;

for (j = 0; j < 16; j += 2 * s) {
for (k = j; k < j + s; k++) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]);

for (b = 0; b < GFBITS; b++) {
out[k][b] ^= tmp256[b];
}
for (b = 0; b < GFBITS; b++) {
out[k + s][b] ^= out[k][b];
}
}
}

consts_ptr += s;
}

// adding the part contributed by x^64

vec256 powers[16][GFBITS] = {
#include "powers.inc"
};

for (i = 0; i < 16; i++) {
for (b = 0; b < GFBITS; b++) {
out[i][b] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(out[i][b], powers[i][b]);
}
}
}

void PQCLEAN_MCELIECE348864_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) {
radix_conversions(in);
butterflies(out, in);
}
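/*
  The reversal[] table used in butterflies() (and again in fft_tr.c) is the
  bit-reversal permutation on 6-bit indices; an illustrative way to recompute
  one entry:
*/
static unsigned char example_bitrev6(unsigned char x) {
    unsigned char r = 0;
    int b;

    for (b = 0; b < 6; b++) {
        r = (unsigned char)((r << 1) | ((x >> b) & 1)); /* mirror the six low bits */
    }

    return r; /* e.g. example_bitrev6(1) == 32, matching reversal[1] */
}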


+ 18
- 0
crypto_kem/mceliece348864/avx/fft.h

@@ -0,0 +1,18 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_H
#define PQCLEAN_MCELIECE348864_AVX_FFT_H

/*
This file is for the Gao-Mateer FFT
see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
*/

#include <stdint.h>

#include "params.h"
#include "vec128.h"
#include "vec256.h"

void PQCLEAN_MCELIECE348864_AVX_fft(vec256 /*out*/[][GFBITS], uint64_t * /*in*/);

#endif


+ 355
- 0
crypto_kem/mceliece348864/avx/fft_tr.c

@@ -0,0 +1,355 @@
/*
This file is for the transpose of the Gao-Mateer FFT
Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c
*/

#include "fft_tr.h"

#include "transpose.h"
#include "vec.h"

#include <stdint.h>

static void radix_conversions_tr(vec128 in[ GFBITS ]) {
int i, j, k;

const vec128 mask[10] = {
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x2222222222222222, 0x2222222222222222),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000)
};

const vec128 s[5][GFBITS] = {
#include "scalars_2x.inc"
};

uint64_t v0, v1;

//

for (j = 5; j >= 0; j--) {

if (j < 5) {
PQCLEAN_MCELIECE348864_AVX_vec128_mul(in, in, s[j]);
}

for (i = 0; i < GFBITS; i++) {
for (k = j; k <= 4; k++) {
in[i] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k);
in[i] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k);
}
}

for (i = 0; i < GFBITS; i++) {
v0 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[i], 0);
v1 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[i], 1);

v1 ^= v0 >> 32;
v1 ^= v1 << 32;

in[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v0, v1);
}
}
}

static void butterflies_tr(vec128 out[ GFBITS ], vec256 in[][ GFBITS ]) {
int i, j, k, s, b;

uint64_t tmp[ GFBITS ];
uint64_t pre[6][ GFBITS ];
uint64_t out64[2][64];

vec256 p2[ 6 ];
vec256 buf[64];
vec256 x[ GFBITS ], y[ GFBITS ];
vec256 tmp256[ GFBITS ];

const vec256 consts[ 17 ][ GFBITS ] = {
#include "consts.inc"
};

uint64_t consts_ptr = 17;

const unsigned char reversal[64] = {
0, 32, 16, 48, 8, 40, 24, 56,
4, 36, 20, 52, 12, 44, 28, 60,
2, 34, 18, 50, 10, 42, 26, 58,
6, 38, 22, 54, 14, 46, 30, 62,
1, 33, 17, 49, 9, 41, 25, 57,
5, 37, 21, 53, 13, 45, 29, 61,
3, 35, 19, 51, 11, 43, 27, 59,
7, 39, 23, 55, 15, 47, 31, 63
};

const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154};

// butterflies

for (i = 3; i >= 0; i--) {
s = 1 << i;
consts_ptr -= s;

for (j = 0; j < 16; j += 2 * s) {
for (k = j; k < j + s; k++) {
for (b = 0; b < GFBITS; b++) {
in[k][b] ^= in[k + s][b];
}
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[k], consts[ consts_ptr + (k - j) ]);
for (b = 0; b < GFBITS; b++) {
in[k + s][b] ^= tmp256[b];
}
}
}
}

for (i = 0; i < 16; i += 2) {
for (b = 0; b < GFBITS; b++) {
x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(in[i + 0][b], in[i + 1][b]);
}
for (b = 0; b < GFBITS; b++) {
y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(in[i + 0][b], in[i + 1][b]);
}

for (b = 0; b < GFBITS; b++) {
x[b] ^= y[b];
}
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, x, consts[ 1 ]);
for (b = 0; b < GFBITS; b++) {
y[b] ^= tmp256[b];
}

for (b = 0; b < GFBITS; b++) {
in[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(x[b], y[b]);
}
for (b = 0; b < GFBITS; b++) {
in[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(x[b], y[b]);
}
}

for (i = 0; i < 16; i += 2) {
for (b = 0; b < GFBITS; b++) {
in[i + 0][b] ^= in[i + 1][b];
}
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[i + 0], consts[ 0 ]);
for (b = 0; b < GFBITS; b++) {
in[i + 1][b] ^= tmp256[b];
}
}

// transpose

for (i = 0; i < GFBITS; i += 4) {
for (j = 0; j < 64; j += 8) {
buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 0),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 0),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 0),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 0));
buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 0),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 0),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 0),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 0));
buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 1),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 1),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 1),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 1));
buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 1),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 1),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 1),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 1));
buf[ reversal[j + 4] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 2),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 2),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 2),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 2));
buf[ reversal[j + 5] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 2),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 2),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 2),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 2));
buf[ reversal[j + 6] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 3),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 3),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 3),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 3));
buf[ reversal[j + 7] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 3),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 3),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 3),
PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 3));
}

PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(buf);

p2[0] = buf[32];
buf[33] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[33], buf[32]);
p2[1] = buf[33];
buf[35] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[35], buf[33]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[35]);
buf[34] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[34], buf[35]);
p2[2] = buf[34];
buf[38] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[38], buf[34]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[38]);
buf[39] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[39], buf[38]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[39]);
buf[37] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[37], buf[39]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[37]);
buf[36] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[36], buf[37]);
p2[3] = buf[36];
buf[44] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[44], buf[36]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[44]);
buf[45] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[45], buf[44]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[45]);
buf[47] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[47], buf[45]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[47]);
buf[46] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[46], buf[47]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[46]);
buf[42] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[42], buf[46]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[42]);
buf[43] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[43], buf[42]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[43]);
buf[41] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[41], buf[43]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[41]);
buf[40] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[40], buf[41]);
p2[4] = buf[40];
buf[56] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[56], buf[40]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[56]);
buf[57] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[57], buf[56]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[57]);
buf[59] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[59], buf[57]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[59]);
buf[58] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[58], buf[59]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[58]);
buf[62] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[62], buf[58]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[62]);
buf[63] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[63], buf[62]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[63]);
buf[61] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[61], buf[63]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[61]);
buf[60] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[60], buf[61]);
p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[60]);
buf[52] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[52], buf[60]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[52]);
buf[53] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[53], buf[52]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[53]);
buf[55] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[55], buf[53]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[55]);
buf[54] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[54], buf[55]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[54]);
buf[50] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[50], buf[54]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[50]);
buf[51] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[51], buf[50]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[51]);
buf[49] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[49], buf[51]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[49]);
buf[48] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[48], buf[49]);
p2[5] = buf[48];
buf[16] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[16], buf[48]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[16]);
buf[17] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[17], buf[16]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[17]);
buf[19] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[19], buf[17]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[19]);
buf[18] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[18], buf[19]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[18]);
buf[22] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[22], buf[18]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[22]);
buf[23] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[23], buf[22]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[23]);
buf[21] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[21], buf[23]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[21]);
buf[20] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[20], buf[21]);
p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[20]);
buf[28] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[28], buf[20]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[28]);
buf[29] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[29], buf[28]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[29]);
buf[31] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[31], buf[29]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[31]);
buf[30] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[30], buf[31]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[30]);
buf[26] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[26], buf[30]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[26]);
buf[27] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[27], buf[26]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[27]);
buf[25] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[25], buf[27]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[25]);
buf[24] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[24], buf[25]);
p2[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[4], buf[24]);
buf[8] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[8], buf[24]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[8]);
buf[9] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[9], buf[8]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[9]);
buf[11] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[11], buf[9]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[11]);
buf[10] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[10], buf[11]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[10]);
buf[14] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[14], buf[10]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[14]);
buf[15] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[15], buf[14]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[15]);
buf[13] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[13], buf[15]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[13]);
buf[12] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[12], buf[13]);
p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[12]);
buf[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[4], buf[12]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[4]);
buf[5] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[5], buf[4]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[5]);
buf[7] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[7], buf[5]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[7]);
buf[6] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[6], buf[7]);
p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[6]);
buf[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[2], buf[6]);
p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[2]);
buf[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[3], buf[2]);
p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[3]);
buf[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[1], buf[3]);

p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[1]);
buf[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[0], buf[1]);

for (j = 0; j < 6; j++) {
pre[j][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 0);
pre[j][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 1);
pre[j][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 2);
pre[j][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 3);
}

out64[0][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 0);
out64[0][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 1);
out64[0][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 2);
out64[0][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 3);
}

//

for (j = 0; j < GFBITS; j++) {
tmp[j] = (beta[0] >> j) & 1;
tmp[j] = -tmp[j];
}

PQCLEAN_MCELIECE348864_AVX_vec_mul(out64[1], pre[0], tmp);

for (i = 1; i < 6; i++) {
for (j = 0; j < GFBITS; j++) {
tmp[j] = (beta[i] >> j) & 1;
tmp[j] = -tmp[j];
}

PQCLEAN_MCELIECE348864_AVX_vec_mul(tmp, pre[i], tmp);
PQCLEAN_MCELIECE348864_AVX_vec_add(out64[1], out64[1], tmp);
}

for (i = 0; i < GFBITS; i++) {
out[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(out64[0][i], out64[1][i]);
}
}

void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 out[GFBITS], vec256 in[][ GFBITS ]) {
butterflies_tr(out, in);
radix_conversions_tr(out);
}


+ 14
- 0
crypto_kem/mceliece348864/avx/fft_tr.h

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_TR_H
#define PQCLEAN_MCELIECE348864_AVX_FFT_TR_H
/*
This file is for the transpose of the Gao-Mateer FFT
*/


#include "params.h"
#include "vec256.h"

void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 * /*out*/, vec256 /*in*/[][ GFBITS ]);

#endif


+ 169
- 0
crypto_kem/mceliece348864/avx/gf.c

@@ -0,0 +1,169 @@
/*
This file is for functions for field arithmetic
*/

#include "gf.h"

#include "params.h"

gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf a) {
uint32_t t = a;

t -= 1;
t >>= 20;

return (gf) t;
}

gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf in0, gf in1) {
return in0 ^ in1;
}

gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf in0, gf in1) {
int i;

uint32_t tmp;
uint32_t t0;
uint32_t t1;
uint32_t t;

t0 = in0;
t1 = in1;

tmp = t0 * (t1 & 1);

for (i = 1; i < GFBITS; i++) {
tmp ^= (t0 * (t1 & (1 << i)));
}

t = tmp & 0x7FC000;
tmp ^= t >> 9;
tmp ^= t >> 12;

t = tmp & 0x3000;
tmp ^= t >> 9;
tmp ^= t >> 12;

return tmp & ((1 << GFBITS) - 1);
}
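/*
  Worked example for the reduction above: the field is GF(2^12) with modulus
  z^12 + z^3 + 1, so z * z^11 = z^12 reduces to z^3 + 1.
*/
static int example_gf_mul_check(void) {
    /* 0x002 encodes z and 0x800 encodes z^11; the product must be 0x009 = z^3 + 1 */
    return PQCLEAN_MCELIECE348864_AVX_gf_mul(0x002, 0x800) == 0x009;
}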

/* input: field element in */
/* return: in^2 */
static inline gf gf_sq(gf in) {
const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};

uint32_t x = in;
uint32_t t;

x = (x | (x << 8)) & B[3];
x = (x | (x << 4)) & B[2];
x = (x | (x << 2)) & B[1];
x = (x | (x << 1)) & B[0];

t = x & 0x7FC000;
x ^= t >> 9;
x ^= t >> 12;

t = x & 0x3000;
x ^= t >> 9;
x ^= t >> 12;

return x & ((1 << GFBITS) - 1);
}

gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf in) {
gf tmp_11;
gf tmp_1111;

gf out = in;

out = gf_sq(out);
tmp_11 = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, in); // 11

out = gf_sq(tmp_11);
out = gf_sq(out);
tmp_1111 = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_11); // 1111

out = gf_sq(tmp_1111);
out = gf_sq(out);
out = gf_sq(out);
out = gf_sq(out);
out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_1111); // 11111111

out = gf_sq(out);
out = gf_sq(out);
out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_11); // 1111111111

out = gf_sq(out);
out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, in); // 11111111111

return gf_sq(out); // 111111111110
}
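/*
  Quick sanity check for the addition chain above (illustrative): any nonzero
  element multiplied by its inverse must give 1 in GF(2^12).
*/
static int example_gf_inv_check(gf a) {
    if (a == 0) {
        return 1; /* the inverse of 0 is not defined; skip it */
    }

    return PQCLEAN_MCELIECE348864_AVX_gf_mul(a, PQCLEAN_MCELIECE348864_AVX_gf_inv(a)) == 1;
}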

/* input: field element den, num */
/* return: (num/den) */
gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf den, gf num) {
return PQCLEAN_MCELIECE348864_AVX_gf_mul(PQCLEAN_MCELIECE348864_AVX_gf_inv(den), num);
}

/* input: in0, in1 in GF((2^m)^t)*/
/* output: out = in0*in1 */
void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) {
int i, j;

gf prod[ SYS_T * 2 - 1 ];

for (i = 0; i < SYS_T * 2 - 1; i++) {
prod[i] = 0;
}

for (i = 0; i < SYS_T; i++) {
for (j = 0; j < SYS_T; j++) {
prod[i + j] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(in0[i], in1[j]);
}
}

//

for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) {
prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 877);
prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 2888);
prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 1781);
prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 373);
}

for (i = 0; i < SYS_T; i++) {
out[i] = prod[i];
}
}

/* 2 field multiplications */
uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1) {
int i;

uint64_t tmp = 0;
uint64_t t0;
uint64_t t1;
uint64_t t;
uint64_t mask = 0x0000000100000001;

t0 = a;
t1 = b1;
t1 = (t1 << 32) | b0;

for (i = 0; i < GFBITS; i++) {
tmp ^= t0 * (t1 & mask);
mask += mask;
}

//

t = tmp & 0x007FC000007FC000;
tmp ^= (t >> 9) ^ (t >> 12);

t = tmp & 0x0000300000003000;
tmp ^= (t >> 9) ^ (t >> 12);

return tmp & 0x00000FFF00000FFF;
}


+ 26
- 0
crypto_kem/mceliece348864/avx/gf.h

@@ -0,0 +1,26 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_GF_H
#define PQCLEAN_MCELIECE348864_AVX_GF_H
/*
This file is for functions for field arithmetic
*/


#include "params.h"

#include <stdint.h>

typedef uint16_t gf;

gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf /*a*/);
gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf /*in0*/, gf /*in1*/);
gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf /*in0*/, gf /*in1*/);
gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf /*den*/, gf /*num*/);
gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf /*in*/);

void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/);

/* 2 field multiplications */
uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1);

#endif


+ 1208
- 0
crypto_kem/mceliece348864/avx/int32_sort.c
File diff suppressed because it is too large


+ 9
- 0
crypto_kem/mceliece348864/avx/int32_sort.h

@@ -0,0 +1,9 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H
#define PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H

#include <stddef.h>
#include <stdint.h>

void PQCLEAN_MCELIECE348864_AVX_int32_sort(int32_t *x, size_t n);

#endif

+ 136
- 0
crypto_kem/mceliece348864/avx/operations.c

@@ -0,0 +1,136 @@
#include "api.h"

#include "aes256ctr.h"
#include "controlbits.h"
#include "crypto_hash.h"
#include "decrypt.h"
#include "encrypt.h"
#include "params.h"
#include "pk_gen.h"
#include "randombytes.h"
#include "sk_gen.h"
#include "util.h"

#include <stdint.h>
#include <string.h>

int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(
uint8_t *c,
uint8_t *key,
const uint8_t *pk
) {
uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
uint8_t *e = two_e + 1;
uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1};

PQCLEAN_MCELIECE348864_AVX_encrypt(c, e, pk);

crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e));

memcpy(one_ec + 1, e, SYS_N / 8);
memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32);

crypto_hash_32b(key, one_ec, sizeof(one_ec));

return 0;
}

int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(
uint8_t *key,
const uint8_t *c,
const uint8_t *sk
) {
int i;

uint8_t ret_confirm = 0;
uint8_t ret_decrypt = 0;

uint16_t m;

uint8_t conf[32];
uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
uint8_t *e = two_e + 1;
uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ];
uint8_t *x = preimage;

//

ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_AVX_decrypt(e, sk + SYS_N / 8, c);

crypto_hash_32b(conf, two_e, sizeof(two_e));

for (i = 0; i < 32; i++) {
ret_confirm |= conf[i] ^ c[SYND_BYTES + i];
}

m = ret_decrypt | ret_confirm;
m -= 1;
m >>= 8;

*x++ = (~m & 0) | (m & 1);
for (i = 0; i < SYS_N / 8; i++) {
*x++ = (~m & sk[i]) | (m & e[i]);
}
for (i = 0; i < SYND_BYTES + 32; i++) {
*x++ = c[i];
}

crypto_hash_32b(key, preimage, sizeof(preimage));

return 0;
}

int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair
(
uint8_t *pk,
uint8_t *sk
) {
int i;
uint8_t seed[ 32 ];
uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ];
uint8_t nonce[ 16 ] = {0};
uint8_t *rp;

gf f[ SYS_T ]; // element in GF(2^mt)
gf irr[ SYS_T ]; // Goppa polynomial
uint32_t perm[ 1 << GFBITS ]; // random permutation

randombytes(seed, sizeof(seed));

while (1) {
rp = r;
PQCLEAN_MCELIECE348864_AVX_aes256ctr(r, sizeof(r), nonce, seed);
memcpy(seed, &r[ sizeof(r) - 32 ], 32);

for (i = 0; i < SYS_T; i++) {
f[i] = PQCLEAN_MCELIECE348864_AVX_load2(rp + i * 2);
}
rp += sizeof(f);
if (PQCLEAN_MCELIECE348864_AVX_genpoly_gen(irr, f)) {
continue;
}

for (i = 0; i < (1 << GFBITS); i++) {
perm[i] = PQCLEAN_MCELIECE348864_AVX_load4(rp + i * 4);
}
rp += sizeof(perm);
if (PQCLEAN_MCELIECE348864_AVX_perm_check(perm)) {
continue;
}

for (i = 0; i < SYS_T; i++) {
PQCLEAN_MCELIECE348864_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]);
}
if (PQCLEAN_MCELIECE348864_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) {
continue;
}

memcpy(sk, rp, SYS_N / 8);
PQCLEAN_MCELIECE348864_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm);

break;
}

return 0;
}
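/*
  Round-trip usage sketch (illustrative): the buffer sizes are derived from
  params.h and correspond to a 261120-byte public key, a 6452-byte secret
  key, a 128-byte ciphertext and a 32-byte session key for this parameter
  set.  Both sides must end up with the same key.
*/
static int example_kem_roundtrip(void) {
    static uint8_t pk[PK_NROWS * PK_ROW_BYTES];
    static uint8_t sk[SK_BYTES];
    uint8_t ct[SYND_BYTES + 32];
    uint8_t key_a[32];
    uint8_t key_b[32];

    PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair(pk, sk);
    PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(ct, key_a, pk);
    PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(key_b, ct, sk);

    return memcmp(key_a, key_b, 32) == 0;
}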


+ 21
- 0
crypto_kem/mceliece348864/avx/params.h

@@ -0,0 +1,21 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_PARAMS_H
#define PQCLEAN_MCELIECE348864_AVX_PARAMS_H

#define GFBITS 12
#define SYS_N 3488
#define SYS_T 64

#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1))
#define IRR_BYTES (SYS_T * 2)

#define PK_NROWS (SYS_T*GFBITS)
#define PK_NCOLS (SYS_N - PK_NROWS)
#define PK_ROW_BYTES ((PK_NCOLS + 7)/8)

#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES)
#define SYND_BYTES ((PK_NROWS + 7)/8)

#define GFMASK ((1 << GFBITS) - 1)

#endif
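/*
  Derived sizes for this parameter set, written out as compile-time checks
  (illustrative; assumes a C11 compiler for _Static_assert).  They match the
  Classic McEliece 348864 figures: a 768 x 2720 public-key matrix stored as
  768 rows of 340 bytes, a 96-byte syndrome and a 6452-byte secret key.
*/
_Static_assert(PK_NROWS == 768, "rows of the public-key matrix");
_Static_assert(PK_ROW_BYTES == 340, "bytes per public-key row");
_Static_assert(SYND_BYTES == 96, "syndrome bytes");
_Static_assert(IRR_BYTES == 128, "Goppa polynomial bytes");
_Static_assert(COND_BYTES == 5888, "Benes control-bit bytes");
_Static_assert(SK_BYTES == 6452, "secret-key bytes");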


+ 276
- 0
crypto_kem/mceliece348864/avx/pk_gen.c

@@ -0,0 +1,276 @@
/*
This file is for public-key generation
*/

#include "pk_gen.h"

#include "benes.h"
#include "controlbits.h"
#include "fft.h"
#include "params.h"
#include "util.h"

#include <stdint.h>

static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) {
int i, j, r;
uint64_t u = 0;

for (i = 0; i < (1 << GFBITS); i++) {
out[i] = 0 ;
}

for (i = 0; i < 16; i++) {
for (j = GFBITS - 1; j >= 0; j--) {
u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 0);
for (r = 0; r < 64; r++) {
out[i * 256 + 0 * 64 + r] <<= 1;
out[i * 256 + 0 * 64 + r] |= (u >> r) & 1;
}
u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 1);
for (r = 0; r < 64; r++) {
out[i * 256 + 1 * 64 + r] <<= 1;
out[i * 256 + 1 * 64 + r] |= (u >> r) & 1;
}
u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 2);
for (r = 0; r < 64; r++) {
out[i * 256 + 2 * 64 + r] <<= 1;
out[i * 256 + 2 * 64 + r] |= (u >> r) & 1;
}
u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 3);
for (r = 0; r < 64; r++) {
out[i * 256 + 3 * 64 + r] <<= 1;
out[i * 256 + 3 * 64 + r] |= (u >> r) & 1;
}
}
}
}

static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) {
int i, j, k, r;
uint64_t u[4] = {0};

for (i = 0; i < 16; i++) {
for (j = GFBITS - 1; j >= 0; j--) {
for (k = 0; k < 4; k++) {
for (r = 63; r >= 0; r--) {
u[k] <<= 1;
u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1;
}
}

out1[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(u[0], u[1], u[2], u[3]);
}

for (j = GFBITS - 1; j >= 0; j--) {
for (k = 0; k < 4; k++) {
for (r = 63; r >= 0; r--) {
u[k] <<= 1;
u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1;
}
}

out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(u[0], u[1], u[2], u[3]);
}
}
}
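/*
  Scalar bitslicing in isolation (illustrative): pack 64 field elements into
  GFBITS bit-planes, the representation that de_bitslicing() and
  to_bitslicing_2x() above convert out of and into for whole vec256 blocks.
*/
static void example_bitslice_64(uint64_t out[GFBITS], const uint16_t in[64]) {
    int i, b;

    for (b = 0; b < GFBITS; b++) {
        out[b] = 0;

        for (i = 0; i < 64; i++) {
            out[b] |= (uint64_t)((in[i] >> b) & 1) << i; /* bit b of element i */
        }
    }
}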

#define NBLOCKS1_H ((SYS_N + 63) / 64)
#define NBLOCKS2_H ((SYS_N + 255) / 256)
#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64)
#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256)
int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) {
const int block_idx = NBLOCKS1_I;

int i, j, k;
int row, c;

uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ];
uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ];

uint64_t mask;

uint64_t sk_int[ GFBITS ];

vec256 consts[ 16 ][ GFBITS ];
vec256 eval[ 16 ][ GFBITS ];
vec256 prod[ 16 ][ GFBITS ];
vec256 tmp[ GFBITS ];

uint64_t list[1 << GFBITS];
uint64_t one_row[ 128 ];

// compute the inverses

PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk);

PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int);

PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], eval[0]);

for (i = 1; i < 16; i++) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]);
}

PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, prod[15]);

for (i = 14; i >= 0; i--) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i + 1], prod[i], tmp);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]);
}

PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], tmp);

// fill matrix

de_bitslicing(list, prod);

for (i = 0; i < (1 << GFBITS); i++) {
list[i] <<= GFBITS;
list[i] |= i;
list[i] |= ((uint64_t) perm[i]) << 31;
}

PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list);

to_bitslicing_2x(consts, prod, list);

for (i = 0; i < (1 << GFBITS); i++) {
perm[i] = list[i] & GFMASK;
}

for (j = 0; j < NBLOCKS2_I; j++) {
for (k = 0; k < GFBITS; k++) {
mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0);
mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1);
mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2);
mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3);
}
}

for (i = 1; i < SYS_T; i++) {
for (j = 0; j < NBLOCKS2_I; j++) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]);

for (k = 0; k < GFBITS; k++) {
mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0);
mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1);
mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2);
mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3);
}
}
}

// gaussian elimination to obtain an upper triangular matrix
// and keep track of the operations in ops

for (i = 0; i < PK_NROWS; i++) {
for (j = 0; j < NBLOCKS1_I; j++) {
ops[ i ][ j ] = 0;
}
}

for (i = 0; i < PK_NROWS; i++) {
ops[ i ][ i / 64 ] = 1;
ops[ i ][ i / 64 ] <<= (i % 64);
}

for (row = 0; row < PK_NROWS; row++) {
i = row >> 6;
j = row & 63;

for (k = row + 1; k < PK_NROWS; k++) {
mask = mat[ row ][ i ] >> j;
mask &= 1;
mask -= 1;

for (c = 0; c < NBLOCKS1_I; c++) {
mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
ops[ row ][ c ] ^= ops[ k ][ c ] & mask;
}
}

if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic
return -1;
}

for (k = row + 1; k < PK_NROWS; k++) {
mask = mat[ k ][ i ] >> j;
mask &= 1;
mask = -mask;

for (c = 0; c < NBLOCKS1_I; c++) {
mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
ops[ k ][ c ] ^= ops[ row ][ c ] & mask;
}
}
}

// computing the linear map required to obtain the systematic form

for (row = PK_NROWS - 1; row >= 0; row--) {
for (k = 0; k < row; k++) {
mask = mat[ k ][ row / 64 ] >> (row & 63);
mask &= 1;
mask = -mask;

for (c = 0; c < NBLOCKS1_I; c++) {
ops[ k ][ c ] ^= ops[ row ][ c ] & mask;
}
}
}

// apply the linear map to the non-systematic part

for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) {
for (k = 0; k < GFBITS; k++) {
mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0);
mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1);
mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2);
mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3);
}
}

for (i = 1; i < SYS_T; i++) {
for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]);

for (k = 0; k < GFBITS; k++) {
mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0);
mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1);
mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2);
mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3);
}
}
}

for (row = 0; row < PK_NROWS; row++) {
for (k = 0; k < NBLOCKS1_H; k++) {
one_row[ k ] = 0;
}

for (c = 0; c < PK_NROWS; c++) {
mask = ops[ row ][ c >> 6 ] >> (c & 63);
mask &= 1;
mask = -mask;

for (k = block_idx; k < NBLOCKS1_H; k++) {
one_row[ k ] ^= mat[ c ][ k ] & mask;
}
}

for (k = block_idx; k < NBLOCKS1_H - 1; k++) {
PQCLEAN_MCELIECE348864_AVX_store8(pk, one_row[k]);
pk += 8;
}

PQCLEAN_MCELIECE348864_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8);

pk += PK_ROW_BYTES % 8;
}

//

return 0;
}


+ 13
- 0
crypto_kem/mceliece348864/avx/pk_gen.h

@@ -0,0 +1,13 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_PK_GEN_H
#define PQCLEAN_MCELIECE348864_AVX_PK_GEN_H
/*
This file is for public-key generation
*/


#include "gf.h"

int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/);

#endif


+ 224
- 0
crypto_kem/mceliece348864/avx/powers.inc

@@ -0,0 +1,224 @@
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},
{
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333),
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555),
},

+ 70
- 0
crypto_kem/mceliece348864/avx/scalars.inc View File

@@ -0,0 +1,70 @@
{
0XF3CFC030FC30F003,
0X3FCF0F003C00C00C,
0X30033CC300C0C03C,
0XCCFF0F3C0F30F0C0,
0X0300C03FF303C3F0,
0X3FFF3C0FF0CCCCC0,
0XF3FFF0C00F3C3CC0,
0X3003333FFFC3C000,
0X0FF30FFFC3FFF300,
0XFFC0F300F0F0CC00,
0XC0CFF3FCCC3CFC00,
0XFC3C03F0F330C000,
},
{
0X000F00000000F00F,
0X00000F00F00000F0,
0X0F00000F00000F00,
0XF00F00F00F000000,
0X00F00000000000F0,
0X0000000F00000000,
0XF00000000F00F000,
0X00F00F00000F0000,
0X0000F00000F00F00,
0X000F00F00F00F000,
0X00F00F0000000000,
0X0000000000F00000,
},
{
0X0000FF00FF0000FF,
0X0000FF000000FF00,
0XFF0000FF00FF0000,
0XFFFF0000FF000000,
0X00FF00FF00FF0000,
0X0000FFFFFF000000,
0X00FFFF00FF000000,
0XFFFFFF0000FF0000,
0XFFFF00FFFF00FF00,
0X0000FF0000000000,
0XFFFFFF00FF000000,
0X00FF000000000000,
},
{
0X000000000000FFFF,
0X00000000FFFF0000,
0X0000000000000000,
0XFFFF000000000000,
0X00000000FFFF0000,
0X0000FFFF00000000,
0X0000000000000000,
0X00000000FFFF0000,
0X0000FFFF00000000,
0X0000000000000000,
0X0000000000000000,
0X0000000000000000,
},
{
0X00000000FFFFFFFF,
0XFFFFFFFF00000000,
0XFFFFFFFF00000000,
0X0000000000000000,
0X0000000000000000,
0XFFFFFFFF00000000,
0X0000000000000000,
0X0000000000000000,
0XFFFFFFFF00000000,
0X0000000000000000,
0X0000000000000000,
0X0000000000000000,
}

+ 70
- 0
crypto_kem/mceliece348864/avx/scalars_2x.inc View File

@@ -0,0 +1,70 @@
{
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc),
},
{
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0),
},
{
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00),
},
{
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000000000000ffff, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000ffff00000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffff00000000ffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x00000000ffff0000),
},
{
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffffffff, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0xffffffff00000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff),
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000),
},

+ 98
- 0
crypto_kem/mceliece348864/avx/sk_gen.c View File

@@ -0,0 +1,98 @@
/*
This file is for secret-key generation
*/

#include "sk_gen.h"

#include "controlbits.h"
#include "gf.h"
#include "params.h"
#include "util.h"

/* input: f, element in GF((2^m)^t) */
/* output: out, minimal polynomial of f */
/* return: 0 for success and -1 for failure */
int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf *out, gf *f) {
int i, j, k, c;

gf mat[ SYS_T + 1 ][ SYS_T ];
gf mask, inv, t;

// fill matrix

mat[0][0] = 1;

for (i = 1; i < SYS_T; i++) {
mat[0][i] = 0;
}

for (i = 0; i < SYS_T; i++) {
mat[1][i] = f[i];
}

for (j = 2; j <= SYS_T; j++) {
PQCLEAN_MCELIECE348864_AVX_GF_mul(mat[j], mat[j - 1], f);
}

// gaussian

for (j = 0; j < SYS_T; j++) {
for (k = j + 1; k < SYS_T; k++) {
mask = PQCLEAN_MCELIECE348864_AVX_gf_iszero(mat[ j ][ j ]);

for (c = j; c < SYS_T + 1; c++) {
mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
}

}

if ( mat[ j ][ j ] == 0 ) { // return if not systematic
return -1;
}

inv = PQCLEAN_MCELIECE348864_AVX_gf_inv(mat[j][j]);

for (c = j; c < SYS_T + 1; c++) {
mat[ c ][ j ] = PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], inv);
}

for (k = 0; k < SYS_T; k++) {
if (k != j) {
t = mat[ j ][ k ];

for (c = j; c < SYS_T + 1; c++) {
mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], t);
}
}
}
}

for (i = 0; i < SYS_T; i++) {
out[i] = mat[ SYS_T ][ i ];
}

return 0;
}

/* input: permutation p represented as a list of 32-bit integers */
/* output: -1 if some integer repeats in p */
/* 0 otherwise */
int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t *p) {
int i;
uint64_t list[1 << GFBITS];

for (i = 0; i < (1 << GFBITS); i++) {
list[i] = p[i];
}

PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list);

for (i = 1; i < (1 << GFBITS); i++) {
if (list[i - 1] == list[i]) {
return -1;
}
}

return 0;
}
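
/* Taken together, genpoly_gen and perm_check are the rejection tests of key
generation: the first derives the degree-SYS_T minimal (Goppa) polynomial of a
field element f by Gaussian elimination and fails when the power matrix is
singular, and the second rejects a candidate permutation containing duplicates.
A hypothetical caller sketch, assuming the mceliece348864 parameters SYS_T = 64
and GFBITS = 12 from params.h, with the candidate arrays standing in for the
seeded PRG output used by the real key generator: */

#include "params.h"
#include "sk_gen.h"

/* Hypothetical retry step: f holds SYS_T candidate coefficients, perm holds
 * 2^GFBITS candidate 32-bit values. Returns 0 when both checks pass. */
static int check_candidate(gf f[SYS_T], const uint32_t perm[1 << GFBITS]) {
    gf g[SYS_T]; /* low SYS_T coefficients of the monic minimal polynomial */

    if (PQCLEAN_MCELIECE348864_AVX_genpoly_gen(g, f) != 0) {
        return -1; /* power matrix was singular: retry with a new f */
    }
    if (PQCLEAN_MCELIECE348864_AVX_perm_check(perm) != 0) {
        return -1; /* repeated entry: retry with a new permutation */
    }
    return 0;      /* candidate accepted; key generation continues (pk_gen etc.) */
}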


+ 16
- 0
crypto_kem/mceliece348864/avx/sk_gen.h View File

@@ -0,0 +1,16 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_SK_GEN_H
#define PQCLEAN_MCELIECE348864_AVX_SK_GEN_H
/*
This file is for secret-key generation
*/


#include "gf.h"

#include <stdint.h>

int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/);
int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t * /*p*/);

#endif


+ 530
- 0
crypto_kem/mceliece348864/avx/syndrome_asm.S View File

@@ -0,0 +1,530 @@

# qhasm: int64 input_0

# qhasm: int64 input_1

# qhasm: int64 input_2

# qhasm: int64 input_3

# qhasm: int64 input_4

# qhasm: int64 input_5

# qhasm: stack64 input_6

# qhasm: stack64 input_7

# qhasm: int64 caller_r11

# qhasm: int64 caller_r12

# qhasm: int64 caller_r13

# qhasm: int64 caller_r14

# qhasm: int64 caller_r15

# qhasm: int64 caller_rbx

# qhasm: int64 caller_rbp

# qhasm: int64 b64

# qhasm: int64 synd

# qhasm: int64 addr

# qhasm: int64 c

# qhasm: int64 c_all

# qhasm: int64 row

# qhasm: int64 p

# qhasm: int64 e

# qhasm: int64 s

# qhasm: reg256 pp

# qhasm: reg256 ee

# qhasm: reg256 ss

# qhasm: int64 buf_ptr

# qhasm: stack256 buf

# qhasm: enter syndrome_asm
.p2align 5
.global _PQCLEAN_MCELIECE348864_AVX_syndrome_asm
.global PQCLEAN_MCELIECE348864_AVX_syndrome_asm
_PQCLEAN_MCELIECE348864_AVX_syndrome_asm:
PQCLEAN_MCELIECE348864_AVX_syndrome_asm:
mov %rsp,%r11
and $31,%r11
add $32,%r11
sub %r11,%rsp

# qhasm: input_1 += 260780
# asm 1: add $260780,<input_1=int64#2
# asm 2: add $260780,<input_1=%rsi
add $260780,%rsi

# qhasm: buf_ptr = &buf
# asm 1: leaq <buf=stack256#1,>buf_ptr=int64#4
# asm 2: leaq <buf=0(%rsp),>buf_ptr=%rcx
leaq 0(%rsp),%rcx

# qhasm: row = 768
# asm 1: mov $768,>row=int64#5
# asm 2: mov $768,>row=%r8
mov $768,%r8

# qhasm: loop:
._loop:

# qhasm: row -= 1
# asm 1: sub $1,<row=int64#5
# asm 2: sub $1,<row=%r8
sub $1,%r8

# qhasm: ss = mem256[ input_1 + 0 ]
# asm 1: vmovupd 0(<input_1=int64#2),>ss=reg256#1
# asm 2: vmovupd 0(<input_1=%rsi),>ss=%ymm0
vmovupd 0(%rsi),%ymm0

# qhasm: ee = mem256[ input_2 + 96 ]
# asm 1: vmovupd 96(<input_2=int64#3),>ee=reg256#2
# asm 2: vmovupd 96(<input_2=%rdx),>ee=%ymm1
vmovupd 96(%rdx),%ymm1

# qhasm: ss &= ee
# asm 1: vpand <ee=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpand <ee=%ymm1,<ss=%ymm0,<ss=%ymm0
vpand %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 32 ]
# asm 1: vmovupd 32(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 32(<input_1=%rsi),>pp=%ymm1
vmovupd 32(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 128 ]
# asm 1: vmovupd 128(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 128(<input_2=%rdx),>ee=%ymm2
vmovupd 128(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 64 ]
# asm 1: vmovupd 64(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 64(<input_1=%rsi),>pp=%ymm1
vmovupd 64(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 160 ]
# asm 1: vmovupd 160(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 160(<input_2=%rdx),>ee=%ymm2
vmovupd 160(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 96 ]
# asm 1: vmovupd 96(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 96(<input_1=%rsi),>pp=%ymm1
vmovupd 96(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 192 ]
# asm 1: vmovupd 192(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 192(<input_2=%rdx),>ee=%ymm2
vmovupd 192(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 128 ]
# asm 1: vmovupd 128(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 128(<input_1=%rsi),>pp=%ymm1
vmovupd 128(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 224 ]
# asm 1: vmovupd 224(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 224(<input_2=%rdx),>ee=%ymm2
vmovupd 224(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 160 ]
# asm 1: vmovupd 160(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 160(<input_1=%rsi),>pp=%ymm1
vmovupd 160(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 256 ]
# asm 1: vmovupd 256(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 256(<input_2=%rdx),>ee=%ymm2
vmovupd 256(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 192 ]
# asm 1: vmovupd 192(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 192(<input_1=%rsi),>pp=%ymm1
vmovupd 192(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 288 ]
# asm 1: vmovupd 288(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 288(<input_2=%rdx),>ee=%ymm2
vmovupd 288(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 224 ]
# asm 1: vmovupd 224(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 224(<input_1=%rsi),>pp=%ymm1
vmovupd 224(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 320 ]
# asm 1: vmovupd 320(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 320(<input_2=%rdx),>ee=%ymm2
vmovupd 320(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 256 ]
# asm 1: vmovupd 256(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 256(<input_1=%rsi),>pp=%ymm1
vmovupd 256(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 352 ]
# asm 1: vmovupd 352(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 352(<input_2=%rdx),>ee=%ymm2
vmovupd 352(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: pp = mem256[ input_1 + 288 ]
# asm 1: vmovupd 288(<input_1=int64#2),>pp=reg256#2
# asm 2: vmovupd 288(<input_1=%rsi),>pp=%ymm1
vmovupd 288(%rsi),%ymm1

# qhasm: ee = mem256[ input_2 + 384 ]
# asm 1: vmovupd 384(<input_2=int64#3),>ee=reg256#3
# asm 2: vmovupd 384(<input_2=%rdx),>ee=%ymm2
vmovupd 384(%rdx),%ymm2

# qhasm: pp &= ee
# asm 1: vpand <ee=reg256#3,<pp=reg256#2,<pp=reg256#2
# asm 2: vpand <ee=%ymm2,<pp=%ymm1,<pp=%ymm1
vpand %ymm2,%ymm1,%ymm1

# qhasm: ss ^= pp
# asm 1: vpxor <pp=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <pp=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: buf = ss
# asm 1: vmovapd <ss=reg256#1,>buf=stack256#1
# asm 2: vmovapd <ss=%ymm0,>buf=0(%rsp)
vmovapd %ymm0,0(%rsp)

# qhasm: s = *(uint64 *)(input_1 + 320)
# asm 1: movq 320(<input_1=int64#2),>s=int64#6
# asm 2: movq 320(<input_1=%rsi),>s=%r9
movq 320(%rsi),%r9

# qhasm: e = *(uint64 *)(input_2 + 416)
# asm 1: movq 416(<input_2=int64#3),>e=int64#7
# asm 2: movq 416(<input_2=%rdx),>e=%rax
movq 416(%rdx),%rax

# qhasm: s &= e
# asm 1: and <e=int64#7,<s=int64#6
# asm 2: and <e=%rax,<s=%r9
and %rax,%r9

# qhasm: p = *(uint64 *)(input_1 + 328)
# asm 1: movq 328(<input_1=int64#2),>p=int64#7
# asm 2: movq 328(<input_1=%rsi),>p=%rax
movq 328(%rsi),%rax

# qhasm: e = *(uint64 *)(input_2 + 424)
# asm 1: movq 424(<input_2=int64#3),>e=int64#8
# asm 2: movq 424(<input_2=%rdx),>e=%r10
movq 424(%rdx),%r10

# qhasm: p &= e
# asm 1: and <e=int64#8,<p=int64#7
# asm 2: and <e=%r10,<p=%rax
and %r10,%rax

# qhasm: s ^= p
# asm 1: xor <p=int64#7,<s=int64#6
# asm 2: xor <p=%rax,<s=%r9
xor %rax,%r9

# qhasm: p = *(uint32 *)(input_1 + 336)
# asm 1: movl 336(<input_1=int64#2),>p=int64#7d
# asm 2: movl 336(<input_1=%rsi),>p=%eax
movl 336(%rsi),%eax

# qhasm: e = *(uint32 *)(input_2 + 432)
# asm 1: movl 432(<input_2=int64#3),>e=int64#8d
# asm 2: movl 432(<input_2=%rdx),>e=%r10d
movl 432(%rdx),%r10d

# qhasm: p &= e
# asm 1: and <e=int64#8,<p=int64#7
# asm 2: and <e=%r10,<p=%rax
and %r10,%rax

# qhasm: s ^= p
# asm 1: xor <p=int64#7,<s=int64#6
# asm 2: xor <p=%rax,<s=%r9
xor %rax,%r9

# qhasm: c_all = count(s)
# asm 1: popcnt <s=int64#6, >c_all=int64#6
# asm 2: popcnt <s=%r9, >c_all=%r9
popcnt %r9, %r9

# qhasm: b64 = mem64[ buf_ptr + 0 ]
# asm 1: movq 0(<buf_ptr=int64#4),>b64=int64#7
# asm 2: movq 0(<buf_ptr=%rcx),>b64=%rax
movq 0(%rcx),%rax

# qhasm: c = count(b64)
# asm 1: popcnt <b64=int64#7, >c=int64#7
# asm 2: popcnt <b64=%rax, >c=%rax
popcnt %rax, %rax

# qhasm: c_all ^= c
# asm 1: xor <c=int64#7,<c_all=int64#6
# asm 2: xor <c=%rax,<c_all=%r9
xor %rax,%r9

# qhasm: b64 = mem64[ buf_ptr + 8 ]
# asm 1: movq 8(<buf_ptr=int64#4),>b64=int64#7
# asm 2: movq 8(<buf_ptr=%rcx),>b64=%rax
movq 8(%rcx),%rax

# qhasm: c = count(b64)
# asm 1: popcnt <b64=int64#7, >c=int64#7
# asm 2: popcnt <b64=%rax, >c=%rax
popcnt %rax, %rax

# qhasm: c_all ^= c
# asm 1: xor <c=int64#7,<c_all=int64#6
# asm 2: xor <c=%rax,<c_all=%r9
xor %rax,%r9

# qhasm: b64 = mem64[ buf_ptr + 16 ]
# asm 1: movq 16(<buf_ptr=int64#4),>b64=int64#7
# asm 2: movq 16(<buf_ptr=%rcx),>b64=%rax
movq 16(%rcx),%rax

# qhasm: c = count(b64)
# asm 1: popcnt <b64=int64#7, >c=int64#7
# asm 2: popcnt <b64=%rax, >c=%rax
popcnt %rax, %rax

# qhasm: c_all ^= c
# asm 1: xor <c=int64#7,<c_all=int64#6
# asm 2: xor <c=%rax,<c_all=%r9
xor %rax,%r9

# qhasm: b64 = mem64[ buf_ptr + 24 ]
# asm 1: movq 24(<buf_ptr=int64#4),>b64=int64#7
# asm 2: movq 24(<buf_ptr=%rcx),>b64=%rax
movq 24(%rcx),%rax

# qhasm: c = count(b64)
# asm 1: popcnt <b64=int64#7, >c=int64#7
# asm 2: popcnt <b64=%rax, >c=%rax
popcnt %rax, %rax

# qhasm: c_all ^= c
# asm 1: xor <c=int64#7,<c_all=int64#6
# asm 2: xor <c=%rax,<c_all=%r9
xor %rax,%r9

# qhasm: addr = row
# asm 1: mov <row=int64#5,>addr=int64#7
# asm 2: mov <row=%r8,>addr=%rax
mov %r8,%rax

# qhasm: (uint64) addr >>= 3
# asm 1: shr $3,<addr=int64#7
# asm 2: shr $3,<addr=%rax
shr $3,%rax

# qhasm: addr += input_0
# asm 1: add <input_0=int64#1,<addr=int64#7
# asm 2: add <input_0=%rdi,<addr=%rax
add %rdi,%rax

# qhasm: synd = *(uint8 *) (addr + 0)
# asm 1: movzbq 0(<addr=int64#7),>synd=int64#8
# asm 2: movzbq 0(<addr=%rax),>synd=%r10
movzbq 0(%rax),%r10

# qhasm: synd <<= 1
# asm 1: shl $1,<synd=int64#8
# asm 2: shl $1,<synd=%r10
shl $1,%r10

# qhasm: (uint32) c_all &= 1
# asm 1: and $1,<c_all=int64#6d
# asm 2: and $1,<c_all=%r9d
and $1,%r9d

# qhasm: synd |= c_all
# asm 1: or <c_all=int64#6,<synd=int64#8
# asm 2: or <c_all=%r9,<synd=%r10
or %r9,%r10

# qhasm: *(uint8 *) (addr + 0) = synd
# asm 1: movb <synd=int64#8b,0(<addr=int64#7)
# asm 2: movb <synd=%r10b,0(<addr=%rax)
movb %r10b,0(%rax)

# qhasm: input_1 -= 340
# asm 1: sub $340,<input_1=int64#2
# asm 2: sub $340,<input_1=%rsi
sub $340,%rsi

# qhasm: =? row-0
# asm 1: cmp $0,<row=int64#5
# asm 2: cmp $0,<row=%r8
cmp $0,%r8
# comment:fp stack unchanged by jump

# qhasm: goto loop if !=
jne ._loop

# qhasm: ss = mem256[ input_0 + 0 ]
# asm 1: vmovupd 0(<input_0=int64#1),>ss=reg256#1
# asm 2: vmovupd 0(<input_0=%rdi),>ss=%ymm0
vmovupd 0(%rdi),%ymm0

# qhasm: ee = mem256[ input_2 + 0 ]
# asm 1: vmovupd 0(<input_2=int64#3),>ee=reg256#2
# asm 2: vmovupd 0(<input_2=%rdx),>ee=%ymm1
vmovupd 0(%rdx),%ymm1

# qhasm: ss ^= ee
# asm 1: vpxor <ee=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <ee=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: mem256[ input_0 + 0 ] = ss
# asm 1: vmovupd <ss=reg256#1,0(<input_0=int64#1)
# asm 2: vmovupd <ss=%ymm0,0(<input_0=%rdi)
vmovupd %ymm0,0(%rdi)

# qhasm: ss = mem256[ input_0 + 32 ]
# asm 1: vmovupd 32(<input_0=int64#1),>ss=reg256#1
# asm 2: vmovupd 32(<input_0=%rdi),>ss=%ymm0
vmovupd 32(%rdi),%ymm0

# qhasm: ee = mem256[ input_2 + 32 ]
# asm 1: vmovupd 32(<input_2=int64#3),>ee=reg256#2
# asm 2: vmovupd 32(<input_2=%rdx),>ee=%ymm1
vmovupd 32(%rdx),%ymm1

# qhasm: ss ^= ee
# asm 1: vpxor <ee=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <ee=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: mem256[ input_0 + 32 ] = ss
# asm 1: vmovupd <ss=reg256#1,32(<input_0=int64#1)
# asm 2: vmovupd <ss=%ymm0,32(<input_0=%rdi)
vmovupd %ymm0,32(%rdi)

# qhasm: ss = mem256[ input_0 + 64 ]
# asm 1: vmovupd 64(<input_0=int64#1),>ss=reg256#1
# asm 2: vmovupd 64(<input_0=%rdi),>ss=%ymm0
vmovupd 64(%rdi),%ymm0

# qhasm: ee = mem256[ input_2 + 64 ]
# asm 1: vmovupd 64(<input_2=int64#3),>ee=reg256#2
# asm 2: vmovupd 64(<input_2=%rdx),>ee=%ymm1
vmovupd 64(%rdx),%ymm1

# qhasm: ss ^= ee
# asm 1: vpxor <ee=reg256#2,<ss=reg256#1,<ss=reg256#1
# asm 2: vpxor <ee=%ymm1,<ss=%ymm0,<ss=%ymm0
vpxor %ymm1,%ymm0,%ymm0

# qhasm: mem256[ input_0 + 64 ] = ss
# asm 1: vmovupd <ss=reg256#1,64(<input_0=int64#1)
# asm 2: vmovupd <ss=%ymm0,64(<input_0=%rdi)
vmovupd %ymm0,64(%rdi)

# qhasm: return
add %r11,%rsp
ret
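
/* For readability, a scalar sketch of what the loop above computes. The public
key stores the 768 rows (SYS_T * GFBITS = 64 * 12) of the non-identity block T
of H = (I | T), each row packed into 340 bytes; one syndrome bit is the parity
of the AND of a row with the last n - 768 bits of the error vector, and the
final XOR loop adds in the identity block, i.e. the first 768 bits of e. The
sizes are read off the constants in the code above (340-byte rows, 96-byte =
768-bit syndrome, 436-byte error vector); this is a reference formulation, not
the constant-time AVX path: */

#include <stdint.h>
#include <string.h>

/* reference syndrome: s (96 bytes) = H * e over GF(2), with H = (I | T) */
static void syndrome_ref(uint8_t *s, const uint8_t *pk, const uint8_t *e) {
    int i, j;
    uint8_t p;

    memset(s, 0, 96);

    for (i = 0; i < 768; i++) {
        p = 0;
        for (j = 0; j < 340; j++) {
            p ^= pk[i * 340 + j] & e[96 + j];      /* T part of row i     */
        }
        p ^= p >> 4; p ^= p >> 2; p ^= p >> 1;     /* fold byte to parity */
        s[i / 8] ^= (p & 1) << (i % 8);
    }

    for (j = 0; j < 96; j++) {
        s[j] ^= e[j];                              /* identity part of H  */
    }
}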

+ 17
- 0
crypto_kem/mceliece348864/avx/transpose.c View File

@@ -0,0 +1,17 @@
#include "transpose.h"

/*
This file is for matrix transposition
*/

extern void PQCLEAN_MCELIECE348864_AVX_transpose_64x64_asm(uint64_t *);
extern void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm(vec256 *);


void PQCLEAN_MCELIECE348864_AVX_transpose_64x64(uint64_t *in) {
PQCLEAN_MCELIECE348864_AVX_transpose_64x64_asm(in);
}

void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(vec256 *in) {
PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm(in);
}

+ 17
- 0
crypto_kem/mceliece348864/avx/transpose.h View File

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_TRANSPOSE_H
#define PQCLEAN_MCELIECE348864_AVX_TRANSPOSE_H
/*
This file is for matrix transposition
*/


#include "vec256.h"

#include <stdint.h>


void PQCLEAN_MCELIECE348864_AVX_transpose_64x64(uint64_t *in);
void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(vec256 *in);

#endif


+ 8145
- 0
crypto_kem/mceliece348864/avx/transpose_64x256_sp_asm.S
File diff suppressed because it is too large
View File


+ 8467
- 0
crypto_kem/mceliece348864/avx/transpose_64x64_asm.S
File diff suppressed because it is too large
View File


+ 18
- 0
crypto_kem/mceliece348864/avx/uint32_sort.c View File

@@ -0,0 +1,18 @@
#include "uint32_sort.h"

#include "int32_sort.h"


/* can save time by vectorizing xor loops */
/* can save time by integrating xor loops with int32_sort */

void PQCLEAN_MCELIECE348864_AVX_uint32_sort(uint32_t *x, size_t n) {
size_t j;
for (j = 0; j < n; ++j) {
x[j] ^= 0x80000000;
}
PQCLEAN_MCELIECE348864_AVX_int32_sort((int32_t *) x, n);
for (j = 0; j < n; ++j) {
x[j] ^= 0x80000000;
}
}
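
/* The XOR with 0x80000000 works because flipping the top bit maps the unsigned
range 0 .. 2^32-1 monotonically onto the signed range INT32_MIN .. INT32_MAX,
so the signed int32 sorter produces the unsigned order; the second loop undoes
the mapping. A tiny self-contained check of that property (independent of the
AVX sorter; two's-complement conversion assumed): */

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t a = 0x00000001u, b = 0xFFFFFFFFu;   /* a < b as unsigned        */
    int32_t sa = (int32_t) (a ^ 0x80000000u);    /* becomes INT32_MIN + 1    */
    int32_t sb = (int32_t) (b ^ 0x80000000u);    /* becomes INT32_MAX        */
    printf("%d\n", sa < sb);                     /* prints 1: order preserved */
    return 0;
}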

+ 9
- 0
crypto_kem/mceliece348864/avx/uint32_sort.h View File

@@ -0,0 +1,9 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_UINT32_SORT_H
#define PQCLEAN_MCELIECE348864_AVX_UINT32_SORT_H

#include <stddef.h>
#include <stdint.h>

void PQCLEAN_MCELIECE348864_AVX_uint32_sort(uint32_t *x, size_t n);

#endif

+ 354
- 0
crypto_kem/mceliece348864/avx/update_asm.S View File

@@ -0,0 +1,354 @@

# qhasm: int64 input_0

# qhasm: int64 input_1

# qhasm: int64 input_2

# qhasm: int64 input_3

# qhasm: int64 input_4

# qhasm: int64 input_5

# qhasm: stack64 input_6

# qhasm: stack64 input_7

# qhasm: int64 caller_r11

# qhasm: int64 caller_r12

# qhasm: int64 caller_r13

# qhasm: int64 caller_r14

# qhasm: int64 caller_r15

# qhasm: int64 caller_rbx

# qhasm: int64 caller_rbp

# qhasm: int64 s0

# qhasm: int64 s1

# qhasm: enter update_asm
.p2align 5
.global _PQCLEAN_MCELIECE348864_AVX_update_asm
.global PQCLEAN_MCELIECE348864_AVX_update_asm
_PQCLEAN_MCELIECE348864_AVX_update_asm:
PQCLEAN_MCELIECE348864_AVX_update_asm:
mov %rsp,%r11
and $31,%r11
add $0,%r11
sub %r11,%rsp

# qhasm: s1 = input_1
# asm 1: mov <input_1=int64#2,>s1=int64#2
# asm 2: mov <input_1=%rsi,>s1=%rsi
mov %rsi,%rsi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: s0 = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>s0=int64#4
# asm 2: movq 0(<input_0=%rdi),>s0=%rcx
movq 0(%rdi),%rcx

# qhasm: s0 = (s1 s0) >> 1
# asm 1: shrd $1,<s1=int64#2,<s0=int64#4
# asm 2: shrd $1,<s1=%rsi,<s0=%rcx
shrd $1,%rsi,%rcx

# qhasm: (uint64) s1 >>= 1
# asm 1: shr $1,<s1=int64#2
# asm 2: shr $1,<s1=%rsi
shr $1,%rsi

# qhasm: mem64[ input_0 + 0 ] = s0
# asm 1: movq <s0=int64#4,0(<input_0=int64#1)
# asm 2: movq <s0=%rcx,0(<input_0=%rdi)
movq %rcx,0(%rdi)

# qhasm: input_0 += input_2
# asm 1: add <input_2=int64#3,<input_0=int64#1
# asm 2: add <input_2=%rdx,<input_0=%rdi
add %rdx,%rdi

# qhasm: return
add %r11,%rsp
ret
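
/* update_asm shifts a bitsliced window of 64 field elements by one position:
each of the GFBITS = 12 bit planes (64-bit words spaced input_2 bytes apart,
starting at input_0) is shifted right by one, and bit j of the new element in
input_1 is inserted at bit 63 of plane j via shrd. A scalar sketch of the same
update, with the stride expressed in 64-bit words rather than bytes: */

#include <stddef.h>
#include <stdint.h>

/* shift the bitsliced window right by one element and insert `in` at the top */
static void update_ref(uint64_t *buf, uint64_t in, size_t stride_words, int gfbits) {
    int b;
    for (b = 0; b < gfbits; b++) {               /* gfbits is 12 here        */
        buf[b * stride_words] = (buf[b * stride_words] >> 1) | ((in & 1) << 63);
        in >>= 1;
    }
}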

+ 106
- 0
crypto_kem/mceliece348864/avx/util.c View File

@@ -0,0 +1,106 @@
/*
This file is for loading/storing data in a little-endian fashion
*/

#include "util.h"

void PQCLEAN_MCELIECE348864_AVX_store_i(unsigned char *out, uint64_t in, int i) {
int j;

for (j = 0; j < i; j++) {
out[j] = (in >> (j * 8)) & 0xFF;
}
}

void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a) {
dest[0] = a & 0xFF;
dest[1] = a >> 8;
}

uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src) {
uint16_t a;

a = src[1];
a <<= 8;
a |= src[0];

return a & GFMASK;
}

uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src) {
uint32_t a;

a = src[3];
a <<= 8;
a |= src[2];
a <<= 8;
a |= src[1];
a <<= 8;
a |= src[0];

return a;
}

void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in) {
int i, j;
uint16_t irr[ SYS_T + 1 ];

for (i = 0; i < SYS_T; i++) {
irr[i] = PQCLEAN_MCELIECE348864_AVX_load2(in + i * 2);
irr[i] &= GFMASK;
}

irr[ SYS_T ] = 1;

for (i = 0; i < GFBITS; i++) {
out[i] = 0;
}

for (i = SYS_T; i >= 0; i--) {
for (j = 0; j < GFBITS; j++) {
out[j] <<= 1;
out[j] |= (irr[i] >> j) & 1;
}
}
}

void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in) {
out[0] = (in >> 0x00) & 0xFF;
out[1] = (in >> 0x08) & 0xFF;
out[2] = (in >> 0x10) & 0xFF;
out[3] = (in >> 0x18) & 0xFF;
out[4] = (in >> 0x20) & 0xFF;
out[5] = (in >> 0x28) & 0xFF;
out[6] = (in >> 0x30) & 0xFF;
out[7] = (in >> 0x38) & 0xFF;
}

uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in) {
int i;
uint64_t ret = in[7];

for (i = 6; i >= 0; i--) {
ret <<= 8;
ret |= in[i];
}

return ret;
}

gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a) {
a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8);
a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4);
a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2);
a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1);

return a >> 4;
}

vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in) {
return PQCLEAN_MCELIECE348864_AVX_vec128_set2x( PQCLEAN_MCELIECE348864_AVX_load8(in), PQCLEAN_MCELIECE348864_AVX_load8(in + 8) );
}

void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in) {
PQCLEAN_MCELIECE348864_AVX_store8(out + 0, PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 0));
PQCLEAN_MCELIECE348864_AVX_store8(out + 8, PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 1));
}
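
/* Of these helpers, irr_load is the one doing real work: it bitslices the
Goppa polynomial so that bit p of out[j] holds bit j of coefficient p, with the
monic leading coefficient irr[SYS_T] = 1 falling off the top since only 64 bit
positions are available. An equivalent, more direct formulation of that step
(a sketch assuming SYS_T = 64 and GFBITS = 12): */

#include <stdint.h>

/* bit p of out[j] = bit j of coefficient p; coefficient 64 (the monic 1) is dropped */
static void bitslice_poly(uint64_t out[12], const uint16_t irr[64]) {
    int p, j;
    for (j = 0; j < 12; j++) {
        out[j] = 0;
        for (p = 0; p < 64; p++) {
            out[j] |= (uint64_t) ((irr[p] >> j) & 1) << p;
        }
    }
}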

+ 33
- 0
crypto_kem/mceliece348864/avx/util.h View File

@@ -0,0 +1,33 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_UTIL_H
#define PQCLEAN_MCELIECE348864_AVX_UTIL_H
/*
This file is for loading/storing data in a little-endian fashion
*/


#include "gf.h"
#include "vec128.h"

#include <stdint.h>

void PQCLEAN_MCELIECE348864_AVX_store_i(unsigned char *out, uint64_t in, int i);
void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a);

uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src);

uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src);

void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in);

void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in);

uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in);

gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a);

vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in);

void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in);

#endif


+ 25
- 0
crypto_kem/mceliece348864/avx/vec.c View File

@@ -0,0 +1,25 @@
#include "vec.h"

#include "params.h"

extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *);
extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(uint64_t *, const uint64_t *, const uint64_t *);


void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) {
PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(h, f, g);
}


void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g) {
PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(h, f, g);
}

void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) {
int b;

for (b = 0; b < GFBITS; b++) {
h[b] = f[b] ^ g[b];
}
}


+ 13
- 0
crypto_kem/mceliece348864/avx/vec.h View File

@@ -0,0 +1,13 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_VEC_H
#define PQCLEAN_MCELIECE348864_AVX_VEC_H

#include <stdint.h>


void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g);

void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g);

void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g);

#endif

+ 83
- 0
crypto_kem/mceliece348864/avx/vec128.c View File

@@ -0,0 +1,83 @@
#include "vec128.h"

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a) {
return _mm_set1_epi16(a);
}

int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a) {
return _mm_testz_si128(a, a);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void) {
return _mm_setzero_si128();
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b) {
return _mm_and_si128(a, b);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b) {
return _mm_xor_si128(a, b);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b) {
return _mm_or_si128(a, b);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s) {
return _mm_slli_epi64(a, s);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s) {
return _mm_srli_epi64(a, s);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1) {
return _mm_set_epi64x(a1, a0);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b) {
return _mm_unpacklo_epi64(a, b);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b) {
return _mm_unpackhi_epi64(a, b);
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a) {
return _mm_set1_epi64x(-a);
}

void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src) {
int i;

for (i = 0; i < GFBITS; i++) {
dest[i] = src[i];
}
}

void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) {
int i;

for (i = 0; i < GFBITS; i++) {
c[i] = PQCLEAN_MCELIECE348864_AVX_vec128_xor(a[i], b[i]);
}
}

vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a) {
int i;
vec128 ret;

ret = a[0];
for (i = 1; i < GFBITS; i++) {
ret = PQCLEAN_MCELIECE348864_AVX_vec128_or(ret, a[i]);
}

return ret;
}

/* bitsliced field multiplications */
void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) {
PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(h, f, g, 16);
}
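
/* One idiom worth noting: vec128_setbits broadcasts a single bit into a
full-width mask by two's-complement negation, since -(uint64_t)0 is 0 and
-(uint64_t)1 is all ones; the argument is therefore expected to be 0 or 1.
A one-line scalar check of the trick: */

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint64_t zero = 0, one = 1;
    assert((uint64_t) -zero == 0x0000000000000000ULL);  /* bit 0 -> empty mask */
    assert((uint64_t) -one  == 0xFFFFFFFFFFFFFFFFULL);  /* bit 1 -> full mask  */
    return 0;
}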


+ 41
- 0
crypto_kem/mceliece348864/avx/vec128.h View File

@@ -0,0 +1,41 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_VEC128_H
#define PQCLEAN_MCELIECE348864_AVX_VEC128_H
/*
This file is for functions related to 128-bit vectors
including functions for bitsliced field operations
*/


#include "params.h"

#include <immintrin.h>
#include <stdint.h>

typedef __m128i vec128;

// this needs to be a macro, because
// _mm_extract_epi64 requires a literal int argument.
#define PQCLEAN_MCELIECE348864_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i)))

int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a);
void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src);
void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b);
vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a);

extern void PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int);

/* bitsliced field multiplications */
void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g);

#endif

+ 1369
- 0
crypto_kem/mceliece348864/avx/vec128_mul_asm.S
File diff suppressed because it is too large
View File


+ 137
- 0
crypto_kem/mceliece348864/avx/vec256.c View File

@@ -0,0 +1,137 @@
/*
This file is for functions related to 256-bit vectors
including functions for bitsliced field operations
*/

#include "vec256.h"

extern void PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm(vec256 *, vec256 *, const vec256 *);

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(uint16_t a) {
return _mm256_set1_epi16(a);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_setzero(void) {
return _mm256_setzero_si256();
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3) {
return _mm256_set_epi64x(a3, a2, a1, a0);
}

int PQCLEAN_MCELIECE348864_AVX_vec256_testz(vec256 a) {
return _mm256_testz_si256(a, a);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_and(vec256 a, vec256 b) {
return _mm256_and_si256(a, b);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_xor(vec256 a, vec256 b) {
return _mm256_xor_si256(a, b);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or(vec256 a, vec256 b) {
return _mm256_or_si256(a, b);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_sll_4x(vec256 a, int s) {
return _mm256_slli_epi64(a, s);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_srl_4x(vec256 a, int s) {
return _mm256_srli_epi64(a, s);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(vec256 a, vec256 b) {
return _mm256_permute2x128_si256(a, b, 0x20);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(vec256 a, vec256 b) {
return _mm256_permute2x128_si256(a, b, 0x31);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(vec256 a, vec256 b) {
return _mm256_unpacklo_epi64(a, b);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(vec256 a, vec256 b) {
return _mm256_unpackhi_epi64(a, b);
}

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(const vec256 *a) {
int i;
vec256 ret;

ret = a[0];
for (i = 1; i < GFBITS; i++) {
ret = PQCLEAN_MCELIECE348864_AVX_vec256_or(ret, a[i]);
}

return ret;
}

void PQCLEAN_MCELIECE348864_AVX_vec256_copy(vec256 *dest, const vec256 *src) {
int i;

for (i = 0; i < GFBITS; i++) {
dest[i] = src[i];
}
}


void PQCLEAN_MCELIECE348864_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g) {
PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm(h, f, g);
}

/* bitsliced field squarings */
void PQCLEAN_MCELIECE348864_AVX_vec256_sq(vec256 *out, const vec256 *in) {
int i;
vec256 result[GFBITS];

result[0] = in[0] ^ in[6];
result[1] = in[11];
result[2] = in[1] ^ in[7];
result[3] = in[6];
result[4] = in[2] ^ in[11] ^ in[8];
result[5] = in[7];
result[6] = in[3] ^ in[9];
result[7] = in[8];
result[8] = in[4] ^ in[10];
result[9] = in[9];
result[10] = in[5] ^ in[11];
result[11] = in[10];

for (i = 0; i < GFBITS; i++) {
out[i] = result[i];
}
}

/* bitsliced field inverses */
void PQCLEAN_MCELIECE348864_AVX_vec256_inv(vec256 *out, const vec256 *in) {
vec256 tmp_11[ GFBITS ];
vec256 tmp_1111[ GFBITS ];

PQCLEAN_MCELIECE348864_AVX_vec256_copy(out, in);

PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp_11, out, in); // ^11

PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, tmp_11);
PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp_1111, out, tmp_11); // ^1111

PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, tmp_1111);
PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(out, out, tmp_1111); // ^11111111

PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(out, out, tmp_11); // ^1111111111
PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out);
PQCLEAN_MCELIECE348864_AVX_vec256_mul(out, out, in); // ^11111111111

PQCLEAN_MCELIECE348864_AVX_vec256_sq(out, out); // ^111111111110
}
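
/* The squaring table in vec256_sq is simply z^2 reduced modulo the field
polynomial z^12 + z^3 + 1 of this parameter set, applied bit plane by bit
plane, and vec256_inv then uses Fermat's little theorem: in GF(2^12) the
inverse is x^(2^12 - 2) = x^4094, binary exponent 111111111110, built from
squarings plus five multiplications as the step comments record. A quick
integer check that the exponent bookkeeping in those comments lands on
2^12 - 2: */

#include <assert.h>

int main(void) {
    unsigned e = 1;                /* out = in                           */
    e = 2 * e;                     /* vec256_sq: ^10                     */
    unsigned e11 = e + 1;          /* mul by in: ^11                     */
    e = 2 * (2 * e11);             /* two squarings: ^1100               */
    unsigned e1111 = e + e11;      /* mul by tmp_11: ^1111               */
    e = 16 * e1111;                /* four squarings: ^11110000          */
    e = e + e1111;                 /* mul by tmp_1111: ^11111111         */
    e = 4 * e + e11;               /* sq, sq, mul by tmp_11: ^1111111111 */
    e = 2 * e + 1;                 /* sq, mul by in: ^11111111111        */
    e = 2 * e;                     /* final squaring: ^111111111110      */
    assert(e == (1u << 12) - 2);   /* 4094: the Fermat inverse exponent  */
    return 0;
}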

+ 45
- 0
crypto_kem/mceliece348864/avx/vec256.h View File

@@ -0,0 +1,45 @@
#ifndef PQCLEAN_MCELIECE348864_AVX_VEC256_H
#define PQCLEAN_MCELIECE348864_AVX_VEC256_H
/*
This file is for functions related to 256-bit vectors
including functions for bitsliced field operations
*/


#include "vec128.h"

#include <immintrin.h>

typedef __m256i vec256;

vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(uint16_t a);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_setzero(void);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3);

// Extract requires a literal argument, so these need to be macros
#define PQCLEAN_MCELIECE348864_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i)))
#define PQCLEAN_MCELIECE348864_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i)))

int PQCLEAN_MCELIECE348864_AVX_vec256_testz(vec256 a);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_and(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_xor(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_sll_4x(vec256 a, int s);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_srl_4x(vec256 a, int s);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(vec256 a, vec256 b);
vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(const vec256 *a);
void PQCLEAN_MCELIECE348864_AVX_vec256_copy(vec256 *dest, const vec256 *src);

/* bitsliced field multiplications */
void PQCLEAN_MCELIECE348864_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g);
void PQCLEAN_MCELIECE348864_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/);
void PQCLEAN_MCELIECE348864_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/);

extern void PQCLEAN_MCELIECE348864_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *);
extern void PQCLEAN_MCELIECE348864_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *);

#endif


+ 1736
- 0
crypto_kem/mceliece348864/avx/vec256_mul_asm.S
File diff suppressed because it is too large
View File


+ 1106
- 0
crypto_kem/mceliece348864/avx/vec_mul_asm.S
File diff suppressed because it is too large
View File


+ 1115
- 0
crypto_kem/mceliece348864/avx/vec_mul_sp_asm.S
File diff suppressed because it is too large
View File


+ 356
- 0
crypto_kem/mceliece348864/avx/vec_reduce_asm.S View File

@@ -0,0 +1,356 @@

# qhasm: int64 input_0

# qhasm: int64 input_1

# qhasm: int64 input_2

# qhasm: int64 input_3

# qhasm: int64 input_4

# qhasm: int64 input_5

# qhasm: stack64 input_6

# qhasm: stack64 input_7

# qhasm: int64 caller_r11

# qhasm: int64 caller_r12

# qhasm: int64 caller_r13

# qhasm: int64 caller_r14

# qhasm: int64 caller_r15

# qhasm: int64 caller_rbx

# qhasm: int64 caller_rbp

# qhasm: int64 t

# qhasm: int64 c

# qhasm: int64 r

# qhasm: enter vec_reduce_asm
.p2align 5
.global _PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm
.global PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm
_PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm:
PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm:
mov %rsp,%r11
and $31,%r11
add $0,%r11
sub %r11,%rsp

# qhasm: r = 0
# asm 1: mov $0,>r=int64#7
# asm 2: mov $0,>r=%rax
mov $0,%rax

# qhasm: t = mem64[ input_0 + 88 ]
# asm 1: movq 88(<input_0=int64#1),>t=int64#2
# asm 2: movq 88(<input_0=%rdi),>t=%rsi
movq 88(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 80 ]
# asm 1: movq 80(<input_0=int64#1),>t=int64#2
# asm 2: movq 80(<input_0=%rdi),>t=%rsi
movq 80(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 72 ]
# asm 1: movq 72(<input_0=int64#1),>t=int64#2
# asm 2: movq 72(<input_0=%rdi),>t=%rsi
movq 72(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 64 ]
# asm 1: movq 64(<input_0=int64#1),>t=int64#2
# asm 2: movq 64(<input_0=%rdi),>t=%rsi
movq 64(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 56 ]
# asm 1: movq 56(<input_0=int64#1),>t=int64#2
# asm 2: movq 56(<input_0=%rdi),>t=%rsi
movq 56(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 48 ]
# asm 1: movq 48(<input_0=int64#1),>t=int64#2
# asm 2: movq 48(<input_0=%rdi),>t=%rsi
movq 48(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 40 ]
# asm 1: movq 40(<input_0=int64#1),>t=int64#2
# asm 2: movq 40(<input_0=%rdi),>t=%rsi
movq 40(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 32 ]
# asm 1: movq 32(<input_0=int64#1),>t=int64#2
# asm 2: movq 32(<input_0=%rdi),>t=%rsi
movq 32(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 24 ]
# asm 1: movq 24(<input_0=int64#1),>t=int64#2
# asm 2: movq 24(<input_0=%rdi),>t=%rsi
movq 24(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 16 ]
# asm 1: movq 16(<input_0=int64#1),>t=int64#2
# asm 2: movq 16(<input_0=%rdi),>t=%rsi
movq 16(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 8 ]
# asm 1: movq 8(<input_0=int64#1),>t=int64#2
# asm 2: movq 8(<input_0=%rdi),>t=%rsi
movq 8(%rdi),%rsi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#2, >c=int64#2
# asm 2: popcnt <t=%rsi, >c=%rsi
popcnt %rsi, %rsi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#2d
# asm 2: and $1,<c=%esi
and $1,%esi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#2,<r=int64#7
# asm 2: or <c=%rsi,<r=%rax
or %rsi,%rax

# qhasm: t = mem64[ input_0 + 0 ]
# asm 1: movq 0(<input_0=int64#1),>t=int64#1
# asm 2: movq 0(<input_0=%rdi),>t=%rdi
movq 0(%rdi),%rdi

# qhasm: c = count(t)
# asm 1: popcnt <t=int64#1, >c=int64#1
# asm 2: popcnt <t=%rdi, >c=%rdi
popcnt %rdi, %rdi

# qhasm: (uint32) c &= 1
# asm 1: and $1,<c=int64#1d
# asm 2: and $1,<c=%edi
and $1,%edi

# qhasm: r <<= 1
# asm 1: shl $1,<r=int64#7
# asm 2: shl $1,<r=%rax
shl $1,%rax

# qhasm: r |= c
# asm 1: or <c=int64#1,<r=int64#7
# asm 2: or <c=%rdi,<r=%rax
or %rdi,%rax

# qhasm: return r
add %r11,%rsp
ret
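
As a reading aid only (not part of this commit), the qhasm routine above is equivalent to the plain-C sketch below: it returns a GFBITS-bit field element whose bit i is the parity (popcount mod 2) of the i-th 64-bit input word. The helper name is hypothetical and the popcount builtin is a GCC/Clang assumption.

#include <stdint.h>

/* hypothetical C equivalent of PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm */
static uint16_t vec_reduce_ref(const uint64_t in[12]) {
    uint16_t r = 0;
    int i;

    for (i = 11; i >= 0; i--) {                           /* the asm walks offsets 88 down to 0 */
        r <<= 1;                                          /* shl $1, r                          */
        r |= (uint16_t)(__builtin_popcountll(in[i]) & 1); /* popcnt; keep only the parity bit   */
    }

    return r;
}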

+ 16
- 0
crypto_kem/mceliece348864/clean/LICENSE View File

@@ -0,0 +1,16 @@
Public Domain.

Authors of Classic McEliece in alphabetical order:

Daniel J. Bernstein, University of Illinois at Chicago
Tung Chou, Osaka University
Tanja Lange, Technische Universiteit Eindhoven
Ingo von Maurich, self
Rafael Misoczki, Intel Corporation
Ruben Niederhagen, Fraunhofer SIT
Edoardo Persichetti, Florida Atlantic University
Christiane Peters, self
Peter Schwabe, Radboud University
Nicolas Sendrier, Inria
Jakub Szefer, Yale University
Wen Wang, Yale University

+ 27
- 0
crypto_kem/mceliece348864/clean/Makefile View File

@@ -0,0 +1,27 @@
# This Makefile can be used with GNU Make or BSD Make

LIB = libmceliece348864_clean.a

SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c gf.c \
operations.c pk_gen.c root.c sk_gen.c synd.c transpose.c util.c
HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \
decrypt.h encrypt.h gf.h params.h pk_gen.h root.h \
sk_gen.h synd.h transpose.h util.h
OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o gf.o \
operations.o pk_gen.o root.o sk_gen.o synd.o transpose.o util.o

CFLAGS = -O3 -std=c99 -Wall -Wextra -pedantic -Werror -Wpedantic \
-Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \
-I../../../common/ $(EXTRAFLAGS)

all: $(LIB)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
$(AR) -r $@ $(OBJECTS)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)

+ 24
- 0
crypto_kem/mceliece348864/clean/Makefile.Microsoft_nmake View File

@@ -0,0 +1,24 @@
# This Makefile can be used with Microsoft Visual Studio's nmake

LIBRARY = libmceliece348864_clean.lib

OBJECTS = aes256ctr.obj benes.obj bm.obj controlbits.obj decrypt.obj encrypt.obj gf.obj \
operations.obj pk_gen.obj root.obj sk_gen.obj synd.obj transpose.obj util.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as
# long as there has been a C standard, and we do that a lot, especially
# for constant-time computations. Thus, we disable that spurious warning.
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /wd4146 /WX

all: $(LIBRARY)

# Make sure objects are recompiled if headers change.
$(OBJECTS): *.h

$(LIBRARY): $(OBJECTS)
LIB.EXE /NOLOGO /WX /OUT:$@ $**

clean:
-DEL $(OBJECTS)
-DEL $(LIBRARY)

+ 13
- 0
crypto_kem/mceliece348864/clean/aes256ctr.c View File

@@ -0,0 +1,13 @@
#include "aes256ctr.h"

void PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(
uint8_t *out,
size_t outlen,
const uint8_t nonce[AESCTR_NONCEBYTES],
const uint8_t key[AES256_KEYBYTES]) {

aes256ctx state;
aes256_keyexp(&state, key);
aes256_ctr(out, outlen, nonce, &state);
aes256_ctx_release(&state);
}

+ 17
- 0
crypto_kem/mceliece348864/clean/aes256ctr.h View File

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_AES256CTR_H
#define PQCLEAN_MCELIECE348864_CLEAN_AES256CTR_H

#include <stddef.h>
#include <stdint.h>

#include "aes.h"


void PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(
uint8_t *out,
size_t outlen,
const uint8_t nonce[AESCTR_NONCEBYTES],
const uint8_t key[AES256_KEYBYTES]
);

#endif

+ 32
- 0
crypto_kem/mceliece348864/clean/api.h View File

@@ -0,0 +1,32 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_API_H
#define PQCLEAN_MCELIECE348864_CLEAN_API_H

#include <stdint.h>

#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_ALGNAME "Classic McEliece 348864"
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_PUBLICKEYBYTES 261120
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_SECRETKEYBYTES 6452
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_CIPHERTEXTBYTES 128
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_BYTES 32


int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc(
uint8_t *c,
uint8_t *key,
const uint8_t *pk
);

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec(
uint8_t *key,
const uint8_t *c,
const uint8_t *sk
);

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair
(
uint8_t *pk,
uint8_t *sk
);

#endif


+ 139
- 0
crypto_kem/mceliece348864/clean/benes.c View File

@@ -0,0 +1,139 @@
/*
This file is for Benes network related functions
*/

#include "benes.h"

#include "params.h"
#include "transpose.h"
#include "util.h"

/* one layer of the benes network */
static void layer(uint64_t *data, uint64_t *bits, int lgs) {
int i, j, s;

uint64_t d;

s = 1 << lgs;

for (i = 0; i < 64; i += s * 2) {
for (j = i; j < i + s; j++) {
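/* conditionally swap data[j] and data[j + s], controlled by the next condition bit */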

d = (data[j + 0] ^ data[j + s]);
d &= (*bits++);
data[j + 0] ^= d;
data[j + s] ^= d;
}
}
}

/* input: r, sequence of bits to be permuted */
/* bits, condition bits of the Benes network */
/* rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) {
int i;

const unsigned char *cond_ptr;
int inc, low;

uint64_t bs[64];
uint64_t cond[64];

//

for (i = 0; i < 64; i++) {
bs[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(r + i * 8);
}

if (rev == 0) {
inc = 256;
cond_ptr = bits;
} else {
inc = -256;
cond_ptr = bits + (2 * GFBITS - 2) * 256;
}

//

PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs);

for (low = 0; low <= 5; low++) {
for (i = 0; i < 64; i++) {
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4);
}
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond);
layer(bs, cond, low);
cond_ptr += inc;
}

PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs);

for (low = 0; low <= 5; low++) {
for (i = 0; i < 32; i++) {
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8);
}
layer(bs, cond, low);
cond_ptr += inc;
}
for (low = 4; low >= 0; low--) {
for (i = 0; i < 32; i++) {
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8);
}
layer(bs, cond, low);
cond_ptr += inc;
}

PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs);

for (low = 5; low >= 0; low--) {
for (i = 0; i < 64; i++) {
cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4);
}
PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond);
layer(bs, cond, low);
cond_ptr += inc;
}

PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs);


for (i = 0; i < 64; i++) {
PQCLEAN_MCELIECE348864_CLEAN_store8(r + i * 8, bs[i]);
}
}

/* input: condition bits c */
/* output: support s */
void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf *s, const unsigned char *c) {
gf a;
int i, j;
unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ];

for (i = 0; i < GFBITS; i++) {
for (j = 0; j < (1 << GFBITS) / 8; j++) {
L[i][j] = 0;
}
}

for (i = 0; i < (1 << GFBITS); i++) {
a = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf) i);

for (j = 0; j < GFBITS; j++) {
L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8);
}
}

for (j = 0; j < GFBITS; j++) {
PQCLEAN_MCELIECE348864_CLEAN_apply_benes(L[j], c, 0);
}

for (i = 0; i < SYS_N; i++) {
s[i] = 0;
for (j = GFBITS - 1; j >= 0; j--) {
s[i] <<= 1;
s[i] |= (L[j][i / 8] >> (i % 8)) & 1;
}
}
}


+ 14
- 0
crypto_kem/mceliece348864/clean/benes.h View File

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_BENES_H
#define PQCLEAN_MCELIECE348864_CLEAN_BENES_H
/*
This file is for Benes network related functions
*/


#include "gf.h"

void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char * /*r*/, const unsigned char * /*bits*/, int /*rev*/);
void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf * /*s*/, const unsigned char * /*c*/);

#endif


+ 83
- 0
crypto_kem/mceliece348864/clean/bm.c View File

@@ -0,0 +1,83 @@
/*
This file is for the Berlekamp-Massey algorithm
see http://crypto.stanford.edu/~mironov/cs359/massey.pdf
*/
#include "bm.h"

#include "params.h"

#define min(a, b) (((a) < (b)) ? (a) : (b))

/* the Berlekamp-Massey algorithm */
/* input: s, sequence of field elements */
/* output: out, minimal polynomial of s */
void PQCLEAN_MCELIECE348864_CLEAN_bm(gf *out, gf *s) {
int i;

uint16_t N = 0;
uint16_t L = 0;
uint16_t mle;
uint16_t mne;

gf T[ SYS_T + 1 ];
gf C[ SYS_T + 1 ];
gf B[ SYS_T + 1 ];

gf b = 1, d, f;

//

for (i = 0; i < SYS_T + 1; i++) {
C[i] = B[i] = 0;
}

B[1] = C[0] = 1;

//

for (N = 0; N < 2 * SYS_T; N++) {
d = 0;

for (i = 0; i <= min(N, SYS_T); i++) {
d ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(C[i], s[ N - i]);
}
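/* d is the discrepancy; build branch-free masks: mne = 0xFFFF iff d != 0, mle = 0xFFFF iff d != 0 and N >= 2*L */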

mne = d;
mne -= 1;
mne >>= 15;
mne -= 1;
mle = N;
mle -= 2 * L;
mle >>= 15;
mle -= 1;
mle &= mne;

for (i = 0; i <= SYS_T; i++) {
T[i] = C[i];
}

f = PQCLEAN_MCELIECE348864_CLEAN_gf_frac(b, d);

for (i = 0; i <= SYS_T; i++) {
C[i] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(f, B[i]) & mne;
}

L = (L & ~mle) | ((N + 1 - L) & mle);

for (i = 0; i <= SYS_T; i++) {
B[i] = (B[i] & ~mle) | (T[i] & mle);
}

b = (b & ~mle) | (d & mle);

for (i = SYS_T; i >= 1; i--) {
B[i] = B[i - 1];
}
B[0] = 0;
}

for (i = 0; i <= SYS_T; i++) {
out[i] = C[ SYS_T - i ];
}
}


+ 13
- 0
crypto_kem/mceliece348864/clean/bm.h View File

@@ -0,0 +1,13 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_BM_H
#define PQCLEAN_MCELIECE348864_CLEAN_BM_H
/*
This file is for the Berlekamp-Massey algorithm
see http://crypto.stanford.edu/~mironov/cs359/massey.pdf
*/

#include "gf.h"

void PQCLEAN_MCELIECE348864_CLEAN_bm(gf * /*out*/, gf * /*s*/);

#endif


+ 274
- 0
crypto_kem/mceliece348864/clean/controlbits.c View File

@@ -0,0 +1,274 @@
/*
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
*/

#include "controlbits.h"

#include "params.h"

#include <stdint.h>

typedef uint8_t bit;

#define N (1 << GFBITS)
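/* constant-time comparison helpers: is_smaller returns 1 iff a < b (for the operand ranges used here); is_smaller_63b is the 64-bit variant */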

static bit is_smaller(uint32_t a, uint32_t b) {
uint32_t ret = 0;

ret = a - b;
ret >>= 31;

return (bit)ret;
}

static bit is_smaller_63b(uint64_t a, uint64_t b) {
uint64_t ret = 0;

ret = a - b;
ret >>= 63;

return (bit)ret;
}

static void cswap(uint32_t *x, uint32_t *y, bit swap) {
uint32_t m;
uint32_t d;

m = swap;
m = 0 - m;

d = (*x ^ *y);
d &= m;
*x ^= d;
*y ^= d;
}

static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) {
uint64_t m;
uint64_t d;

m = swap;
m = 0 - m;

d = (*x ^ *y);
d &= m;
*x ^= d;
*y ^= d;
}

/* output x = min(input x,input y) */
/* output y = max(input x,input y) */

static void minmax(uint32_t *x, uint32_t *y) {
bit m;

m = is_smaller(*y, *x);
cswap(x, y, m);
}

static void minmax_63b(uint64_t *x, uint64_t *y) {
bit m;

m = is_smaller_63b(*y, *x);
cswap_63b(x, y, m);
}

/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */
/* requires n to be a power of 2 */

static void merge(int n, uint32_t *x, int step) {
int i;
if (n == 1) {
minmax(&x[0], &x[step]);
} else {
merge(n / 2, x, step * 2);
merge(n / 2, x + step, step * 2);
for (i = 1; i < 2 * n - 1; i += 2) {
minmax(&x[i * step], &x[(i + 1) * step]);
}
}
}

static void merge_63b(int n, uint64_t *x, int step) {
int i;
if (n == 1) {
minmax_63b(&x[0], &x[step]);
} else {
merge_63b(n / 2, x, step * 2);
merge_63b(n / 2, x + step, step * 2);
for (i = 1; i < 2 * n - 1; i += 2) {
minmax_63b(&x[i * step], &x[(i + 1) * step]);
}
}
}

/* sort x[0],x[1],...,x[n-1] in place */
/* requires n to be a power of 2 */

static void sort(int n, uint32_t *x) {
if (n <= 1) {
return;
}
sort(n / 2, x);
sort(n / 2, x + n / 2);
merge(n / 2, x, 1);
}

void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x) {
if (n <= 1) {
return;
}
PQCLEAN_MCELIECE348864_CLEAN_sort_63b(n / 2, x);
PQCLEAN_MCELIECE348864_CLEAN_sort_63b(n / 2, x + n / 2);
merge_63b(n / 2, x, 1);
}

/* y[pi[i]] = x[i] */
/* requires n = 2^w */
/* requires pi to be a permutation */
static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC
int i;
uint32_t t[2 * N];

for (i = 0; i < n; ++i) {
t[i] = x[i] | (pi[i] << 16);
}

sort(n, t);

for (i = 0; i < n; ++i) {
y[i] = t[i] & 0xFFFF;
}
}

/* ip[i] = j iff pi[i] = j */
/* requires n = 2^w */
/* requires pi to be a permutation */
static void invert(int n, uint32_t *ip, const uint32_t *pi) {
int i;

for (i = 0; i < n; i++) {
ip[i] = i;
}

composeinv(n, ip, ip, pi);
}


static void flow(int w, uint32_t *x, const uint32_t *y, int t) {
bit m0;
bit m1;

uint32_t b;
uint32_t y_copy = *y;

m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1));
m1 = is_smaller(0, t);

cswap(x, &y_copy, m0);
b = m0 & m1;
*x ^= b << w;
}

/* input: permutation pi */
/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */
/* requires n = 2^w */
static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) {
int i;
int j;
int k;
int t;
uint32_t ip[N] = {0};
uint32_t I[2 * N] = {0};
uint32_t P[2 * N] = {0};
uint32_t PI[2 * N] = {0};
uint32_t T[2 * N] = {0};
uint32_t piflip[N] = {0};
uint32_t subpi[2][N / 2] = {{0}};

if (w == 1) {
c[ off / 8 ] |= (pi[0] & 1) << (off % 8);
}
if (w <= 1) {
return;
}

invert(n, ip, pi);

for (i = 0; i < n; ++i) {
I[i] = ip[i] | (1 << w);
I[n + i] = pi[i];
}

for (i = 0; i < 2 * n; ++i) {
P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w);
}

for (t = 0; t < w; ++t) {
composeinv(2 * n, PI, P, I);

for (i = 0; i < 2 * n; ++i) {
flow(w, &P[i], &PI[i], t);
}

for (i = 0; i < 2 * n; ++i) {
T[i] = I[i ^ 1];
}

composeinv(2 * n, I, I, T);

for (i = 0; i < 2 * n; ++i) {
T[i] = P[i ^ 1];
}

for (i = 0; i < 2 * n; ++i) {
flow(w, &P[i], &T[i], 1);
}
}

for (i = 0; i < n; ++i) {
for (j = 0; j < w; ++j) {
piflip[i] = pi[i];
}
}

for (i = 0; i < n / 2; ++i) {
c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8);
}
for (i = 0; i < n / 2; ++i) {
c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8);
}

for (i = 0; i < n / 2; ++i) {
cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1);
}

for (k = 0; k < 2; ++k) {
for (i = 0; i < n / 2; ++i) {
subpi[k][i] = piflip[i * 2 + k] >> 1;
}
}

for (k = 0; k < 2; ++k) {
controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]);
}
}

/* input: pi, a permutation*/
/* output: out, control bits w.r.t. pi */
void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) {
unsigned int i;
unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ];

for (i = 0; i < sizeof(c); i++) {
c[i] = 0;
}

controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi);

for (i = 0; i < sizeof(c); i++) {
out[i] = c[i];
}
}


+ 15
- 0
crypto_kem/mceliece348864/clean/controlbits.h View File

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H
#define PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H
/*
This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation
see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf
*/


#include <stdint.h>

void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x);
void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi);

#endif


+ 7
- 0
crypto_kem/mceliece348864/clean/crypto_hash.h View File

@@ -0,0 +1,7 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H
#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H
#include "fips202.h"

#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen)

#endif

+ 90
- 0
crypto_kem/mceliece348864/clean/decrypt.c View File

@@ -0,0 +1,90 @@
/*
This file is for Niederreiter decryption
*/

#include "decrypt.h"

#include "benes.h"
#include "bm.h"
#include "gf.h"
#include "params.h"
#include "root.h"
#include "synd.h"
#include "util.h"

/* Niederreiter decryption with the Berlekamp decoder */
/* input: sk, secret key */
/* c, ciphertext */
/* output: e, error vector */
/* return: 0 for success; 1 for failure */
int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) {
int i, w = 0;
uint16_t check;

unsigned char r[ SYS_N / 8 ];

gf g[ SYS_T + 1 ];
gf L[ SYS_N ];

gf s[ SYS_T * 2 ];
gf s_cmp[ SYS_T * 2 ];
gf locator[ SYS_T + 1 ];
gf images[ SYS_N ];

gf t;

//

for (i = 0; i < SYND_BYTES; i++) {
r[i] = c[i];
}
for (i = SYND_BYTES; i < SYS_N / 8; i++) {
r[i] = 0;
}

for (i = 0; i < SYS_T; i++) {
g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk);
g[i] &= GFMASK;
sk += 2;
}
g[ SYS_T ] = 1;

PQCLEAN_MCELIECE348864_CLEAN_support_gen(L, sk);

PQCLEAN_MCELIECE348864_CLEAN_synd(s, g, L, r);

PQCLEAN_MCELIECE348864_CLEAN_bm(locator, s);

PQCLEAN_MCELIECE348864_CLEAN_root(images, locator, L);

//

for (i = 0; i < SYS_N / 8; i++) {
e[i] = 0;
}

for (i = 0; i < SYS_N; i++) {
t = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(images[i]) & 1;

e[ i / 8 ] |= t << (i % 8);
w += t;

}

PQCLEAN_MCELIECE348864_CLEAN_synd(s_cmp, g, L, e);

//
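/* the result is 0 (success) iff the recovered error vector has weight SYS_T and reproduces the syndrome */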

check = (uint16_t)w;
check ^= SYS_T;

for (i = 0; i < SYS_T * 2; i++) {
check |= s[i] ^ s_cmp[i];
}

check -= 1;
check >>= 15;

return check ^ 1;
}


+ 10
- 0
crypto_kem/mceliece348864/clean/decrypt.h View File

@@ -0,0 +1,10 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H
#define PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H
/*
This file is for Niederreiter decryption
*/

int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/);

#endif


+ 138
- 0
crypto_kem/mceliece348864/clean/encrypt.c View File

@@ -0,0 +1,138 @@
/*
This file is for Niederreiter encryption
*/

#include "encrypt.h"

#include "params.h"
#include "randombytes.h"
#include "util.h"

#include <stdint.h>
#include <string.h>

#include "gf.h"
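/* same_mask(x, y): constant-time, returns 0xFF if x == y and 0 otherwise */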

static inline uint8_t same_mask(uint16_t x, uint16_t y) {
uint32_t mask;

mask = x ^ y;
mask -= 1;
mask >>= 31;
mask = -mask;

return (uint8_t)mask;
}

/* output: e, an error vector of weight t */
static void gen_e(unsigned char *e) {
size_t i, j;
int eq, count;

uint16_t ind_[ SYS_T * 2 ];
uint8_t *ind_8 = (uint8_t *)ind_;
uint16_t ind[ SYS_T * 2 ];
uint8_t mask;
unsigned char val[ SYS_T ];

while (1) {
randombytes(ind_8, sizeof(ind_));
// Copy to uint16_t ind_ in a little-endian way
for (i = 0; i < sizeof(ind_); i += 2) {
ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i];
}

for (i = 0; i < SYS_T * 2; i++) {
ind_[i] &= GFMASK;
}

// moving and counting indices in the correct range

count = 0;
for (i = 0; i < SYS_T * 2; i++) {
if (ind_[i] < SYS_N) {
ind[ count++ ] = ind_[i];
}
}

if (count < SYS_T) {
continue;
}

// check for repetition

eq = 0;

for (i = 1; i < SYS_T; i++) {
for (j = 0; j < i; j++) {
if (ind[i] == ind[j]) {
eq = 1;
}
}
}

if (eq == 0) {
break;
}
}

for (j = 0; j < SYS_T; j++) {
val[j] = 1 << (ind[j] & 7);
}

for (i = 0; i < SYS_N / 8; i++) {
e[i] = 0;

for (j = 0; j < SYS_T; j++) {
mask = same_mask((uint16_t)i, (ind[j] >> 3));

e[i] |= val[j] & mask;
}
}
}

/* input: public key pk, error vector e */
/* output: syndrome s */
static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) {
unsigned char b, row[SYS_N / 8];
const unsigned char *pk_ptr = pk;

int i, j;

for (i = 0; i < SYND_BYTES; i++) {
s[i] = 0;
}
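/* for each of the PK_NROWS rows of H = (I | T): rebuild the row from the public key and add its dot product with e to the syndrome */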

for (i = 0; i < PK_NROWS; i++) {
for (j = 0; j < SYS_N / 8; j++) {
row[j] = 0;
}

for (j = 0; j < PK_ROW_BYTES; j++) {
row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j];
}

row[i / 8] |= 1 << (i % 8);

b = 0;
for (j = 0; j < SYS_N / 8; j++) {
b ^= row[j] & e[j];
}

b ^= b >> 4;
b ^= b >> 2;
b ^= b >> 1;
b &= 1;

s[ i / 8 ] |= (b << (i % 8));

pk_ptr += PK_ROW_BYTES;
}
}

void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) {
gen_e(e);

syndrome(s, pk, e);
}


+ 11
- 0
crypto_kem/mceliece348864/clean/encrypt.h View File

@@ -0,0 +1,11 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H
#define PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H
/*
This file is for Niederreiter encryption
*/


void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/);

#endif


+ 139
- 0
crypto_kem/mceliece348864/clean/gf.c View File

@@ -0,0 +1,139 @@
/*
This file is for functions for field arithmetic
*/

#include "gf.h"

#include "params.h"

gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a) {
uint32_t t = a;

t -= 1;
t >>= 19;

return (gf) t;
}

gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1) {
return in0 ^ in1;
}

gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1) {
int i;

uint32_t tmp;
uint32_t t0;
uint32_t t1;
uint32_t t;

t0 = in0;
t1 = in1;

tmp = t0 * (t1 & 1);

for (i = 1; i < GFBITS; i++) {
tmp ^= (t0 * (t1 & (1 << i)));
}
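/* reduce the (up to 23-bit) product modulo the field polynomial z^12 + z^3 + 1 */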

t = tmp & 0x7FC000;
tmp ^= t >> 9;
tmp ^= t >> 12;

t = tmp & 0x3000;
tmp ^= t >> 9;
tmp ^= t >> 12;

return tmp & ((1 << GFBITS) - 1);
}

/* input: field element in */
/* return: in^2 */
static inline gf gf_sq(gf in) {
const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};

uint32_t x = in;
uint32_t t;

x = (x | (x << 8)) & B[3];
x = (x | (x << 4)) & B[2];
x = (x | (x << 2)) & B[1];
x = (x | (x << 1)) & B[0];

t = x & 0x7FC000;
x ^= t >> 9;
x ^= t >> 12;

t = x & 0x3000;
x ^= t >> 9;
x ^= t >> 12;

return x & ((1 << GFBITS) - 1);
}

gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in) {
gf tmp_11;
gf tmp_1111;

gf out = in;

out = gf_sq(out);
tmp_11 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, in); // 11

out = gf_sq(tmp_11);
out = gf_sq(out);
tmp_1111 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_11); // 1111

out = gf_sq(tmp_1111);
out = gf_sq(out);
out = gf_sq(out);
out = gf_sq(out);
out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_1111); // 11111111

out = gf_sq(out);
out = gf_sq(out);
out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_11); // 1111111111

out = gf_sq(out);
out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, in); // 11111111111

return gf_sq(out); // 111111111110
}

/* input: field element den, num */
/* return: (num/den) */
gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num) {
return PQCLEAN_MCELIECE348864_CLEAN_gf_mul(PQCLEAN_MCELIECE348864_CLEAN_gf_inv(den), num);
}

/* input: in0, in1 in GF((2^m)^t) */
/* output: out = in0*in1 */
void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) {
int i, j;

gf prod[ SYS_T * 2 - 1 ];

for (i = 0; i < SYS_T * 2 - 1; i++) {
prod[i] = 0;
}

for (i = 0; i < SYS_T; i++) {
for (j = 0; j < SYS_T; j++) {
prod[i + j] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(in0[i], in1[j]);
}
}

//
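// reduce terms of degree >= SYS_T modulo the degree-SYS_T polynomial that defines the extension field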

for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) {
prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 877);
prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 2888);
prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 1781);
prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 373);
}

for (i = 0; i < SYS_T; i++) {
out[i] = prod[i];
}
}


+ 22
- 0
crypto_kem/mceliece348864/clean/gf.h View File

@@ -0,0 +1,22 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_GF_H
#define PQCLEAN_MCELIECE348864_CLEAN_GF_H
/*
This file is for functions for field arithmetic
*/


#include <stdint.h>

typedef uint16_t gf;

gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a);
gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1);
gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1);
gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num);
gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in);
uint64_t PQCLEAN_MCELIECE348864_CLEAN_gf_mul2(gf a, gf b0, gf b1);

void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1);

#endif


+ 136
- 0
crypto_kem/mceliece348864/clean/operations.c View File

@@ -0,0 +1,136 @@
#include "api.h"

#include "aes256ctr.h"
#include "controlbits.h"
#include "crypto_hash.h"
#include "decrypt.h"
#include "encrypt.h"
#include "params.h"
#include "pk_gen.h"
#include "randombytes.h"
#include "sk_gen.h"
#include "util.h"

#include <stdint.h>
#include <string.h>

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc(
uint8_t *c,
uint8_t *key,
const uint8_t *pk
) {
uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
uint8_t *e = two_e + 1;
uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1};

PQCLEAN_MCELIECE348864_CLEAN_encrypt(c, e, pk);

crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e));

memcpy(one_ec + 1, e, SYS_N / 8);
memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32);

crypto_hash_32b(key, one_ec, sizeof(one_ec));

return 0;
}

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec(
uint8_t *key,
const uint8_t *c,
const uint8_t *sk
) {
int i;

uint8_t ret_confirm = 0;
uint8_t ret_decrypt = 0;

uint16_t m;

uint8_t conf[32];
uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
uint8_t *e = two_e + 1;
uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ];
uint8_t *x = preimage;

//

ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_CLEAN_decrypt(e, sk + SYS_N / 8, c);

crypto_hash_32b(conf, two_e, sizeof(two_e));

for (i = 0; i < 32; i++) {
ret_confirm |= conf[i] ^ c[SYND_BYTES + i];
}

m = ret_decrypt | ret_confirm;
m -= 1;
m >>= 8;
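/* m = 0xFF if decryption succeeded and the confirmation hash matched, 0 otherwise; */
/* on failure, the random string stored at the start of sk is hashed instead of e (implicit rejection) */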

*x++ = (~m & 0) | (m & 1);
for (i = 0; i < SYS_N / 8; i++) {
*x++ = (~m & sk[i]) | (m & e[i]);
}
for (i = 0; i < SYND_BYTES + 32; i++) {
*x++ = c[i];
}

crypto_hash_32b(key, preimage, sizeof(preimage));

return 0;
}

int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair
(
uint8_t *pk,
uint8_t *sk
) {
int i;
uint8_t seed[ 32 ];
uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ];
uint8_t nonce[ 16 ] = {0};
uint8_t *rp;

gf f[ SYS_T ]; // element in GF(2^mt)
gf irr[ SYS_T ]; // Goppa polynomial
uint32_t perm[ 1 << GFBITS ]; // random permutation

randombytes(seed, sizeof(seed));

while (1) {
rp = r;
PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(r, sizeof(r), nonce, seed);
memcpy(seed, &r[ sizeof(r) - 32 ], 32);

for (i = 0; i < SYS_T; i++) {
f[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(rp + i * 2);
}
rp += sizeof(f);
if (PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(irr, f)) {
continue;
}

for (i = 0; i < (1 << GFBITS); i++) {
perm[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(rp + i * 4);
}
rp += sizeof(perm);
if (PQCLEAN_MCELIECE348864_CLEAN_perm_check(perm)) {
continue;
}

for (i = 0; i < SYS_T; i++) {
PQCLEAN_MCELIECE348864_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]);
}
if (PQCLEAN_MCELIECE348864_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) {
continue;
}

memcpy(sk, rp, SYS_N / 8);
PQCLEAN_MCELIECE348864_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm);

break;
}

return 0;
}


+ 21
- 0
crypto_kem/mceliece348864/clean/params.h View File

@@ -0,0 +1,21 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H
#define PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H

#define GFBITS 12
#define SYS_N 3488
#define SYS_T 64

#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1))
#define IRR_BYTES (SYS_T * 2)

#define PK_NROWS (SYS_T*GFBITS)
#define PK_NCOLS (SYS_N - PK_NROWS)
#define PK_ROW_BYTES ((PK_NCOLS + 7)/8)

#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES)
#define SYND_BYTES ((PK_NROWS + 7)/8)

#define GFMASK ((1 << GFBITS) - 1)
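/* for these parameters: PK_NROWS = 768, PK_NCOLS = 2720, PK_ROW_BYTES = 340, SYND_BYTES = 96, */
/* IRR_BYTES = 128, COND_BYTES = 5888; the public key is 768 * 340 = 261120 bytes and */
/* SK_BYTES = 436 + 128 + 5888 = 6452 bytes, matching the constants in api.h */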

#endif


+ 144
- 0
crypto_kem/mceliece348864/clean/pk_gen.c View File

@@ -0,0 +1,144 @@
/*
This file is for public-key generation
*/

#include <string.h>

#include "benes.h"
#include "controlbits.h"
#include "gf.h"
#include "params.h"
#include "pk_gen.h"
#include "root.h"
#include "util.h"

/* input: secret key sk */
/* output: public key pk */
int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) {
int i, j, k;
int row, c;

uint64_t buf[ 1 << GFBITS ];

uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ];
uint8_t mask;
uint8_t b;

gf g[ SYS_T + 1 ]; // Goppa polynomial
gf L[ SYS_N ]; // support
gf inv[ SYS_N ];

//

g[ SYS_T ] = 1;

for (i = 0; i < SYS_T; i++) {
g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk);
g[i] &= GFMASK;
sk += 2;
}
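/* attach the column index to each 32-bit random value; after sorting, the low bits yield the induced random permutation */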

for (i = 0; i < (1 << GFBITS); i++) {
buf[i] = perm[i];
buf[i] <<= 31;
buf[i] |= i;
}

PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, buf);

for (i = 0; i < (1 << GFBITS); i++) {
perm[i] = buf[i] & GFMASK;
}
for (i = 0; i < SYS_N; i++) {
L[i] = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf)perm[i]);
}

// filling the matrix

PQCLEAN_MCELIECE348864_CLEAN_root(inv, g, L);

for (i = 0; i < SYS_N; i++) {
inv[i] = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(inv[i]);
}

for (i = 0; i < PK_NROWS; i++) {
for (j = 0; j < SYS_N / 8; j++) {
mat[i][j] = 0;
}
}

for (i = 0; i < SYS_T; i++) {
for (j = 0; j < SYS_N; j += 8) {
for (k = 0; k < GFBITS; k++) {
b = (inv[j + 7] >> k) & 1;
b <<= 1;
b |= (inv[j + 6] >> k) & 1;
b <<= 1;
b |= (inv[j + 5] >> k) & 1;
b <<= 1;
b |= (inv[j + 4] >> k) & 1;
b <<= 1;
b |= (inv[j + 3] >> k) & 1;
b <<= 1;
b |= (inv[j + 2] >> k) & 1;
b <<= 1;
b |= (inv[j + 1] >> k) & 1;
b <<= 1;
b |= (inv[j + 0] >> k) & 1;

mat[ i * GFBITS + k ][ j / 8 ] = b;
}
}

for (j = 0; j < SYS_N; j++) {
inv[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(inv[j], L[j]);
}

}

// Gaussian elimination

for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) {
for (j = 0; j < 8; j++) {
row = i * 8 + j;

if (row >= GFBITS * SYS_T) {
break;
}

for (k = row + 1; k < GFBITS * SYS_T; k++) {
mask = mat[ row ][ i ] ^ mat[ k ][ i ];
mask >>= j;
mask &= 1;
mask = -mask;

for (c = 0; c < SYS_N / 8; c++) {
mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
}
}

if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic
return -1;
}

for (k = 0; k < GFBITS * SYS_T; k++) {
if (k != row) {
mask = mat[ k ][ i ] >> j;
mask &= 1;
mask = -mask;

for (c = 0; c < SYS_N / 8; c++) {
mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
}
}
}
}
}

for (i = 0; i < PK_NROWS; i++) {
memcpy(pk + i * PK_ROW_BYTES, mat[i] + PK_NROWS / 8, PK_ROW_BYTES);
}

return 0;
}


+ 13
- 0
crypto_kem/mceliece348864/clean/pk_gen.h View File

@@ -0,0 +1,13 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H
#define PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H
/*
This file is for public-key generation
*/


#include <stdint.h>

int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/);

#endif


+ 33
- 0
crypto_kem/mceliece348864/clean/root.c View File

@@ -0,0 +1,33 @@
/*
This file is for evaluating a polynomial at one or more field elements
*/
#include "root.h"

#include "params.h"

/* input: polynomial f and field element a */
/* return f(a) */
gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf *f, gf a) {
int i;
gf r;

r = f[ SYS_T ];

for (i = SYS_T - 1; i >= 0; i--) {
r = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(r, a);
r = PQCLEAN_MCELIECE348864_CLEAN_gf_add(r, f[i]);
}

return r;
}

/* input: polynomial f and list of field elements L */
/* output: out = [ f(a) for a in L ] */
void PQCLEAN_MCELIECE348864_CLEAN_root(gf *out, gf *f, gf *L) {
int i;

for (i = 0; i < SYS_N; i++) {
out[i] = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]);
}
}


+ 14
- 0
crypto_kem/mceliece348864/clean/root.h View File

@@ -0,0 +1,14 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_ROOT_H
#define PQCLEAN_MCELIECE348864_CLEAN_ROOT_H
/*
This file is for evaluating a polynomial at one or more field elements
*/


#include "gf.h"

gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf * /*f*/, gf /*a*/);
void PQCLEAN_MCELIECE348864_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/);

#endif


+ 98
- 0
crypto_kem/mceliece348864/clean/sk_gen.c View File

@@ -0,0 +1,98 @@
/*
This file is for secret-key generation
*/

#include "sk_gen.h"

#include "controlbits.h"
#include "gf.h"
#include "params.h"
#include "util.h"

/* input: f, element in GF((2^m)^t) */
/* output: out, minimal polynomial of f */
/* return: 0 for success and -1 for failure */
int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf *out, gf *f) {
int i, j, k, c;

gf mat[ SYS_T + 1 ][ SYS_T ];
gf mask, inv, t;

// fill matrix

mat[0][0] = 1;

for (i = 1; i < SYS_T; i++) {
mat[0][i] = 0;
}

for (i = 0; i < SYS_T; i++) {
mat[1][i] = f[i];
}

for (j = 2; j <= SYS_T; j++) {
PQCLEAN_MCELIECE348864_CLEAN_GF_mul(mat[j], mat[j - 1], f);
}

// Gaussian elimination

for (j = 0; j < SYS_T; j++) {
for (k = j + 1; k < SYS_T; k++) {
mask = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(mat[ j ][ j ]);

for (c = j; c < SYS_T + 1; c++) {
mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
}

}

if ( mat[ j ][ j ] == 0 ) { // return if not systematic
return -1;
}

inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(mat[j][j]);

for (c = j; c < SYS_T + 1; c++) {
mat[ c ][ j ] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], inv) ;
}

for (k = 0; k < SYS_T; k++) {
if (k != j) {
t = mat[ j ][ k ];

for (c = j; c < SYS_T + 1; c++) {
mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], t);
}
}
}
}

for (i = 0; i < SYS_T; i++) {
out[i] = mat[ SYS_T ][ i ];
}

return 0;
}

/* input: permutation p represented as a list of 32-bit integers */
/* output: -1 if some integer repeats in p */
/* 0 otherwise */
int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t *p) {
int i;
uint64_t list[1 << GFBITS];

for (i = 0; i < (1 << GFBITS); i++) {
list[i] = p[i];
}

PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, list);

for (i = 1; i < (1 << GFBITS); i++) {
if (list[i - 1] == list[i]) {
return -1;
}
}

return 0;
}


+ 16
- 0
crypto_kem/mceliece348864/clean/sk_gen.h View File

@@ -0,0 +1,16 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H
#define PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H
/*
This file is for secret-key generation
*/


#include "gf.h"

#include <stdint.h>

int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/);
int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t * /*p*/);

#endif


+ 33
- 0
crypto_kem/mceliece348864/clean/synd.c View File

@@ -0,0 +1,33 @@
/*
This file is for syndrome computation
*/

#include "synd.h"

#include "params.h"
#include "root.h"


/* input: Goppa polynomial f, support L, received word r */
/* output: out, the syndrome of length 2t */
void PQCLEAN_MCELIECE348864_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) {
int i, j;
gf e, e_inv, c;

for (j = 0; j < 2 * SYS_T; j++) {
out[j] = 0;
}

for (i = 0; i < SYS_N; i++) {
c = (r[i / 8] >> (i % 8)) & 1;

e = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]);
e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e, e));
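/* accumulate c * L[i]^j / g(L[i])^2 into out[j] */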

for (j = 0; j < 2 * SYS_T; j++) {
out[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, c));
e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, L[i]);
}
}
}


+ 12
- 0
crypto_kem/mceliece348864/clean/synd.h View File

@@ -0,0 +1,12 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_SYND_H
#define PQCLEAN_MCELIECE348864_CLEAN_SYND_H
/*
This file is for syndrome computation
*/

#include "gf.h"

void PQCLEAN_MCELIECE348864_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/);

#endif


+ 42
- 0
crypto_kem/mceliece348864/clean/transpose.c View File

@@ -0,0 +1,42 @@
/*
This file is for matrix transposition
*/

#include "transpose.h"

#include <stdint.h>

/* input: in, a 64x64 matrix over GF(2) */
/* output: out, transpose of in */
void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) {
int i, j, s, d;

uint64_t x, y;
uint64_t masks[6][2] = {
{0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
{0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
{0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
{0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
{0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
{0x00000000FFFFFFFF, 0xFFFFFFFF00000000}
};

for (i = 0; i < 64; i++) {
out[i] = in[i];
}
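/* each round swaps the two off-diagonal s-by-s blocks of every 2s-by-2s block, for s = 32, 16, 8, 4, 2, 1 */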

for (d = 5; d >= 0; d--) {
s = 1 << d;

for (i = 0; i < 64; i += s * 2) {
for (j = i; j < i + s; j++) {
x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);

out[j + 0] = x;
out[j + s] = y;
}
}
}
}


+ 13
- 0
crypto_kem/mceliece348864/clean/transpose.h View File

@@ -0,0 +1,13 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H
#define PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H
/*
This file is for matrix transposition
*/


#include <stdint.h>

void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/);

#endif


+ 67
- 0
crypto_kem/mceliece348864/clean/util.c View File

@@ -0,0 +1,67 @@
/*
This file is for loading/storing data in a little-endian fashion
*/

#include "util.h"

#include "params.h"

void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char *dest, gf a) {
dest[0] = a & 0xFF;
dest[1] = a >> 8;
}

uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char *src) {
uint16_t a;

a = src[1];
a <<= 8;
a |= src[0];

return a & GFMASK;
}

uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char *in) {
int i;
uint32_t ret = in[3];

for (i = 2; i >= 0; i--) {
ret <<= 8;
ret |= in[i];
}

return ret;
}

void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char *out, uint64_t in) {
out[0] = (in >> 0x00) & 0xFF;
out[1] = (in >> 0x08) & 0xFF;
out[2] = (in >> 0x10) & 0xFF;
out[3] = (in >> 0x18) & 0xFF;
out[4] = (in >> 0x20) & 0xFF;
out[5] = (in >> 0x28) & 0xFF;
out[6] = (in >> 0x30) & 0xFF;
out[7] = (in >> 0x38) & 0xFF;
}

uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char *in) {
int i;
uint64_t ret = in[7];

for (i = 6; i >= 0; i--) {
ret <<= 8;
ret |= in[i];
}

return ret;
}
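/* bitrev: reverse the 16 bits of a, then shift right by 4 to obtain the GFBITS = 12-bit reversal */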

gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf a) {
a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8);
a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4);
a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2);
a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1);

return a >> 4;
}


+ 22
- 0
crypto_kem/mceliece348864/clean/util.h View File

@@ -0,0 +1,22 @@
#ifndef PQCLEAN_MCELIECE348864_CLEAN_UTIL_H
#define PQCLEAN_MCELIECE348864_CLEAN_UTIL_H
/*
This file is for loading/storing data in a little-endian fashion
*/


#include "gf.h"
#include <stdint.h>

void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/);
uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char * /*src*/);

uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char * /*in*/);

void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/);
uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char * /*in*/);

gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf /*a*/);

#endif


+ 16
- 0
crypto_kem/mceliece348864/sse/LICENSE View File

@@ -0,0 +1,16 @@
Public Domain.

Authors of Classic McEliece in alphabetical order:

Daniel J. Bernstein, University of Illinois at Chicago
Tung Chou, Osaka University
Tanja Lange, Technische Universiteit Eindhoven
Ingo von Maurich, self
Rafael Misoczki, Intel Corporation
Ruben Niederhagen, Fraunhofer SIT
Edoardo Persichetti, Florida Atlantic University
Christiane Peters, self
Peter Schwabe, Radboud University
Nicolas Sendrier, Inria
Jakub Szefer, Yale University
Wen Wang, Yale University

+ 41
- 0
crypto_kem/mceliece348864/sse/Makefile View File

@@ -0,0 +1,41 @@
# This Makefile can be used with GNU Make or BSD Make

LIB = libmceliece348864_sse.a

SOURCES = aes256ctr.c benes.c bm.c controlbits.c decrypt.c encrypt.c fft.c \
fft_tr.c gf.c operations.c pk_gen.c sk_gen.c transpose.c util.c \
vec.c vec128.c \
consts.S syndrome_asm.S transpose_64x128_sp_asm.S \
transpose_64x64_asm.S update_asm.S vec128_mul_asm.S \
vec_mul_asm.S vec_reduce_asm.S

HEADERS = aes256ctr.h api.h benes.h bm.h controlbits.h crypto_hash.h \
decrypt.h encrypt.h fft.h fft_tr.h gf.h params.h \
pk_gen.h sk_gen.h transpose.h util.h vec128.h vec.h \
consts.inc powers.inc scalars_2x.inc scalars.inc

OBJECTS = aes256ctr.o benes.o bm.o controlbits.o decrypt.o encrypt.o fft.o \
fft_tr.o gf.o operations.o pk_gen.o transpose.o sk_gen.o util.o \
vec.o vec128.o \
consts.o syndrome_asm.o transpose_64x128_sp_asm.o \
transpose_64x64_asm.o update_asm.o vec128_mul_asm.o \
vec_mul_asm.o vec_reduce_asm.o

CFLAGS = -O3 -std=c99 -mpopcnt -mbmi -msse4.1 -Wall -Wextra -pedantic -Werror -Wpedantic \
-Wredundant-decls -Wvla -Wcast-align -Wmissing-prototypes \
-I../../../common/ $(EXTRAFLAGS)

all: $(LIB)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.S
$(CC) -c -o $@ $<

$(LIB): $(OBJECTS)
$(AR) -r $@ $(OBJECTS)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)

+ 13
- 0
crypto_kem/mceliece348864/sse/aes256ctr.c View File

@@ -0,0 +1,13 @@
#include "aes256ctr.h"

void PQCLEAN_MCELIECE348864_SSE_aes256ctr(
uint8_t *out,
size_t outlen,
const uint8_t nonce[AESCTR_NONCEBYTES],
const uint8_t key[AES256_KEYBYTES]) {

aes256ctx state;
aes256_keyexp(&state, key);
aes256_ctr(out, outlen, nonce, &state);
aes256_ctx_release(&state);
}

+ 17
- 0
crypto_kem/mceliece348864/sse/aes256ctr.h View File

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_MCELIECE348864_SSE_AES256CTR_H
#define PQCLEAN_MCELIECE348864_SSE_AES256CTR_H

#include <stddef.h>
#include <stdint.h>

#include "aes.h"


void PQCLEAN_MCELIECE348864_SSE_aes256ctr(
uint8_t *out,
size_t outlen,
const uint8_t nonce[AESCTR_NONCEBYTES],
const uint8_t key[AES256_KEYBYTES]
);

#endif

+ 32
- 0
crypto_kem/mceliece348864/sse/api.h View File

@@ -0,0 +1,32 @@
#ifndef PQCLEAN_MCELIECE348864_SSE_API_H
#define PQCLEAN_MCELIECE348864_SSE_API_H

#include <stdint.h>

#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_ALGNAME "Classic McEliece 348864"
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_PUBLICKEYBYTES 261120
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_SECRETKEYBYTES 6452
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_CIPHERTEXTBYTES 128
#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_BYTES 32


int PQCLEAN_MCELIECE348864_SSE_crypto_kem_enc(
uint8_t *c,
uint8_t *key,
const uint8_t *pk
);

int PQCLEAN_MCELIECE348864_SSE_crypto_kem_dec(
uint8_t *key,
const uint8_t *c,
const uint8_t *sk
);

int PQCLEAN_MCELIECE348864_SSE_crypto_kem_keypair
(
uint8_t *pk,
uint8_t *sk
);

#endif


+ 287
- 0
crypto_kem/mceliece348864/sse/benes.c View File

@@ -0,0 +1,287 @@
/*
This file is for Benes network related functions
*/
#include "benes.h"

#include "params.h"
#include "transpose.h"
#include "util.h"

static void layer_0(uint64_t *bs, const uint64_t *cond) {
int x;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 2) {
diff = bs[ x ] ^ bs[ x + 1 ];
diff &= *cond++;
bs[ x ] ^= diff;
bs[ x + 1 ] ^= diff;
}
}

static void layer_1(uint64_t *bs, const uint64_t *cond) {
int x;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 4) {
diff = bs[ x + 0 ] ^ bs[ x + 2 ];
diff &= cond[0];
bs[ x + 0 ] ^= diff;
bs[ x + 2 ] ^= diff;

diff = bs[ x + 1 ] ^ bs[ x + 3 ];
diff &= cond[1];
bs[ x + 1 ] ^= diff;
bs[ x + 3 ] ^= diff;

cond += 2;
}
}

static void layer_2(uint64_t *bs, const uint64_t *cond) {
int x;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 8) {
diff = bs[ x + 0 ] ^ bs[ x + 4 ];
diff &= cond[0];
bs[ x + 0 ] ^= diff;
bs[ x + 4 ] ^= diff;

diff = bs[ x + 1 ] ^ bs[ x + 5 ];
diff &= cond[1];
bs[ x + 1 ] ^= diff;
bs[ x + 5 ] ^= diff;

diff = bs[ x + 2 ] ^ bs[ x + 6 ];
diff &= cond[2];
bs[ x + 2 ] ^= diff;
bs[ x + 6 ] ^= diff;

diff = bs[ x + 3 ] ^ bs[ x + 7 ];
diff &= cond[3];
bs[ x + 3 ] ^= diff;
bs[ x + 7 ] ^= diff;

cond += 4;
}
}

static void layer_3(uint64_t *bs, const uint64_t *cond) {
int x, s;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 16) {
for (s = x; s < x + 8; s += 4) {
diff = bs[ s + 0 ] ^ bs[ s + 8 ];
diff &= cond[0];
bs[ s + 0 ] ^= diff;
bs[ s + 8 ] ^= diff;

diff = bs[ s + 1 ] ^ bs[ s + 9 ];
diff &= cond[1];
bs[ s + 1 ] ^= diff;
bs[ s + 9 ] ^= diff;

diff = bs[ s + 2 ] ^ bs[ s + 10 ];
diff &= cond[2];
bs[ s + 2 ] ^= diff;
bs[ s + 10 ] ^= diff;

diff = bs[ s + 3 ] ^ bs[ s + 11 ];
diff &= cond[3];
bs[ s + 3 ] ^= diff;
bs[ s + 11 ] ^= diff;

cond += 4;
}
}
}

static void layer_4(uint64_t *bs, const uint64_t *cond) {
int x, s;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 32) {
for (s = x; s < x + 16; s += 4) {
diff = bs[ s + 0 ] ^ bs[ s + 16 ];
diff &= cond[0];
bs[ s + 0 ] ^= diff;
bs[ s + 16 ] ^= diff;

diff = bs[ s + 1 ] ^ bs[ s + 17 ];
diff &= cond[1];
bs[ s + 1 ] ^= diff;
bs[ s + 17 ] ^= diff;

diff = bs[ s + 2 ] ^ bs[ s + 18 ];
diff &= cond[2];
bs[ s + 2 ] ^= diff;
bs[ s + 18 ] ^= diff;

diff = bs[ s + 3 ] ^ bs[ s + 19 ];
diff &= cond[3];
bs[ s + 3 ] ^= diff;
bs[ s + 19 ] ^= diff;

cond += 4;
}
}
}

static void layer_5(uint64_t *bs, const uint64_t *cond) {
int x, s;
uint64_t diff;

for (x = 0; x < (1 << 6); x += 64) {
for (s = x; s < x + 32; s += 4) {
diff = bs[ s + 0 ] ^ bs[ s + 32 ];
diff &= cond[0];
bs[ s + 0 ] ^= diff;
bs[ s + 32 ] ^= diff;

diff = bs[ s + 1 ] ^ bs[ s + 33 ];
diff &= cond[1];
bs[ s + 1 ] ^= diff;
bs[ s + 33 ] ^= diff;

diff = bs[ s + 2 ] ^ bs[ s + 34 ];
diff &= cond[2];
bs[ s + 2 ] ^= diff;
bs[ s + 34 ] ^= diff;

diff = bs[ s + 3 ] ^ bs[ s + 35 ];
diff &= cond[3];
bs[ s + 3 ] ^= diff;
bs[ s + 35 ] ^= diff;

cond += 4;
}
}
}

/* input: bits, control bits as array of bytes */
/* output: out, control bits as array of 128-bit vectors */
void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t out[][32], const unsigned char *bits) {
int i, low, block = 0;

uint64_t cond[64];

//

for (low = 0; low <= 5; low++) {
for (i = 0; i < 64; i++) {
cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(bits + block * 256 + i * 4);
}
PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond);

for (i = 0; i < 32; i++) {
out[ block ][i] = cond[i];
}
block++;
}

for (low = 0; low <= 5; low++) {
for (i = 0; i < 32; i++) {
out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(bits + block * 256 + i * 8);
}
block++;
}

for (low = 4; low >= 0; low--) {
for (i = 0; i < 32; i++) {
out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(bits + block * 256 + i * 8);
}
block++;
}

for (low = 5; low >= 0; low--) {
for (i = 0; i < 64; i++) {
cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(bits + block * 256 + i * 4);
}
PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond);

for (i = 0; i < 32; i++) {
out[ block ][i] = cond[i];
}
block++;
}
}

/* input: r, sequence of bits to be permuted */
/* cond, control bits as array of 128-bit vectors */
/* rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t *r, uint64_t cond[][32], int rev) {
int block, inc;

uint64_t *bs = r;

//

if (rev == 0) {
block = 0;
inc = 1;
} else {
block = 22;
inc = -1;
}

PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs);

layer_0(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_5(bs, cond[ block ]);
block += inc;

PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs);

layer_0(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_5(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_0(bs, cond[ block ]);
block += inc;

PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs);

layer_5(bs, cond[ block ]);
block += inc;
layer_4(bs, cond[ block ]);
block += inc;
layer_3(bs, cond[ block ]);
block += inc;
layer_2(bs, cond[ block ]);
block += inc;
layer_1(bs, cond[ block ]);
block += inc;
layer_0(bs, cond[ block ]);
//block += inc;

PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs);
}


+ 15
- 0
crypto_kem/mceliece348864/sse/benes.h View File

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_MCELIECE348864_SSE_BENES_H
#define PQCLEAN_MCELIECE348864_SSE_BENES_H
/*
This file is for Benes network related functions
*/


#include "gf.h"
#include "vec128.h"

void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/);
void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/);

#endif


+ 220
- 0
crypto_kem/mceliece348864/sse/bm.c View File

@@ -0,0 +1,220 @@
/*
This file is for the inversion-free Berlekamp-Massey algorithm
see https://ieeexplore.ieee.org/document/87857
*/

#include "bm.h"

#include "gf.h"
#include "util.h"
#include "vec.h"
#include "vec128.h"

#include <assert.h>
#include <stdint.h>

extern void PQCLEAN_MCELIECE348864_SSE_update_asm(void *, gf, int);
extern gf PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(uint64_t *);
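/* mask_nonzero(a): constant-time, all-ones if a != 0, else 0 */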

static inline uint64_t mask_nonzero(gf a) {
uint64_t ret = a;

ret -= 1;
ret >>= 63;
ret -= 1;

return ret;
}
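/* mask_leq(a, b): constant-time, all-ones if a <= b, else 0 */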

static inline uint64_t mask_leq(uint16_t a, uint16_t b) {
uint64_t a_tmp = a;
uint64_t b_tmp = b;
uint64_t ret = b_tmp - a_tmp;

ret >>= 63;
ret -= 1;

return ret;
}

static void vec_cmov(uint64_t out[][2], uint64_t mask) {
int i;

for (i = 0; i < GFBITS; i++) {
out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask);
}
}

static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) {
int s = 1 << b;

vec128 x, y;

x = PQCLEAN_MCELIECE348864_SSE_vec128_or(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx0], mask[0]),
PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx1], mask[0]), s));

y = PQCLEAN_MCELIECE348864_SSE_vec128_or(PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx0], mask[1]), s),
PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx1], mask[1]));

in[idx0] = x;
in[idx1] = y;
}

/* input: in, field elements in bitsliced form */
/* output: out, field elements in non-bitsliced form */
static inline void get_coefs(gf *out, vec128 *in) {
    int i, k;

    vec128 mask[4][2];
    vec128 buf[16];

    for (i = 0; i < GFBITS; i++) {
        buf[i] = in[i];
    }
    for (i = GFBITS; i < 16; i++) {
        buf[i] = PQCLEAN_MCELIECE348864_SSE_vec128_setzero();
    }

    mask[0][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x5555);
    mask[0][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xAAAA);
    mask[1][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x3333);
    mask[1][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xCCCC);
    mask[2][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x0F0F);
    mask[2][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xF0F0);
    mask[3][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x00FF);
    mask[3][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xFF00);

    interleave(buf, 0, 8, mask[3], 3);
    interleave(buf, 1, 9, mask[3], 3);
    interleave(buf, 2, 10, mask[3], 3);
    interleave(buf, 3, 11, mask[3], 3);
    interleave(buf, 4, 12, mask[3], 3);
    interleave(buf, 5, 13, mask[3], 3);
    interleave(buf, 6, 14, mask[3], 3);
    interleave(buf, 7, 15, mask[3], 3);

    interleave(buf, 0, 4, mask[2], 2);
    interleave(buf, 1, 5, mask[2], 2);
    interleave(buf, 2, 6, mask[2], 2);
    interleave(buf, 3, 7, mask[2], 2);
    interleave(buf, 8, 12, mask[2], 2);
    interleave(buf, 9, 13, mask[2], 2);
    interleave(buf, 10, 14, mask[2], 2);
    interleave(buf, 11, 15, mask[2], 2);

    interleave(buf, 0, 2, mask[1], 1);
    interleave(buf, 1, 3, mask[1], 1);
    interleave(buf, 4, 6, mask[1], 1);
    interleave(buf, 5, 7, mask[1], 1);
    interleave(buf, 8, 10, mask[1], 1);
    interleave(buf, 9, 11, mask[1], 1);
    interleave(buf, 12, 14, mask[1], 1);
    interleave(buf, 13, 15, mask[1], 1);

    interleave(buf, 0, 1, mask[0], 0);
    interleave(buf, 2, 3, mask[0], 0);
    interleave(buf, 4, 5, mask[0], 0);
    interleave(buf, 6, 7, mask[0], 0);
    interleave(buf, 8, 9, mask[0], 0);
    interleave(buf, 10, 11, mask[0], 0);
    interleave(buf, 12, 13, mask[0], 0);
    interleave(buf, 14, 15, mask[0], 0);

    for (i = 0; i < 16; i++) {
        for (k = 0; k < 4; k++) {
            out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK;
            out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK;
        }
    }
}

/* input: in, sequence of field elements */
/* output: out, minimal polynomial of in */
void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) {
    uint16_t i;
    uint16_t N, L;

    uint64_t prod[ GFBITS ];
    uint64_t in_tmp[ GFBITS ];

    uint64_t db[ GFBITS ][ 2 ];
    uint64_t BC_tmp[ GFBITS ][ 2 ];
    uint64_t BC[ GFBITS ][ 2 ];

    uint64_t mask, t;

    gf d, b, c0 = 1;

    gf coefs[SYS_T * 2];

    // init

    BC[0][1] = 0;
    BC[0][0] = 1;
    BC[0][0] <<= 63;

    for (i = 1; i < GFBITS; i++) {
        BC[i][0] = BC[i][1] = 0;
    }

    b = 1;
    L = 0;

    //

    get_coefs(coefs, in);

    for (i = 0; i < GFBITS; i++) {
        in_tmp[i] = 0;
    }

    for (N = 0; N < SYS_T * 2; N++) {
        // computing d

        PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(prod, in_tmp, &BC[0][1], 16);

        PQCLEAN_MCELIECE348864_SSE_update_asm(in_tmp, coefs[N], 8);

        d = PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(prod);

        t = PQCLEAN_MCELIECE348864_SSE_gf_mul2(c0, coefs[N], b);

        d ^= t & 0xFFFFFFFF;

        // 3 cases

        mask = mask_nonzero(d) & mask_leq(L * 2, N);

        for (i = 0; i < GFBITS; i++) {
            db[i][0] = (d >> i) & 1;
            db[i][0] = -db[i][0];
            db[i][1] = (b >> i) & 1;
            db[i][1] = -db[i][1];
        }

        PQCLEAN_MCELIECE348864_SSE_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC);

        vec_cmov(BC, mask);

        PQCLEAN_MCELIECE348864_SSE_update_asm(BC, mask & c0, 16);

        for (i = 0; i < GFBITS; i++) {
            BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1];
        }

        c0 = t >> 32;
        b = (d & mask) | (b & ~mask);
        L = ((N + 1 - L) & mask) | (L & ~mask);

    }

    c0 = PQCLEAN_MCELIECE348864_SSE_gf_inv(c0);

    for (i = 0; i < GFBITS; i++) {
        out[i] = (c0 >> i) & 1;
        out[i] = -out[i];
    }

    PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(out, out, &BC[0][1], 16);
}
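
Note: for orientation, a textbook (non-bitsliced) Berlekamp-Massey over GF(2^12) is sketched below. The helpers gf_mul and gf_inv are assumed stand-ins for generic field arithmetic, not this library's identifiers, and unlike the inversion-free routine above this version divides by the previous discrepancy b at every update.

#include <stdint.h>

typedef uint16_t gf;
#define SYS_T 64                            /* t = 64 errors for mceliece348864 */

extern gf gf_mul(gf a, gf b);               /* assumed GF(2^12) multiplication */
extern gf gf_inv(gf a);                     /* assumed GF(2^12) inversion */

/* input:  s, the 2t syndrome coefficients             */
/* output: C, the error-locator polynomial, C[0] = 1   */
static void bm_reference(gf C[SYS_T + 1], const gf s[2 * SYS_T]) {
    gf B[SYS_T + 1] = {0}, T[SYS_T + 1];
    gf b = 1, d, f;
    int i, m = 1, L = 0, N;

    for (i = 0; i <= SYS_T; i++) {
        C[i] = 0;
    }
    C[0] = B[0] = 1;

    for (N = 0; N < 2 * SYS_T; N++) {
        /* discrepancy d = sum_{i=0}^{L} C[i] * s[N-i]; addition is xor in characteristic 2 */
        d = 0;
        for (i = 0; i <= L; i++) {
            d ^= gf_mul(C[i], s[N - i]);
        }

        if (d == 0) {
            m++;                                /* C still predicts the sequence */
        } else if (2 * L <= N) {
            for (i = 0; i <= SYS_T; i++) {      /* remember the old C before updating */
                T[i] = C[i];
            }
            f = gf_mul(d, gf_inv(b));           /* the division the bitsliced code avoids */
            for (i = 0; i + m <= SYS_T; i++) {
                C[i + m] ^= gf_mul(f, B[i]);    /* C(x) -= (d/b) * x^m * B(x) */
            }
            L = N + 1 - L;
            for (i = 0; i <= SYS_T; i++) {
                B[i] = T[i];
            }
            b = d;
            m = 1;
        } else {
            f = gf_mul(d, gf_inv(b));
            for (i = 0; i + m <= SYS_T; i++) {
                C[i + m] ^= gf_mul(f, B[i]);
            }
            m++;
        }
    }
}

The vectorized routine above keeps the same state (B, C, b, L) bitsliced across GFBITS words and selects between the three cases with the masks from mask_nonzero and mask_leq instead of branching.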


+ 17
- 0
crypto_kem/mceliece348864/sse/bm.h View File

@@ -0,0 +1,17 @@
#ifndef PQCLEAN_MCELIECE348864_SSE_BM_H
#define PQCLEAN_MCELIECE348864_SSE_BM_H
/*
This file is for the inversion-free Berlekamp-Massey algorithm
see https://ieeexplore.ieee.org/document/87857
*/


#include <stdint.h>

#include "params.h"
#include "vec128.h"

void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t *out, vec128 *in);

#endif


Some files were not shown because too many files changed in this diff
