Exported from SUPERCOP-20200826 using the scripts at: https://github.com/jschanck/pqclean-package-ntruprimekyber
@@ -0,0 +1,26 @@
name: ntrulpr653
type: kem
claimed-nist-level: 2
claimed-security: IND-CCA2
length-public-key: 897
length-secret-key: 1125
length-ciphertext: 1025
length-shared-secret: 32
nistkat-sha256: 6f8be58bb5d9785a0693fa8d34f5d89193757e1244e26f6182372c3e6de84fb2
principal-submitters:
  - Daniel J. Bernstein
  - Chitchanok Chuengsatiansup
  - Tanja Lange
  - Christine van Vredendaal
implementations:
  - name: clean
    version: supercop-20200826
  - name: avx2
    version: supercop-20200826
    supported_platforms:
      - architecture: x86_64
        operating_systems:
          - Linux
          - Darwin
        required_flags:
          - avx2
@@ -0,0 +1 @@
Public Domain
@@ -0,0 +1,22 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libntrulpr653_avx2.a
HEADERS=api.h crypto_core_multsntrup653.h crypto_core_multsntrup653_ntt.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_653x1541.h crypto_decode_653x3.h crypto_decode_653xint16.h crypto_decode_653xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_653x1541.h crypto_encode_653x1541round.h crypto_encode_653x3.h crypto_encode_653xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1025.h params.h
OBJECTS=crypto_core_multsntrup653.o crypto_core_multsntrup653_ntt.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_653x1541.o crypto_decode_653x3.o crypto_decode_653xint16.o crypto_decode_653xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_653x1541.o crypto_encode_653x1541round.o crypto_encode_653x3.o crypto_encode_653xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1025.o kem.o

CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)

%.o: %.s $(HEADERS)
	$(AS) -o $@ $<

%.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
	$(AR) -r $@ $(OBJECTS)

clean:
	$(RM) $(OBJECTS)
	$(RM) $(LIB)
@@ -0,0 +1,16 @@
#ifndef PQCLEAN_NTRULPR653_AVX2_API_H
#define PQCLEAN_NTRULPR653_AVX2_API_H

#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ALGNAME "ntrulpr653"

#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_SECRETKEYBYTES 1125
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_PUBLICKEYBYTES 897
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_CIPHERTEXTBYTES 1025
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_BYTES 32

int PQCLEAN_NTRULPR653_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
int PQCLEAN_NTRULPR653_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk);
int PQCLEAN_NTRULPR653_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk);

#endif
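/*
 * A minimal usage sketch of this KEM API (not part of the exported sources;
 * it assumes the PQClean build supplies the randombytes implementation used
 * internally by keypair/enc):
 *
 *     unsigned char pk[PQCLEAN_NTRULPR653_AVX2_CRYPTO_PUBLICKEYBYTES];
 *     unsigned char sk[PQCLEAN_NTRULPR653_AVX2_CRYPTO_SECRETKEYBYTES];
 *     unsigned char ct[PQCLEAN_NTRULPR653_AVX2_CRYPTO_CIPHERTEXTBYTES];
 *     unsigned char k1[PQCLEAN_NTRULPR653_AVX2_CRYPTO_BYTES];
 *     unsigned char k2[PQCLEAN_NTRULPR653_AVX2_CRYPTO_BYTES];
 *     PQCLEAN_NTRULPR653_AVX2_crypto_kem_keypair(pk, sk);
 *     PQCLEAN_NTRULPR653_AVX2_crypto_kem_enc(ct, k1, pk);
 *     PQCLEAN_NTRULPR653_AVX2_crypto_kem_dec(k2, ct, sk);
 *     // k1 and k2 now hold the same 32-byte shared secret
 */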
@@ -0,0 +1,314 @@
#include "crypto_core_multsntrup653.h"
#include "crypto_core_multsntrup653_ntt.h"
#include "crypto_decode_653xint16.h"
#include "crypto_encode_653xint16.h"
#include <immintrin.h>

typedef int8_t int8;
typedef int16_t int16;

#define int16x16 __m256i
#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p))
#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
#define const_x16 _mm256_set1_epi16
#define add_x16 _mm256_add_epi16
#define sub_x16 _mm256_sub_epi16
#define mullo_x16 _mm256_mullo_epi16
#define mulhi_x16 _mm256_mulhi_epi16
#define mulhrs_x16 _mm256_mulhrs_epi16
#define signmask_x16(x) _mm256_srai_epi16((x),15)

typedef union {
    int16 v[3][512];
    int16x16 _dummy;
} vec3x512;

typedef union {
    int16 v[768];
    int16x16 _dummy;
} vec768;

typedef union {
    int16 v[3 * 512];
    int16x16 _dummy;
} vec1536;
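
/* squeeze_q_x16: partial reduction modulo q.  mulhrs(x, c) computes
   round(x*c/32768) with c = round(32768/q), i.e. an approximation of x/q,
   so x - q*round(x/q) lies in a small range around 0. */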
static inline int16x16 squeeze_4621_x16(int16x16 x) {
    return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4621)));
}

static inline int16x16 squeeze_7681_x16(int16x16 x) {
    return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681)));
}

static inline int16x16 squeeze_10753_x16(int16x16 x) {
    return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753)));
}
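
/* mulmod_q_x16: Montgomery-style multiplication.  yqinv = y * q^-1 mod 2^16,
   and the result is congruent to x*y/2^16 modulo q (not fully reduced). */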
static inline int16x16 mulmod_4621_x16(int16x16 x, int16x16 y) {
    int16x16 yqinv = mullo_x16(y, const_x16(-29499)); /* XXX: precompute */
    int16x16 b = mulhi_x16(x, y);
    int16x16 d = mullo_x16(x, yqinv);
    int16x16 e = mulhi_x16(d, const_x16(4621));
    return sub_x16(b, e);
}

static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) {
    int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */
    int16x16 b = mulhi_x16(x, y);
    int16x16 d = mullo_x16(x, yqinv);
    int16x16 e = mulhi_x16(d, const_x16(7681));
    return sub_x16(b, e);
}

static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) {
    int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */
    int16x16 b = mulhi_x16(x, y);
    int16x16 d = mullo_x16(x, yqinv);
    int16x16 e = mulhi_x16(d, const_x16(10753));
    return sub_x16(b, e);
}

#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1)
#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0)
#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0)
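
/* good: Good's permutation for the length-1536 = 3*512 cyclic convolution.
   The masks select indices by residue mod 3; entry i of the length-768 input
   lands in fpad[i mod 3][i mod 512], and indices 768..1535 are implicitly
   zero, so each fpad[r] is a zero-padded length-512 slice ready for an
   independent size-512 NTT. */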
static void good(int16 fpad[3][512], const int16 f[768]) {
    int j;
    int16x16 f0, f1;

    j = 0;
    for (;;) {
        f0 = load_x16(f + j);
        f1 = load_x16(f + 512 + j);
        store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1));
        store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2));
        store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0));
        j += 16;
        if (j == 256) {
            break;
        }

        f0 = load_x16(f + j);
        f1 = load_x16(f + 512 + j);
        store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0));
        store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1));
        store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2));
        j += 16;

        f0 = load_x16(f + j);
        f1 = load_x16(f + 512 + j);
        store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2));
        store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0));
        store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1));
        j += 16;
    }

    for (;;) {
        f0 = load_x16(f + j);
        store_x16(&fpad[0][j], f0 & mask2);
        store_x16(&fpad[1][j], f0 & mask0);
        store_x16(&fpad[2][j], f0 & mask1);
        j += 16;
        if (j == 512) {
            break;
        }

        f0 = load_x16(f + j);
        store_x16(&fpad[0][j], f0 & mask1);
        store_x16(&fpad[1][j], f0 & mask2);
        store_x16(&fpad[2][j], f0 & mask0);
        j += 16;

        f0 = load_x16(f + j);
        store_x16(&fpad[0][j], f0 & mask0);
        store_x16(&fpad[1][j], f0 & mask1);
        store_x16(&fpad[2][j], f0 & mask2);
        j += 16;
    }
}
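
/* ungood: inverse permutation, gathering the three length-512 results back
   into a single length-1536 coefficient array. */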
static void ungood(int16 f[1536], const int16 fpad[3][512]) {
    int j;
    int16x16 f0, f1, f2, g0, g1, g2;

    j = 0;
    for (;;) {
        f0 = load_x16(&fpad[0][j]);
        f1 = load_x16(&fpad[1][j]);
        f2 = load_x16(&fpad[2][j]);
        g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2);
        g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0);
        g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */
        store_x16(f + 0 + j, g0);
        store_x16(f + 512 + j, g1);
        store_x16(f + 1024 + j, g2);
        j += 16;

        f0 = load_x16(&fpad[0][j]);
        f1 = load_x16(&fpad[1][j]);
        f2 = load_x16(&fpad[2][j]);
        g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1);
        g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2);
        g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */
        store_x16(f + 0 + j, g0);
        store_x16(f + 512 + j, g1);
        store_x16(f + 1024 + j, g2);
        j += 16;
        if (j == 512) {
            break;
        }

        f0 = load_x16(&fpad[0][j]);
        f1 = load_x16(&fpad[1][j]);
        f2 = load_x16(&fpad[2][j]);
        g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0);
        g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1);
        g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */
        store_x16(f + 0 + j, g0);
        store_x16(f + 512 + j, g1);
        store_x16(f + 1024 + j, g2);
        j += 16;
    }
}
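
/* mult768: multiply two polynomials of degree < 768; the 1535-coefficient
   product is written to h.  The multiplication is done twice, with size-512
   NTTs modulo the primes 7681 and 10753 and a length-3 cyclic convolution
   (via a Karatsuba-style identity) for the Good dimension; the two results
   are then CRT-combined below so that h is correct modulo q = 4621. */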
static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) {
    vec3x512 x1, x2;
    vec1536 x3, x4;
#define fpad (x1.v)
#define gpad (x2.v)
#define hpad fpad
#define h_7681 (x3.v)
#define h_10753 (x4.v)
    int i;

    good(fpad, f);
    PQCLEAN_NTRULPR653_AVX2_ntt512_7681(fpad[0], 3);
    good(gpad, g);
    PQCLEAN_NTRULPR653_AVX2_ntt512_7681(gpad[0], 3);

    for (i = 0; i < 512; i += 16) {
        int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i]));
        int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i]));
        int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i]));
        int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i]));
        int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i]));
        int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i]));
        int16x16 d0 = mulmod_7681_x16(f0, g0);
        int16x16 d1 = mulmod_7681_x16(f1, g1);
        int16x16 d2 = mulmod_7681_x16(f2, g2);
        int16x16 dsum = add_x16(add_x16(d0, d1), d2);
        int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2)));
        int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1)));
        int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0)));
        store_x16(&hpad[0][i], squeeze_7681_x16(h0));
        store_x16(&hpad[1][i], squeeze_7681_x16(h1));
        store_x16(&hpad[2][i], squeeze_7681_x16(h2));
    }

    PQCLEAN_NTRULPR653_AVX2_invntt512_7681(hpad[0], 3);
    ungood(h_7681, (const int16(*)[512]) hpad);

    good(fpad, f);
    PQCLEAN_NTRULPR653_AVX2_ntt512_10753(fpad[0], 3);
    good(gpad, g);
    PQCLEAN_NTRULPR653_AVX2_ntt512_10753(gpad[0], 3);

    for (i = 0; i < 512; i += 16) {
        int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i]));
        int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i]));
        int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i]));
        int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i]));
        int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i]));
        int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i]));
        int16x16 d0 = mulmod_10753_x16(f0, g0);
        int16x16 d1 = mulmod_10753_x16(f1, g1);
        int16x16 d2 = mulmod_10753_x16(f2, g2);
        int16x16 dsum = add_x16(add_x16(d0, d1), d2);
        int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2)));
        int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1)));
        int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0)));
        store_x16(&hpad[0][i], squeeze_10753_x16(h0));
        store_x16(&hpad[1][i], squeeze_10753_x16(h1));
        store_x16(&hpad[2][i], squeeze_10753_x16(h2));
    }

    PQCLEAN_NTRULPR653_AVX2_invntt512_10753(hpad[0], 3);
    ungood(h_10753, (const int16(*)[512]) hpad);
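
    /* CRT recombination of the mod-10753 and mod-7681 results; the constant
       multipliers fold in the remaining NTT scaling, and the stored
       coefficients are congruent to the true product coefficients mod 4621. */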
    for (i = 0; i < 1536; i += 16) {
        int16x16 u1 = load_x16(&h_10753[i]);
        int16x16 u2 = load_x16(&h_7681[i]);
        int16x16 t;
        u1 = mulmod_10753_x16(u1, const_x16(1268));
        u2 = mulmod_7681_x16(u2, const_x16(956));
        t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539));
        t = add_x16(u1, mulmod_4621_x16(t, const_x16(1487)));
        store_x16(&h[i], t);
    }
}

#define crypto_decode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16
#define crypto_encode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16

#define p 653
#define q 4621
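
/* freeze_4621_x16: complete the reduction of a partially reduced value to the
   centered range [-(q-1)/2, (q-1)/2]: add q if negative, then conditionally
   subtract q. */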
static inline int16x16 freeze_4621_x16(int16x16 x) {
    int16x16 mask, xq;
    x = add_x16(x, const_x16(q)&signmask_x16(x));
    mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2)));
    xq = sub_x16(x, const_x16(q));
    x = _mm256_blendv_epi8(xq, x, mask);
    return x;
}

int PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) {
    vec768 x1, x2;
    vec1536 x3;
#define f (x1.v)
#define g (x2.v)
#define fg (x3.v)
#define h f
    int i;
    int16x16 x;

    x = const_x16(0);
    for (i = p & ~15; i < 768; i += 16) {
        store_x16(&f[i], x);
    }
    for (i = p & ~15; i < 768; i += 16) {
        store_x16(&g[i], x);
    }

    crypto_decode_pxint16(f, inbytes);
    for (i = 0; i < 768; i += 16) {
        x = load_x16(&f[i]);
        x = freeze_4621_x16(squeeze_4621_x16(x));
        store_x16(&f[i], x);
    }
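
    /* Decode the small multiplier from kbytes: each byte maps to a coefficient
       in {-1, 0, 1} (bit 0 = nonzero, bit 1 = sign). */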
    for (i = 0; i < p; ++i) {
        int8 gi = kbytes[i];
        int8 gi0 = gi & 1;
        g[i] = gi0 - (gi & (gi0 << 1));
    }

    mult768(fg, f, g);
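
    /* Reduce the product modulo x^p - x - 1: x^(p+k) = x^(k+1) + x^k, so
       h[i] = fg[i] + fg[i+p] + fg[i+p-1].  The correction below keeps the
       vectorized i = 0 term from also adding fg[p-1], which is an ordinary
       low-degree coefficient rather than a wrapped one. */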
    fg[0] -= fg[p - 1];
    for (i = 0; i < 768; i += 16) {
        int16x16 fgi = load_x16(&fg[i]);
        int16x16 fgip = load_x16(&fg[i + p]);
        int16x16 fgip1 = load_x16(&fg[i + p - 1]);
        x = add_x16(fgi, add_x16(fgip, fgip1));
        x = freeze_4621_x16(squeeze_4621_x16(x));
        store_x16(&h[i], x);
    }

    crypto_encode_pxint16(outbytes, h);
    return 0;
}
@@ -0,0 +1,11 @@
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_CORE_MULTSNTRUP653_H
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_CORE_MULTSNTRUP653_H

#include <stdint.h>

#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_OUTPUTBYTES 1306
#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_INPUTBYTES 1306
#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_KEYBYTES 653
#define PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653_CONSTBYTES 0

int PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes);

#endif
@@ -0,0 +1,927 @@
#include "crypto_core_multsntrup653.h"
#include "crypto_core_multsntrup653_ntt.h"
#include <immintrin.h>
#include <stdint.h>

/* auto-generated; do not edit */

typedef int8_t int8;
typedef int16_t int16;

#define zeta(n,i) (((__m256i *) zeta_##n)[(i)])
#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)])
#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)])
#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)])
#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1)))
#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1)))
#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1)))
#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1)))

typedef union {
    int16 data[93 * 16];
    __m256i _dummy;
} vec1488;
static const vec1488 qdata_7681 = { .data = { | |||
#define q_x16 (qdata[0]) | |||
7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, | |||
#define qrecip_x16 (qdata[1]) | |||
17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, | |||
#define qshift_x16 (qdata[2]) | |||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, | |||
#define zeta4_x16 (qdata[3]) | |||
-3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, | |||
#define zeta4_x16_qinv (qdata[4]) | |||
-28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, | |||
#define zeta8_x16 (qdata[5]) | |||
-3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, | |||
#define zeta8_x16_qinv (qdata[6]) | |||
-16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, | |||
#define zetainv8_x16 (qdata[7]) | |||
-3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, | |||
#define zetainv8_x16_qinv (qdata[8]) | |||
-10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, | |||
#define zeta_x4_16 (qdata+9) | |||
-3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, | |||
-3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, | |||
3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_x4_16 (qdata+12) | |||
-9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, | |||
-28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, | |||
9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_x4_32 (qdata+15) | |||
-3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, | |||
-3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, | |||
-3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, | |||
3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, | |||
3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_x4_32 (qdata+20) | |||
-9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, | |||
-16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, | |||
-28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, | |||
10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, | |||
9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_64 (qdata+25) | |||
-3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, | |||
-3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_64 (qdata+28) | |||
-9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, | |||
-28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_128 (qdata+31) | |||
-3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, | |||
-3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, | |||
-3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, | |||
3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_128 (qdata+36) | |||
-9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, | |||
-16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, | |||
-28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, | |||
10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_256 (qdata+41) | |||
-3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, | |||
-2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, | |||
-3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, | |||
1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, | |||
-3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, | |||
-2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, | |||
3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, | |||
3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_256 (qdata+50) | |||
-9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, | |||
4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, | |||
-16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, | |||
7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, | |||
-28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, | |||
-14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, | |||
10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, | |||
-4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_512 (qdata+59) | |||
-3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, | |||
1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, | |||
-2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, | |||
-2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, | |||
-3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, | |||
2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, | |||
1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, | |||
-2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, | |||
-3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, | |||
-1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, | |||
-2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, | |||
834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, | |||
3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, | |||
-2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, | |||
3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, | |||
121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_512 (qdata+76) | |||
-9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, | |||
20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, | |||
4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, | |||
22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, | |||
-16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, | |||
828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, | |||
7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, | |||
-23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, | |||
-28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, | |||
20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, | |||
-14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, | |||
18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, | |||
10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, | |||
-18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, | |||
-4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, | |||
-11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
} | |||
}; | |||
static const vec1488 qdata_10753 = { .data = { | |||
10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, | |||
24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, | |||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, | |||
223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, | |||
27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, | |||
4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, | |||
-1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, | |||
3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, | |||
-408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, | |||
1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, | |||
223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, | |||
-1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, | |||
27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, | |||
6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, | |||
4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, | |||
223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, | |||
-3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, | |||
-1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, | |||
-1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, | |||
27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, | |||
408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, | |||
6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, | |||
223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, | |||
27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, | |||
4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, | |||
223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, | |||
-3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, | |||
-1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, | |||
27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, | |||
408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, | |||
2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, | |||
4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, | |||
357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, | |||
223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, | |||
-3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, | |||
-3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, | |||
-376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, | |||
10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, | |||
-1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, | |||
28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, | |||
27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, | |||
-21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, | |||
408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, | |||
-20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, | |||
-2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, | |||
2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, | |||
425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, | |||
4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, | |||
-4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, | |||
357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, | |||
-3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, | |||
223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, | |||
730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, | |||
-3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, | |||
-4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, | |||
-3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, | |||
-2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, | |||
-376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, | |||
3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, | |||
7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, | |||
10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, | |||
18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, | |||
-1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, | |||
29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, | |||
28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, | |||
-9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, | |||
27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, | |||
16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, | |||
-21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, | |||
28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, | |||
408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, | |||
-12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, | |||
-20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, | |||
16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
} | |||
}; | |||
static inline __m256i sub_x16(__m256i a, __m256i b) {
    //__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b));
    return _mm256_sub_epi16(a, b);
}

static inline __m256i add_x16(__m256i a, __m256i b) {
    return _mm256_add_epi16(a, b);
}
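
/* reduce_x16: Barrett-style partial reduction of 16 lanes modulo q;
   q, qrecip and qshift come from the per-prime qdata table. */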
static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) {
    __m256i y = _mm256_mulhi_epi16(x, qrecip_x16);
    y = _mm256_mulhrs_epi16(y, qshift_x16);
    y = _mm256_mullo_epi16(y, q_x16);
    return sub_x16(x, y);
}
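
/* mulmod_x16_scaled: Montgomery multiplication; yqinv = y * q^-1 mod 2^16 is
   read from the precomputed tables, and the result is congruent to
   x*y/2^16 modulo q. */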
static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) {
    __m256i b = _mm256_mulhi_epi16(x, y);
    __m256i d = _mm256_mullo_epi16(x, yqinv);
    __m256i e = _mm256_mulhi_epi16(d, q_x16);
    return sub_x16(b, e);
}

typedef union {
    int8 data[32];
    __m256i _dummy;
} byte32;

static const byte32 shuffle_buf = { .data = {
        14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
        14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
    }
};

#define shuffle (*(__m256i *) shuffle_buf.data)
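
/* Load 16 int16 lanes in reversed order (used to walk the zeta tables
   backwards for the inverse twiddle factors). */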
static inline __m256i _mm256_loadu_reverse16(const __m256i *p) {
    __m256i x = _mm256_loadu_si256(p);
    x = _mm256_permute2x128_si256(x, x, 1);
    x = _mm256_shuffle_epi8(x, shuffle);
    return x;
}
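
/* ntt128: forward NTT of length 128 applied to reps consecutive blocks of
   128 coefficients; three passes, each combining four 16-lane vectors with
   twiddle factors from the zeta tables, with in-register transposes in
   between. */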
static void ntt128(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; | |||
int16 *origf = f; | |||
int rep; | |||
__m256i zetainv_128_0 = zetainv(128, 0); | |||
__m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); | |||
__m256i zetainv_x4_32_0 = zetainv_x4(32, 0); | |||
__m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); | |||
__m256i zetainv_128_1 = zetainv(128, 1); | |||
__m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); | |||
__m256i zetainv_x4_32_1 = zetainv_x4(32, 1); | |||
__m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); | |||
for (rep = 0; rep < reps; ++rep) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); | |||
g2 = _mm256_unpacklo_epi16(f2, f3); | |||
g3 = _mm256_unpackhi_epi16(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); | |||
f0 = reduce_x16(qdata, f0); | |||
g0 = _mm256_unpacklo_epi16(f0, f1); | |||
h0 = _mm256_unpacklo_epi32(g0, g2); | |||
h1 = _mm256_unpackhi_epi32(g0, g2); | |||
g1 = _mm256_unpackhi_epi16(f0, f1); | |||
h2 = _mm256_unpacklo_epi32(g1, g3); | |||
h3 = _mm256_unpackhi_epi32(g1, g3); | |||
f0 = _mm256_permute2x128_si256(h0, h1, 0x20); | |||
f2 = _mm256_permute2x128_si256(h0, h1, 0x31); | |||
f1 = _mm256_permute2x128_si256(h2, h3, 0x20); | |||
f3 = _mm256_permute2x128_si256(h2, h3, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f3); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); | |||
g2 = _mm256_unpacklo_epi16(f2, f3); | |||
g3 = _mm256_unpackhi_epi16(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); | |||
f0 = reduce_x16(qdata, f0); | |||
g0 = _mm256_unpacklo_epi16(f0, f1); | |||
h0 = _mm256_unpacklo_epi32(g0, g2); | |||
h1 = _mm256_unpackhi_epi32(g0, g2); | |||
g1 = _mm256_unpackhi_epi16(f0, f1); | |||
h2 = _mm256_unpacklo_epi32(g1, g3); | |||
h3 = _mm256_unpackhi_epi32(g1, g3); | |||
f0 = _mm256_permute2x128_si256(h0, h1, 0x20); | |||
f2 = _mm256_permute2x128_si256(h0, h1, 0x31); | |||
f1 = _mm256_permute2x128_si256(h2, h3, 0x20); | |||
f3 = _mm256_permute2x128_si256(h2, h3, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); | |||
g2 = _mm256_unpacklo_epi64(f2, f3); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); | |||
f0 = reduce_x16(qdata, f0); | |||
g1 = _mm256_unpackhi_epi64(f0, f1); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
f1 = _mm256_permute2x128_si256(g1, g3, 0x20); | |||
f3 = _mm256_permute2x128_si256(g1, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g2, 0x20); | |||
f2 = _mm256_permute2x128_si256(g0, g2, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f2); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); | |||
g2 = _mm256_unpacklo_epi64(f2, f3); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); | |||
f0 = reduce_x16(qdata, f0); | |||
g1 = _mm256_unpackhi_epi64(f0, f1); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
f1 = _mm256_permute2x128_si256(g1, g3, 0x20); | |||
f3 = _mm256_permute2x128_si256(g1, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g2, 0x20); | |||
f2 = _mm256_permute2x128_si256(g0, g2, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f2); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f2 = add_x16(g2, g3); | |||
f3 = sub_x16(g2, g3); | |||
f2 = reduce_x16(qdata, f2); | |||
f3 = reduce_x16(qdata, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f0 = reduce_x16(qdata, f0); | |||
h0 = f0; | |||
h1 = f1; | |||
h2 = f2; | |||
h3 = f3; | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); | |||
f0 = reduce_x16(qdata, f0); | |||
g0 = add_x16(h0, f0); | |||
g1 = add_x16(h1, f1); | |||
g2 = add_x16(h2, f2); | |||
g3 = add_x16(h3, f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), g0); | |||
_mm256_storeu_si256((__m256i *) (f + 16), g1); | |||
_mm256_storeu_si256((__m256i *) (f + 32), g2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), g3); | |||
g0 = sub_x16(h0, f0); | |||
g1 = sub_x16(h1, f1); | |||
g2 = sub_x16(h2, f2); | |||
g3 = sub_x16(h3, f3); | |||
_mm256_storeu_si256((__m256i *) (f + 64), g0); | |||
_mm256_storeu_si256((__m256i *) (f + 80), g1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), g2); | |||
_mm256_storeu_si256((__m256i *) (f + 112), g3); | |||
f += 128; | |||
} | |||
} | |||
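
/* ntt512: one splitting pass turns each length-512 block into four
   interleaved length-128 transforms (offsets 0, 128, 256, 384), which
   ntt128 then finishes. */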
static void ntt512(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ | |||
int16 *origf = f; | |||
int rep; | |||
__m256i zetainv_512[8]; | |||
__m256i zetainv_qinv_512[8]; | |||
int i; | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_512[i] = zetainv(512, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_qinv_512[i] = zetainv_qinv(512, i); | |||
} | |||
for (rep = 0; rep < reps; ++rep) { | |||
for (i = 0; i < 8; ++i) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); | |||
f0 = reduce_x16(qdata, f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i), f0); | |||
} | |||
f += 512; | |||
} | |||
f = origf; | |||
ntt128(f, reps * 4, qdata); | |||
} | |||
void PQCLEAN_NTRULPR653_AVX2_ntt512_7681(int16 *f, int reps) { | |||
ntt512(f, reps, (const __m256i *) qdata_7681.data); | |||
} | |||
void PQCLEAN_NTRULPR653_AVX2_ntt512_10753(int16 *f, int reps) { | |||
ntt512(f, reps, (const __m256i *) qdata_10753.data); | |||
} | |||
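
/* invntt128: inverse of the length-128 transform computed by ntt128. */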
static void invntt128(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; | |||
int16 *origf = f; | |||
int rep; | |||
__m256i zetainv_x4_16_0 = zetainv_x4(16, 0); | |||
__m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); | |||
__m256i zetainv_x4_32_0 = zetainv_x4(32, 0); | |||
__m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); | |||
__m256i zetainv_64_0 = zetainv(64, 0); | |||
__m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); | |||
__m256i zetainv_128_0 = zetainv(128, 0); | |||
__m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); | |||
__m256i zetainv_x4_16_1 = zetainv_x4(16, 1); | |||
__m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); | |||
__m256i zetainv_x4_32_1 = zetainv_x4(32, 1); | |||
__m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); | |||
__m256i zetainv_64_1 = zetainv(64, 1); | |||
__m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); | |||
__m256i zetainv_128_1 = zetainv(128, 1); | |||
__m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); | |||
for (rep = 0; rep < reps; ++rep) { | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g0 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
g1 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g2 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
g3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
h1 = sub_x16(f0, f1); | |||
h1 = reduce_x16(qdata, h1); | |||
h0 = add_x16(f0, f1); | |||
h3 = sub_x16(f2, f3); | |||
h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); | |||
h2 = add_x16(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); | |||
f0 = add_x16(g0, g1); | |||
f3 = sub_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); | |||
f2 = add_x16(g2, g3); | |||
g0 = add_x16(h0, h2); | |||
g0 = reduce_x16(qdata, g0); | |||
g2 = sub_x16(h0, h2); | |||
g2 = reduce_x16(qdata, g2); | |||
g1 = sub_x16(h1, h3); | |||
g3 = add_x16(h1, h3); | |||
h2 = sub_x16(f0, f2); | |||
h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); | |||
h0 = add_x16(f0, f2); | |||
h3 = add_x16(f1, f3); | |||
h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); | |||
h1 = sub_x16(f1, f3); | |||
f0 = add_x16(g0, h0); | |||
g0 = sub_x16(g0, h0); | |||
f1 = add_x16(g1, h1); | |||
g1 = sub_x16(g1, h1); | |||
f2 = sub_x16(g2, h2); | |||
g2 = add_x16(g2, h2); | |||
f3 = sub_x16(g3, h3); | |||
g3 = add_x16(g3, h3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 32), g0); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), g1); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), g2); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 112), g3); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
g1 = _mm256_unpacklo_epi64(f2, f3); | |||
g2 = _mm256_unpackhi_epi64(f0, f1); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f2 = _mm256_permute2x128_si256(g0, g1, 0x31); | |||
f3 = _mm256_permute2x128_si256(g2, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g1, 0x20); | |||
f1 = _mm256_permute2x128_si256(g2, g3, 0x20); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g2 = sub_x16(f3, f2); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); | |||
g1 = add_x16(f0, f1); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f2); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
g1 = _mm256_unpacklo_epi64(f2, f3); | |||
g2 = _mm256_unpackhi_epi64(f0, f1); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f2 = _mm256_permute2x128_si256(g0, g1, 0x31); | |||
f3 = _mm256_permute2x128_si256(g2, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g1, 0x20); | |||
f1 = _mm256_permute2x128_si256(g2, g3, 0x20); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g2 = sub_x16(f3, f2); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); | |||
g1 = add_x16(f0, f1); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f2); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g0 = _mm256_permute2x128_si256(f0, f2, 0x20); | |||
g2 = _mm256_permute2x128_si256(f0, f2, 0x31); | |||
f0 = _mm256_unpacklo_epi16(g0, g2); | |||
f2 = _mm256_unpackhi_epi16(g0, g2); | |||
g1 = _mm256_permute2x128_si256(f1, f3, 0x20); | |||
g3 = _mm256_permute2x128_si256(f1, f3, 0x31); | |||
f1 = _mm256_unpacklo_epi16(g1, g3); | |||
f3 = _mm256_unpackhi_epi16(g1, g3); | |||
g1 = _mm256_unpackhi_epi16(f0, f2); | |||
g0 = _mm256_unpacklo_epi16(f0, f2); | |||
g3 = _mm256_unpackhi_epi16(f1, f3); | |||
g2 = _mm256_unpacklo_epi16(f1, f3); | |||
f2 = _mm256_unpacklo_epi64(g1, g3); | |||
f3 = _mm256_unpackhi_epi64(g1, g3); | |||
f0 = _mm256_unpacklo_epi64(g0, g2); | |||
f1 = _mm256_unpackhi_epi64(g0, g2); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f0, f1); | |||
g2 = sub_x16(f3, f2); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f2); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g0 = _mm256_permute2x128_si256(f0, f2, 0x20); | |||
g2 = _mm256_permute2x128_si256(f0, f2, 0x31); | |||
f0 = _mm256_unpacklo_epi16(g0, g2); | |||
f2 = _mm256_unpackhi_epi16(g0, g2); | |||
g1 = _mm256_permute2x128_si256(f1, f3, 0x20); | |||
g3 = _mm256_permute2x128_si256(f1, f3, 0x31); | |||
f1 = _mm256_unpacklo_epi16(g1, g3); | |||
f3 = _mm256_unpackhi_epi16(g1, g3); | |||
g1 = _mm256_unpackhi_epi16(f0, f2); | |||
g0 = _mm256_unpacklo_epi16(f0, f2); | |||
g3 = _mm256_unpackhi_epi16(f1, f3); | |||
g2 = _mm256_unpacklo_epi16(f1, f3); | |||
f2 = _mm256_unpacklo_epi64(g1, g3); | |||
f3 = _mm256_unpackhi_epi64(g1, g3); | |||
f0 = _mm256_unpacklo_epi64(g0, g2); | |||
f1 = _mm256_unpackhi_epi64(g0, g2); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f0, f1); | |||
g2 = sub_x16(f3, f2); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f2); | |||
f += 128; | |||
} | |||
} | |||
static void invntt512(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ | |||
/* [-Werror=unused-variable] */ /* int16 *origf = f; */ | |||
int rep; | |||
__m256i zetainv_512[8]; | |||
__m256i zetainv_qinv_512[8]; | |||
__m256i zetainv_256[8]; | |||
__m256i zetainv_qinv_256[8]; | |||
int i; | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_512[i] = zetainv(512, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_qinv_512[i] = zetainv_qinv(512, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_256[i] = zetainv(256, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_qinv_256[i] = zetainv_qinv(256, i); | |||
} | |||
invntt128(f, 4 * reps, qdata); | |||
for (rep = 0; rep < reps; ++rep) { | |||
for (i = 0; i < 8; ++i) { | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g2 = sub_x16(f3, f2); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); | |||
g1 = add_x16(f0, f1); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); | |||
} | |||
f += 512; | |||
} | |||
} | |||
void PQCLEAN_NTRULPR653_AVX2_invntt512_7681(int16 *f, int reps) { | |||
invntt512(f, reps, (const __m256i *) qdata_7681.data); | |||
} | |||
void PQCLEAN_NTRULPR653_AVX2_invntt512_10753(int16 *f, int reps) { | |||
invntt512(f, reps, (const __m256i *) qdata_10753.data); | |||
} |
@@ -0,0 +1,13 @@ | |||
#ifndef ntt_H | |||
#define ntt_H | |||
#include <stdint.h> | |||
extern void PQCLEAN_NTRULPR653_AVX2_ntt512_7681(int16_t *f, int reps); | |||
extern void PQCLEAN_NTRULPR653_AVX2_ntt512_10753(int16_t *f, int reps); | |||
extern void PQCLEAN_NTRULPR653_AVX2_invntt512_7681(int16_t *f, int reps); | |||
extern void PQCLEAN_NTRULPR653_AVX2_invntt512_10753(int16_t *f, int reps); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#include "crypto_decode_256x16.h" | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16(void *v, const unsigned char *s) { | |||
unsigned char *T = v; | |||
int i; | |||
for (i = 0; i < 128; ++i) { | |||
T[2 * i] = s[i] & 15; | |||
T[2 * i + 1] = s[i] >> 4; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X16_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16_STRBYTES 128 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,27 @@ | |||
#include "crypto_decode_256x2.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define COPY _mm256_set_epi64x(0x0303030303030303,0x0202020202020202,0x0101010101010101,0x0000000000000000) | |||
#define MASK _mm256_set1_epi64x(0x8040201008040201) | |||
#define MASK2 _mm256_set1_epi64x(0x0101010101010101) | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2(void *v, const unsigned char *s) { | |||
__m256i *r = v; | |||
int i; | |||
for (i = 0; i < 8; ++i) { | |||
/* bytes s0 s1 s2 s3 */ | |||
__m256i x = _mm256_set1_epi32(*(int32_t *) s); | |||
/* s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 */ | |||
x = _mm256_shuffle_epi8(x, COPY); | |||
/* s0 s0 s0 s0 s0 s0 s0 s0 s1 s1 s1 s1 s1 s1 s1 s1 s2 s2 s2 s2 s2 s2 s2 s2 s3 s3 s3 s3 s3 s3 s3 s3 */ | |||
x = _mm256_andnot_si256(x, MASK); | |||
x = _mm256_cmpeq_epi8(x, _mm256_setzero_si256()); | |||
x &= MASK2; | |||
_mm256_storeu_si256(r, x); | |||
s += 4; | |||
r += 1; | |||
} | |||
} |
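Stripped of the vector shuffling, the routine above is a plain bit-unpacker: output byte i carries bit i of the 32-byte input string, least-significant bit of s[0] first. A scalar sketch of the same convention (hypothetical reference code, not part of the package):

static void decode_256x2_ref(unsigned char *r, const unsigned char *s) {
    int i;
    for (i = 0; i < 256; ++i) {
        /* bit i of the packed string, lsb-first within each byte */
        r[i] = (unsigned char) ((s[i >> 3] >> (i & 7)) & 1);
    }
}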
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X2_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_256X2_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2_STRBYTES 32 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,408 @@ | |||
#include "crypto_decode_653x1541.h" | |||
#include <immintrin.h> | |||
/* auto-generated; do not edit */ | |||
#define int16 int16_t | |||
#define int32 int32_t | |||
static inline int16 mullo(int16 x, int16 y) { | |||
return x * y; | |||
} | |||
static inline int16 mulhi(int16 x, int16 y) { | |||
return (x * (int32)y) >> 16; | |||
} | |||
static inline __m256i add(__m256i x, __m256i y) { | |||
return _mm256_add_epi16(x, y); | |||
} | |||
static inline __m256i sub(__m256i x, __m256i y) { | |||
return _mm256_sub_epi16(x, y); | |||
} | |||
static inline __m256i shiftleftconst(__m256i x, int16 y) { | |||
return _mm256_slli_epi16(x, y); | |||
} | |||
static inline __m256i signedshiftrightconst(__m256i x, int16 y) { | |||
return _mm256_srai_epi16(x, y); | |||
} | |||
static inline __m256i addconst(__m256i x, int16 y) { | |||
return add(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i subconst(__m256i x, int16 y) { | |||
return sub(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i mulloconst(__m256i x, int16 y) { | |||
return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i mulhiconst(__m256i x, int16 y) { | |||
return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i ifgesubconst(__m256i x, int16 y) { | |||
__m256i y16 = _mm256_set1_epi16(y); | |||
__m256i top16 = _mm256_set1_epi16((int16)(y - 1)); | |||
return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); | |||
} | |||
static inline __m256i ifnegaddconst(__m256i x, int16 y) { | |||
return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); | |||
} | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541(void *v, const unsigned char *s) { | |||
int16 *R0 = v; | |||
int16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; | |||
long long i; | |||
int16 a0, a1, a2; | |||
__m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; | |||
s += PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_STRBYTES; | |||
a1 = 0; | |||
a1 += *--s; /* 0...255 */ | |||
a1 = mulhi(a1, -48) - mulhi(mullo(a1, -6433), 2608); | |||
a1 += *--s; /* -1304...1558 */ | |||
a1 += (a1 >> 15) & 2608; /* 0...2607 */ | |||
R10[0] = a1; | |||
/* R10 ------> R9: reconstruct mod 1*[71]+[9402] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R10[0]; | |||
a0 = mulhi(a0, -13) - mulhi(mullo(a0, 25845), 71); /* -39...35 */ | |||
a0 += s[1 * i + 0]; /* -39...290 */ | |||
a0 = mulhi(a0, 3) - mulhi(mullo(a0, -923), 71); /* -36...35 */ | |||
a0 += (a0 >> 15) & 71; /* 0...70 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, -22153); | |||
/* invalid inputs might need reduction mod 9402 */ | |||
a1 -= 9402; | |||
a1 += (a1 >> 15) & 9402; | |||
R9[0] = a0; | |||
R9[1] = a1; | |||
s -= 0; | |||
/* R9 ------> R8: reconstruct mod 2*[134]+[9402] */ | |||
R8[2] = R9[1]; | |||
s -= 1; | |||
for (i = 0; i >= 0; --i) { | |||
a2 = a0 = R9[i]; | |||
a0 = mulhi(a0, 14) - mulhi(mullo(a0, 5869), 134); /* -67...70 */ | |||
a0 += s[1 * i + 0]; /* -67...325 */ | |||
a0 = mulhi(a0, 10) - mulhi(mullo(a0, -489), 134); /* -68...67 */ | |||
a0 += (a0 >> 15) & 134; /* 0...133 */ | |||
a1 = (a2 << 7) + ((s[i] - a0) >> 1); | |||
a1 = mullo(a1, 19563); | |||
/* invalid inputs might need reduction mod 134 */ | |||
a1 -= 134; | |||
a1 += (a1 >> 15) & 134; | |||
R8[2 * i] = a0; | |||
R8[2 * i + 1] = a1; | |||
} | |||
/* R8 ------> R7: reconstruct mod 5*[2953]+[815] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R8[2]; | |||
a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1477...1782 */ | |||
a0 += s[1 * i + 0]; /* -1477...2037 */ | |||
a0 += (a0 >> 15) & 2953; /* 0...2952 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, -9543); | |||
/* invalid inputs might need reduction mod 815 */ | |||
a1 -= 815; | |||
a1 += (a1 >> 15) & 815; | |||
R7[4] = a0; | |||
R7[5] = a1; | |||
s -= 4; | |||
for (i = 1; i >= 0; --i) { | |||
a0 = R8[i]; | |||
a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1477...1782 */ | |||
a0 += s[2 * i + 1]; /* -1477...2037 */ | |||
a0 = mulhi(a0, 1223) - mulhi(mullo(a0, -5681), 2953); /* -1505...1514 */ | |||
a0 += s[2 * i + 0]; /* -1505...1769 */ | |||
a0 += (a0 >> 15) & 2953; /* 0...2952 */ | |||
a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; | |||
a1 = mullo(a1, -9543); | |||
/* invalid inputs might need reduction mod 2953 */ | |||
a1 -= 2953; | |||
a1 += (a1 >> 15) & 2953; | |||
R7[2 * i] = a0; | |||
R7[2 * i + 1] = a1; | |||
} | |||
/* R7 ------> R6: reconstruct mod 10*[13910]+[815] */ | |||
R6[10] = R7[5]; | |||
s -= 10; | |||
for (i = 4; i >= 0; --i) { | |||
a2 = a0 = R7[i]; | |||
a0 = mulhi(a0, 1756) - mulhi(mullo(a0, -1206), 13910); /* -6955...7394 */ | |||
a0 += s[2 * i + 1]; /* -6955...7649 */ | |||
a0 = mulhi(a0, 1756) - mulhi(mullo(a0, -1206), 13910); /* -7142...7159 */ | |||
a0 += s[2 * i + 0]; /* -7142...7414 */ | |||
a0 += (a0 >> 15) & 13910; /* 0...13909 */ | |||
a1 = (a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1); | |||
a1 = mullo(a1, -13437); | |||
/* invalid inputs might need reduction mod 13910 */ | |||
a1 -= 13910; | |||
a1 += (a1 >> 15) & 13910; | |||
R6[2 * i] = a0; | |||
R6[2 * i + 1] = a1; | |||
} | |||
/* R6 ------> R5: reconstruct mod 20*[1887]+[815] */ | |||
R5[20] = R6[10]; | |||
s -= 10; | |||
for (i = 9; i >= 0; --i) { | |||
a2 = a0 = R6[i]; | |||
a0 = mulhi(a0, -101) - mulhi(mullo(a0, -8891), 1887); /* -969...943 */ | |||
a0 += s[1 * i + 0]; /* -969...1198 */ | |||
a0 += (a0 >> 15) & 1887; /* 0...1886 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, 5279); | |||
/* invalid inputs might need reduction mod 1887 */ | |||
a1 -= 1887; | |||
a1 += (a1 >> 15) & 1887; | |||
R5[2 * i] = a0; | |||
R5[2 * i + 1] = a1; | |||
} | |||
/* R5 ------> R4: reconstruct mod 40*[695]+[815] */ | |||
R4[40] = R5[20]; | |||
s -= 20; | |||
i = 4; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); | |||
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); | |||
A0 = sub(mulhiconst(A0, -84), mulhiconst(mulloconst(A0, -24140), 695)); /* -369...347 */ | |||
A0 = add(A0, S0); /* -369...602 */ | |||
A0 = ifnegaddconst(A0, 695); /* 0...694 */ | |||
A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, 31495); | |||
/* invalid inputs might need reduction mod 695 */ | |||
A1 = ifgesubconst(A1, 695); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R4 ------> R3: reconstruct mod 81*[6745]+[7910] */ | |||
i = 0; | |||
s -= 2; | |||
a0 = R4[40]; | |||
a0 = mulhi(a0, 2401) - mulhi(mullo(a0, -2487), 6745); /* -3373...3972 */ | |||
a0 += s[2 * i + 1]; /* -3373...4227 */ | |||
a0 = mulhi(a0, 2401) - mulhi(mullo(a0, -2487), 6745); /* -3497...3527 */ | |||
a0 += s[2 * i + 0]; /* -3497...3782 */ | |||
a0 += (a0 >> 15) & 6745; /* 0...6744 */ | |||
a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; | |||
a1 = mullo(a1, -29207); | |||
/* invalid inputs might need reduction mod 7910 */ | |||
a1 -= 7910; | |||
a1 += (a1 >> 15) & 7910; | |||
R3[80] = a0; | |||
R3[81] = a1; | |||
s -= 80; | |||
i = 24; | |||
for (;;) { | |||
A0 = _mm256_loadu_si256((__m256i *) &R4[i]); | |||
S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); | |||
S1 = _mm256_srli_epi16(S0, 8); | |||
S0 &= _mm256_set1_epi16(255); | |||
A0 = sub(mulhiconst(A0, 2401), mulhiconst(mulloconst(A0, -2487), 6745)); /* -3373...3972 */ | |||
A0 = add(A0, S1); /* -3373...4227 */ | |||
A0 = sub(mulhiconst(A0, 2401), mulhiconst(mulloconst(A0, -2487), 6745)); /* -3497...3527 */ | |||
A0 = add(A0, S0); /* -3497...3782 */ | |||
A0 = ifnegaddconst(A0, 6745); /* 0...6744 */ | |||
A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, -29207); | |||
/* invalid inputs might need reduction mod 6745 */ | |||
A1 = ifgesubconst(A1, 6745); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R3 ------> R2: reconstruct mod 163*[1314]+[1541] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R3[81]; | |||
a0 = mulhi(a0, 64) - mulhi(mullo(a0, -12768), 1314); /* -657...673 */ | |||
a0 += s[1 * i + 0]; /* -657...928 */ | |||
a0 += (a0 >> 15) & 1314; /* 0...1313 */ | |||
a1 = (a2 << 7) + ((s[i] - a0) >> 1); | |||
a1 = mullo(a1, -399); | |||
/* invalid inputs might need reduction mod 1541 */ | |||
a1 -= 1541; | |||
a1 += (a1 >> 15) & 1541; | |||
R2[162] = a0; | |||
R2[163] = a1; | |||
s -= 81; | |||
i = 65; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); | |||
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); | |||
A0 = sub(mulhiconst(A0, 64), mulhiconst(mulloconst(A0, -12768), 1314)); /* -657...673 */ | |||
A0 = add(A0, S0); /* -657...928 */ | |||
A0 = ifnegaddconst(A0, 1314); /* 0...1313 */ | |||
A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); | |||
A1 = mulloconst(A1, -399); | |||
/* invalid inputs might need reduction mod 1314 */ | |||
A1 = ifgesubconst(A1, 1314); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R2 ------> R1: reconstruct mod 326*[9277]+[1541] */ | |||
R1[326] = R2[163]; | |||
s -= 326; | |||
i = 147; | |||
for (;;) { | |||
A0 = _mm256_loadu_si256((__m256i *) &R2[i]); | |||
S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); | |||
S1 = _mm256_srli_epi16(S0, 8); | |||
S0 &= _mm256_set1_epi16(255); | |||
A0 = sub(mulhiconst(A0, 4400), mulhiconst(mulloconst(A0, -1808), 9277)); /* -4639...5738 */ | |||
A0 = add(A0, S1); /* -4639...5993 */ | |||
A0 = sub(mulhiconst(A0, 4400), mulhiconst(mulloconst(A0, -1808), 9277)); /* -4950...5040 */ | |||
A0 = add(A0, S0); /* -4950...5295 */ | |||
A0 = ifnegaddconst(A0, 9277); /* 0...9276 */ | |||
A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, -27883); | |||
/* invalid inputs might need reduction mod 9277 */ | |||
A1 = ifgesubconst(A1, 9277); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R1 ------> R0: reconstruct mod 653*[1541] */ | |||
R0[652] = 3 * R1[326] - 2310; | |||
s -= 326; | |||
i = 310; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); | |||
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); | |||
A0 = sub(mulhiconst(A0, 349), mulhiconst(mulloconst(A0, -10887), 1541)); /* -771...857 */ | |||
A0 = add(A0, S0); /* -771...1112 */ | |||
A0 = ifnegaddconst(A0, 1541); /* 0...1540 */ | |||
A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, -10547); | |||
/* invalid inputs might need reduction mod 1541 */ | |||
A1 = ifgesubconst(A1, 1541); | |||
A0 = mulloconst(A0, 3); | |||
A1 = mulloconst(A1, 3); | |||
A0 = subconst(A0, 2310); | |||
A1 = subconst(A1, 2310); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
} |
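The chain of reconstructions above ends by mapping each residue r in [0, 1540] to the coefficient 3*r - 2310, so every decoded value is a multiple of 3 in [-2310, 2310]. A small sanity-check helper illustrating that invariant (hypothetical, not part of the package):

#include <stdint.h>

static int is_valid_653x1541_coeff(int16_t c) {
    /* decoded coefficients are 3*r - 2310 with r in [0, 1540] */
    return c >= -2310 && c <= 2310 && (c + 2310) % 3 == 0;
}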
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X1541_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X1541_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_STRBYTES 865 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,65 @@ | |||
#include "crypto_decode_653x3.h" | |||
#include <immintrin.h> | |||
#define uint8 uint8_t | |||
#define p 653 | |||
#define loops 6 | |||
#define overshoot 29 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3(void *v, const unsigned char *s) { | |||
uint8 *f = v; | |||
int loop; | |||
uint8 *nextf = f + 128 - 4 * overshoot; | |||
const unsigned char *nexts = s + 32 - overshoot; | |||
for (loop = loops; loop > 0; --loop) { | |||
__m256i s0 = _mm256_loadu_si256((const __m256i *) s); | |||
s = nexts; | |||
nexts += 32; | |||
__m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); | |||
s0 &= _mm256_set1_epi8(15); | |||
__m256i a0 = _mm256_unpacklo_epi8(s0, s1); | |||
/* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ | |||
/* 16 16>>4 ... */ | |||
__m256i a1 = _mm256_unpackhi_epi8(s0, s1); | |||
/* 8 8>>4 9 9>>4 10 10>>4 ... */ | |||
/* 24 24>>4 ... */ | |||
__m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); | |||
__m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); | |||
a0 &= _mm256_set1_epi8(3); | |||
a1 &= _mm256_set1_epi8(3); | |||
__m256i b0 = _mm256_unpacklo_epi8(a0, a2); | |||
/* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ | |||
/* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>>6 */ | |||
/* 16 16>>2 16>>4 16>>6 ... */ | |||
__m256i b2 = _mm256_unpackhi_epi8(a0, a2); | |||
/* 4 4>>2 ... */ | |||
__m256i b1 = _mm256_unpacklo_epi8(a1, a3); | |||
/* 8 8>>2 ... */ | |||
__m256i b3 = _mm256_unpackhi_epi8(a1, a3); | |||
/* 12 12>>2 ... */ | |||
__m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); | |||
__m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); | |||
__m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); | |||
__m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); | |||
f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); | |||
f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); | |||
f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); | |||
f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f3); | |||
f = nextf; | |||
nextf += 128; | |||
} | |||
*f = ((uint8)(*s & 3)) - 1; | |||
} |
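The overlapping vector stores above implement a base-4 unpacking: each byte of the 164-byte string holds four 2-bit digits, least-significant digit first, and each digit d decodes to d - 1 in {-1, 0, 1} (stored in a byte). A scalar sketch of the same mapping (hypothetical reference code, not part of the package):

#include <stdint.h>

static void decode_653x3_ref(uint8_t *f, const unsigned char *s) {
    int i;
    for (i = 0; i < 653; ++i) {
        /* 2-bit digit i of the packed string, lsb-first within each byte */
        f[i] = (uint8_t) (((s[i >> 2] >> (2 * (i & 3))) & 3) - 1);
    }
}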
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X3_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653X3_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3_STRBYTES 164 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,16 @@ | |||
#include "crypto_decode_653xint16.h" | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16(void *v, const unsigned char *s) { | |||
uint16_t *x = v; | |||
int i; | |||
for (i = 0; i < 653; ++i) { | |||
uint16_t u0 = s[0]; | |||
uint16_t u1 = s[1]; | |||
u1 <<= 8; | |||
*x = u0 | u1; | |||
x += 1; | |||
s += 2; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT16_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16_STRBYTES 1306 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16_ITEMBYTES 2 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16_ITEMS 653 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,20 @@ | |||
#include "crypto_decode_653xint32.h" | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32(void *v, const unsigned char *s) { | |||
uint32_t *x = v; | |||
int i; | |||
for (i = 0; i < 653; ++i) { | |||
uint32_t u0 = s[0]; | |||
uint32_t u1 = s[1]; | |||
uint32_t u2 = s[2]; | |||
uint32_t u3 = s[3]; | |||
u1 <<= 8; | |||
u2 <<= 16; | |||
u3 <<= 24; | |||
*x = u0 | u1 | u2 | u3; | |||
x += 1; | |||
s += 4; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT32_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_DECODE_653XINT32_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32_STRBYTES 2612 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32_ITEMBYTES 4 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32_ITEMS 653 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,10 @@ | |||
#include "crypto_encode_256x16.h" | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16(unsigned char *s, const void *v) { | |||
const unsigned char *T = v; | |||
int i; | |||
for (i = 0; i < 128; ++i) { | |||
s[i] = T[2 * i] + (T[2 * i + 1] << 4); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X16_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_STRBYTES 128 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,88 @@ | |||
#include "crypto_encode_256x2.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2(unsigned char *s, const void *v) { | |||
__m256i a0 = _mm256_loadu_si256(0 + (__m256i *) v); | |||
__m256i a1 = _mm256_loadu_si256(1 + (__m256i *) v); | |||
__m256i a2 = _mm256_loadu_si256(2 + (__m256i *) v); | |||
__m256i a3 = _mm256_loadu_si256(3 + (__m256i *) v); | |||
__m256i a4 = _mm256_loadu_si256(4 + (__m256i *) v); | |||
__m256i a5 = _mm256_loadu_si256(5 + (__m256i *) v); | |||
__m256i a6 = _mm256_loadu_si256(6 + (__m256i *) v); | |||
__m256i a7 = _mm256_loadu_si256(7 + (__m256i *) v); | |||
__m256i bottom = _mm256_set1_epi8(1); | |||
__m256i zero = _mm256_setzero_si256(); | |||
__m256i b0 = _mm256_cmpgt_epi8(a0 & bottom, zero); | |||
__m256i b1 = _mm256_cmpgt_epi8(a1 & bottom, zero); | |||
__m256i b2 = _mm256_cmpgt_epi8(a2 & bottom, zero); | |||
__m256i b3 = _mm256_cmpgt_epi8(a3 & bottom, zero); | |||
__m256i b4 = _mm256_cmpgt_epi8(a4 & bottom, zero); | |||
__m256i b5 = _mm256_cmpgt_epi8(a5 & bottom, zero); | |||
__m256i b6 = _mm256_cmpgt_epi8(a6 & bottom, zero); | |||
__m256i b7 = _mm256_cmpgt_epi8(a7 & bottom, zero); | |||
int32_t c0 = _mm256_movemask_epi8(b0); | |||
int32_t c1 = _mm256_movemask_epi8(b1); | |||
int32_t c2 = _mm256_movemask_epi8(b2); | |||
int32_t c3 = _mm256_movemask_epi8(b3); | |||
int32_t c4 = _mm256_movemask_epi8(b4); | |||
int32_t c5 = _mm256_movemask_epi8(b5); | |||
int32_t c6 = _mm256_movemask_epi8(b6); | |||
int32_t c7 = _mm256_movemask_epi8(b7); | |||
*s++ = c0; | |||
c0 >>= 8; | |||
*s++ = c0; | |||
c0 >>= 8; | |||
*s++ = c0; | |||
c0 >>= 8; | |||
*s++ = c0; | |||
*s++ = c1; | |||
c1 >>= 8; | |||
*s++ = c1; | |||
c1 >>= 8; | |||
*s++ = c1; | |||
c1 >>= 8; | |||
*s++ = c1; | |||
*s++ = c2; | |||
c2 >>= 8; | |||
*s++ = c2; | |||
c2 >>= 8; | |||
*s++ = c2; | |||
c2 >>= 8; | |||
*s++ = c2; | |||
*s++ = c3; | |||
c3 >>= 8; | |||
*s++ = c3; | |||
c3 >>= 8; | |||
*s++ = c3; | |||
c3 >>= 8; | |||
*s++ = c3; | |||
*s++ = c4; | |||
c4 >>= 8; | |||
*s++ = c4; | |||
c4 >>= 8; | |||
*s++ = c4; | |||
c4 >>= 8; | |||
*s++ = c4; | |||
*s++ = c5; | |||
c5 >>= 8; | |||
*s++ = c5; | |||
c5 >>= 8; | |||
*s++ = c5; | |||
c5 >>= 8; | |||
*s++ = c5; | |||
*s++ = c6; | |||
c6 >>= 8; | |||
*s++ = c6; | |||
c6 >>= 8; | |||
*s++ = c6; | |||
c6 >>= 8; | |||
*s++ = c6; | |||
*s++ = c7; | |||
c7 >>= 8; | |||
*s++ = c7; | |||
c7 >>= 8; | |||
*s++ = c7; | |||
c7 >>= 8; | |||
*s++ = c7; | |||
} |
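The movemask sequence above packs the low bit of each of the 256 input bytes into a 32-byte string, least-significant bit of s[0] first. A scalar sketch of the same packing (hypothetical reference code, not part of the package):

static void encode_256x2_ref(unsigned char *s, const unsigned char *v) {
    int i;
    for (i = 0; i < 32; ++i) {
        s[i] = 0;
    }
    for (i = 0; i < 256; ++i) {
        /* low bit of input byte i becomes bit i of the output string */
        s[i >> 3] |= (unsigned char) ((v[i] & 1) << (i & 7));
    }
}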
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X2_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_256X2_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_STRBYTES 32 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,286 @@ | |||
#include "crypto_encode_653x1541.h" | |||
#include <immintrin.h> | |||
/* auto-generated; do not edit */ | |||
#define int16 int16_t | |||
#define uint16 uint16_t | |||
#define uint32 uint32_t | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541(unsigned char *out, const void *v) { | |||
const int16 *R0 = v; | |||
/* XXX: caller could overlap R with input */ | |||
uint16 R[327]; | |||
long i; | |||
const uint16 *reading; | |||
uint16 *writing; | |||
uint16 r0, r1; | |||
uint32 r2; | |||
uint32 s0; | |||
reading = (uint16 *) R0; | |||
writing = R; | |||
i = 41; | |||
while (i > 0) { | |||
__m256i x, y; | |||
--i; | |||
if (!i) { | |||
reading -= 4; | |||
writing -= 2; | |||
out -= 2; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) reading); | |||
x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); | |||
x &= _mm256_set1_epi16(16383); | |||
x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); | |||
y = x & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1541)); | |||
x = _mm256_add_epi32(y, x); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); | |||
s0 = _mm256_extract_epi32(x, 4); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 = _mm256_extract_epi32(x, 6); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
reading += 16; | |||
writing += 8; | |||
} | |||
R[326] = (((R0[652] + 2310) & 16383) * 10923) >> 15; | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 11; | |||
while (i > 0) { | |||
__m256i x, x2, y, y2; | |||
--i; | |||
if (!i) { | |||
reading -= 26; | |||
writing -= 13; | |||
out -= 26; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) (reading + 0)); | |||
x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); | |||
y = x & _mm256_set1_epi32(65535); | |||
y2 = x2 & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x2 = _mm256_srli_epi32(x2, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9277)); | |||
x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9277)); | |||
x = _mm256_add_epi32(y, x); | |||
x2 = _mm256_add_epi32(y2, x2); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
x2 = _mm256_permute4x64_epi64(x2, 0xd8); | |||
_mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); | |||
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); | |||
reading += 32; | |||
writing += 16; | |||
out += 32; | |||
} | |||
R[163] = R[326]; | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 11; | |||
while (i > 0) { | |||
__m256i x, y; | |||
--i; | |||
if (!i) { | |||
reading -= 12; | |||
writing -= 6; | |||
out -= 6; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) reading); | |||
y = x & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1314)); | |||
x = _mm256_add_epi32(y, x); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); | |||
s0 = _mm256_extract_epi32(x, 4); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 = _mm256_extract_epi32(x, 6); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
reading += 16; | |||
writing += 8; | |||
} | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 3; | |||
while (i > 0) { | |||
__m256i x, x2, y, y2; | |||
--i; | |||
if (!i) { | |||
reading -= 14; | |||
writing -= 7; | |||
out -= 14; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) (reading + 0)); | |||
x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); | |||
y = x & _mm256_set1_epi32(65535); | |||
y2 = x2 & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x2 = _mm256_srli_epi32(x2, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6745)); | |||
x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6745)); | |||
x = _mm256_add_epi32(y, x); | |||
x2 = _mm256_add_epi32(y2, x2); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
x2 = _mm256_permute4x64_epi64(x2, 0xd8); | |||
_mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); | |||
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); | |||
reading += 32; | |||
writing += 16; | |||
out += 32; | |||
} | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 3; | |||
while (i > 0) { | |||
__m256i x, y; | |||
--i; | |||
if (!i) { | |||
reading -= 8; | |||
writing -= 4; | |||
out -= 4; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) reading); | |||
y = x & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(695)); | |||
x = _mm256_add_epi32(y, x); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); | |||
s0 = _mm256_extract_epi32(x, 4); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 = _mm256_extract_epi32(x, 6); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
reading += 16; | |||
writing += 8; | |||
} | |||
R[20] = R[40]; | |||
for (i = 0; i < 10; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)1887; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[10] = R[20]; | |||
for (i = 0; i < 5; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)13910; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[5] = R[10]; | |||
for (i = 0; i < 2; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)2953; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
r0 = R[4]; | |||
r1 = R[5]; | |||
r2 = r0 + r1 * (uint32)2953; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[2] = r2; | |||
r0 = R[0]; | |||
r1 = R[1]; | |||
r2 = r0 + r1 * (uint32)134; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[0] = r2; | |||
R[1] = R[2]; | |||
r0 = R[0]; | |||
r1 = R[1]; | |||
r2 = r0 + r1 * (uint32)71; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[0] = r2; | |||
r0 = R[0]; | |||
*out++ = r0; | |||
r0 >>= 8; | |||
*out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ | |||
} |
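This encoder and the 653x1541 decoder earlier in this export are intended to be exact inverses on valid inputs, i.e. coefficient vectors whose entries are multiples of 3 in [-2310, 2310]. A hypothetical round-trip self-test, assuming both translation units and their headers are linked in:

#include "crypto_decode_653x1541.h"
#include "crypto_encode_653x1541.h"
#include <stdint.h>
#include <string.h>

static int test_653x1541_roundtrip(void) {
    int16_t in[653], out[653];
    unsigned char s[PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_STRBYTES];
    int i;
    for (i = 0; i < 653; ++i) {
        in[i] = (int16_t) (3 * i - 2310);   /* arbitrary valid coefficients: multiples of 3 in range */
    }
    PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541(s, in);
    PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541(out, s);
    return memcmp(in, out, sizeof in) == 0;   /* expect an exact round trip */
}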
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_STRBYTES 865 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541(unsigned char *out, const void *v); | |||
#endif |
@@ -0,0 +1,288 @@ | |||
#include "crypto_encode_653x1541round.h" | |||
#include <immintrin.h> | |||
/* auto-generated; do not edit */ | |||
#define int16 int16_t | |||
#define uint16 uint16_t | |||
#define uint32 uint32_t | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round(unsigned char *out, const void *v) { | |||
const int16 *R0 = v; | |||
/* XXX: caller could overlap R with input */ | |||
uint16 R[327]; | |||
long i; | |||
const uint16 *reading; | |||
uint16 *writing; | |||
uint16 r0, r1; | |||
uint32 r2; | |||
uint32 s0; | |||
reading = (uint16 *) R0; | |||
writing = R; | |||
i = 41; | |||
while (i > 0) { | |||
__m256i x, y; | |||
--i; | |||
if (!i) { | |||
reading -= 4; | |||
writing -= 2; | |||
out -= 2; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) reading); | |||
x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); | |||
x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); | |||
x = _mm256_add_epi16(x, _mm256_set1_epi16(2310)); | |||
x &= _mm256_set1_epi16(16383); | |||
x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); | |||
y = x & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1541)); | |||
x = _mm256_add_epi32(y, x); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); | |||
s0 = _mm256_extract_epi32(x, 4); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 = _mm256_extract_epi32(x, 6); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
reading += 16; | |||
writing += 8; | |||
} | |||
R[326] = (((3 * ((10923 * R0[652] + 16384) >> 15) + 2310) & 16383) * 10923) >> 15; | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 11; | |||
while (i > 0) { | |||
__m256i x, x2, y, y2; | |||
--i; | |||
if (!i) { | |||
reading -= 26; | |||
writing -= 13; | |||
out -= 26; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) (reading + 0)); | |||
x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); | |||
y = x & _mm256_set1_epi32(65535); | |||
y2 = x2 & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x2 = _mm256_srli_epi32(x2, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9277)); | |||
x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9277)); | |||
x = _mm256_add_epi32(y, x); | |||
x2 = _mm256_add_epi32(y2, x2); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
x2 = _mm256_permute4x64_epi64(x2, 0xd8); | |||
_mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); | |||
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); | |||
reading += 32; | |||
writing += 16; | |||
out += 32; | |||
} | |||
R[163] = R[326]; | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 11; | |||
while (i > 0) { | |||
__m256i x, y; | |||
--i; | |||
if (!i) { | |||
reading -= 12; | |||
writing -= 6; | |||
out -= 6; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) reading); | |||
y = x & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1314)); | |||
x = _mm256_add_epi32(y, x); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); | |||
s0 = _mm256_extract_epi32(x, 4); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 = _mm256_extract_epi32(x, 6); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
reading += 16; | |||
writing += 8; | |||
} | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 3; | |||
while (i > 0) { | |||
__m256i x, x2, y, y2; | |||
--i; | |||
if (!i) { | |||
reading -= 14; | |||
writing -= 7; | |||
out -= 14; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) (reading + 0)); | |||
x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); | |||
y = x & _mm256_set1_epi32(65535); | |||
y2 = x2 & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x2 = _mm256_srli_epi32(x2, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6745)); | |||
x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6745)); | |||
x = _mm256_add_epi32(y, x); | |||
x2 = _mm256_add_epi32(y2, x2); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, | |||
15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
x2 = _mm256_permute4x64_epi64(x2, 0xd8); | |||
_mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); | |||
_mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); | |||
reading += 32; | |||
writing += 16; | |||
out += 32; | |||
} | |||
reading = (uint16 *) R; | |||
writing = R; | |||
i = 3; | |||
while (i > 0) { | |||
__m256i x, y; | |||
--i; | |||
if (!i) { | |||
reading -= 8; | |||
writing -= 4; | |||
out -= 4; | |||
} | |||
x = _mm256_loadu_si256((__m256i *) reading); | |||
y = x & _mm256_set1_epi32(65535); | |||
x = _mm256_srli_epi32(x, 16); | |||
x = _mm256_mullo_epi32(x, _mm256_set1_epi32(695)); | |||
x = _mm256_add_epi32(y, x); | |||
x = _mm256_shuffle_epi8(x, _mm256_set_epi8( | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, | |||
12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 | |||
)); | |||
x = _mm256_permute4x64_epi64(x, 0xd8); | |||
_mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); | |||
s0 = _mm256_extract_epi32(x, 4); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 = _mm256_extract_epi32(x, 6); | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
s0 >>= 8; | |||
*out++ = s0; | |||
reading += 16; | |||
writing += 8; | |||
} | |||
R[20] = R[40]; | |||
for (i = 0; i < 10; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)1887; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[10] = R[20]; | |||
for (i = 0; i < 5; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)13910; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[5] = R[10]; | |||
for (i = 0; i < 2; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)2953; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
r0 = R[4]; | |||
r1 = R[5]; | |||
r2 = r0 + r1 * (uint32)2953; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[2] = r2; | |||
r0 = R[0]; | |||
r1 = R[1]; | |||
r2 = r0 + r1 * (uint32)134; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[0] = r2; | |||
R[1] = R[2]; | |||
r0 = R[0]; | |||
r1 = R[1]; | |||
r2 = r0 + r1 * (uint32)71; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[0] = r2; | |||
r0 = R[0]; | |||
*out++ = r0; | |||
r0 >>= 8; | |||
*out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ | |||
} |
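The only difference from the non-round variant above it is the first step: each coefficient is rounded to the nearest multiple of 3 before packing, via _mm256_mulhrs_epi16(x, 10923) followed by multiplication by 3 (the scalar tail for R0[652] uses the same formula). A scalar sketch of that rounding step (hypothetical helper, assuming inputs in the valid coefficient range):

#include <stdint.h>

static int16_t round3_ref(int16_t x) {
    int16_t r = (int16_t) (((int32_t) x * 10923 + 16384) >> 15);   /* ~ round(x / 3) */
    return (int16_t) (3 * r);                                      /* nearest multiple of 3 */
}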
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541ROUND_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X1541ROUND_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round_STRBYTES 865 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round(unsigned char *out, const void *v); | |||
#endif |
@@ -0,0 +1,64 @@ | |||
#include "crypto_encode_653x3.h" | |||
#include <immintrin.h> | |||
#define uint8 uint8_t | |||
#define p 653 | |||
#define loops 6 | |||
#define overshoot 29 | |||
static const union { | |||
uint8 init[32]; | |||
__m256i val; | |||
} lobytes_buf = { .init = { | |||
255, 0, 255, 0, 255, 0, 255, 0, | |||
255, 0, 255, 0, 255, 0, 255, 0, | |||
255, 0, 255, 0, 255, 0, 255, 0, | |||
255, 0, 255, 0, 255, 0, 255, 0, | |||
} | |||
}; | |||
#define lobytes (lobytes_buf.val) | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3(unsigned char *s, const void *v) { | |||
const uint8 *f = v; | |||
int loop; | |||
const uint8 *nextf = f + 128 - 4 * overshoot; | |||
unsigned char *nexts = s + 32 - overshoot; | |||
for (loop = loops; loop > 0; --loop) { | |||
__m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); | |||
__m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); | |||
__m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); | |||
__m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); | |||
f = nextf; | |||
nextf += 128; | |||
__m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); | |||
/* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ | |||
/* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ | |||
__m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); | |||
/* 1 3 ... */ | |||
__m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); | |||
__m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); | |||
a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); | |||
a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); | |||
__m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); | |||
/* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ | |||
/* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ | |||
__m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); | |||
/* 2 6 ... */ | |||
b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); | |||
b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); | |||
b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); | |||
_mm256_storeu_si256((__m256i *) s, b0); | |||
s = nexts; | |||
nexts += 32; | |||
} | |||
*s++ = *f++ + 1; | |||
} |
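Stripped of the vector shuffling, this encoder packs four coefficients per byte in base 4, least-significant digit first, after shifting each value from {-1, 0, 1} to {0, 1, 2}; the 653rd coefficient occupies the low two bits of the final byte. A scalar sketch (hypothetical reference code, not part of the package):

#include <stdint.h>

static void encode_653x3_ref(unsigned char *s, const uint8_t *f) {
    int i;
    for (i = 0; i < 163; ++i) {
        s[i] = (unsigned char) ((((unsigned) f[4 * i]     + 1) & 3)
                             | ((((unsigned) f[4 * i + 1] + 1) & 3) << 2)
                             | ((((unsigned) f[4 * i + 2] + 1) & 3) << 4)
                             | ((((unsigned) f[4 * i + 3] + 1) & 3) << 6));
    }
    s[163] = (unsigned char) (((unsigned) f[652] + 1) & 3);   /* last, partially used byte */
}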
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X3_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653X3_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_STRBYTES 164 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,13 @@ | |||
#include "crypto_encode_653xint16.h" | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16(unsigned char *s, const void *v) { | |||
const uint16_t *x = v; | |||
int i; | |||
for (i = 0; i < 653; ++i) { | |||
uint16_t u = *x++; | |||
*s++ = u; | |||
*s++ = u >> 8; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653XINT16_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_ENCODE_653XINT16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16_STRBYTES 1306 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16_ITEMBYTES 2 | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16_ITEMS 653 | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_sort_int32(int32_t *x, size_t n); | |||
#endif |
@@ -0,0 +1,20 @@ | |||
#include "crypto_sort_int32.h" | |||
#include "crypto_sort_uint32.h" | |||
#include <stdint.h> | |||
#define uint32 uint32_t | |||
/* can save time by vectorizing xor loops */ | |||
/* can save time by integrating xor loops with int32_sort */ | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(uint32_t *array, size_t n) { | |||
uint32 *x = array; | |||
size_t j; | |||
for (j = 0; j < n; ++j) { | |||
x[j] ^= 0x80000000; | |||
} | |||
PQCLEAN_NTRULPR653_AVX2_crypto_sort_int32((int32_t *)array, n); | |||
for (j = 0; j < n; ++j) { | |||
x[j] ^= 0x80000000; | |||
} | |||
} |
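Flipping bit 31 maps unsigned order onto signed order (0x00000000 becomes INT32_MIN, 0xffffffff becomes INT32_MAX), so sorting the flipped values as int32 and flipping back yields an ascending unsigned sort. A hypothetical usage sketch:

#include "crypto_sort_uint32.h"
#include <stdint.h>

static void sort_uint32_demo(void) {
    uint32_t a[4] = { 0xffffffffu, 3u, 0x80000000u, 7u };
    PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(a, 4);
    /* a is now { 3, 7, 0x80000000, 0xffffffff } */
}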
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT_UINT32_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_SORT_UINT32_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(uint32_t *array, size_t n); | |||
#endif |
@@ -0,0 +1,15 @@ | |||
#include "crypto_stream_aes256ctr.h" | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]) { | |||
aes256ctx state; | |||
aes256_ctr_keyexp(&state, key); | |||
aes256_ctr(out, outlen, nonce, &state); | |||
aes256_ctx_release(&state); | |||
return 0; | |||
} |
@@ -0,0 +1,15 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_STREAM_AES256CTR_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_STREAM_AES256CTR_H | |||
#include "aes.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]); | |||
#endif |
@@ -0,0 +1,36 @@ | |||
#include "crypto_verify_1025.h" | |||
#include <immintrin.h> | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025(const unsigned char *x, const unsigned char *y) { | |||
__m256i diff = _mm256_set1_epi8(0); | |||
unsigned int differentbits = 0; | |||
int i = PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025_BYTES; | |||
i -= 32; | |||
for (;;) { | |||
do { | |||
__m256i x0 = _mm256_loadu_si256((__m256i *) x); | |||
__m256i y0 = _mm256_loadu_si256((__m256i *) y); | |||
diff |= x0 ^ y0; | |||
i -= 32; | |||
x += 32; | |||
y += 32; | |||
} while (i >= 0); | |||
if (i <= -32) { | |||
break; | |||
} | |||
x += i; | |||
y += i; | |||
} | |||
diff |= _mm256_srli_epi16(diff, 8); | |||
diff |= _mm256_srli_epi32(diff, 16); | |||
diff |= _mm256_srli_epi64(diff, 32); | |||
differentbits = _mm256_extract_epi8(diff, 0); | |||
differentbits |= _mm256_extract_epi8(diff, 8); | |||
differentbits |= _mm256_extract_epi8(diff, 16); | |||
differentbits |= _mm256_extract_epi8(diff, 24); | |||
return (int) (1 & ((differentbits - 1) >> 8)) - 1; | |||
} |
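The return convention is 0 when the two 1025-byte strings are equal and -1 otherwise, with a data-independent access pattern. A scalar sketch of the same comparison (hypothetical reference code, not part of the package):

static int verify_1025_ref(const unsigned char *x, const unsigned char *y) {
    unsigned int differentbits = 0;
    int i;
    for (i = 0; i < 1025; ++i) {
        differentbits |= (unsigned int) (x[i] ^ y[i]);
    }
    return (int) (1 & ((differentbits - 1) >> 8)) - 1;   /* 0 if equal, -1 otherwise */
}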
@@ -0,0 +1,8 @@ | |||
#ifndef PQCLEAN_NTRULPR653_AVX2_CRYPTO_VERIFY_1025_H | |||
#define PQCLEAN_NTRULPR653_AVX2_CRYPTO_VERIFY_1025_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025_BYTES 1025 | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025(const unsigned char *x, const unsigned char *y); | |||
#endif |
@@ -0,0 +1,287 @@ | |||
#include "api.h" | |||
#include "crypto_sort_uint32.h" | |||
#include "crypto_stream_aes256ctr.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "sha2.h" | |||
#define int8 int8_t | |||
#define int16 int16_t | |||
#define int32 int32_t | |||
#define uint16 uint16_t | |||
#define uint32 uint32_t | |||
#define uint64 uint64_t | |||
/* ----- masks */ | |||
/* return -1 if x<0; otherwise return 0 */ | |||
static int int16_negative_mask(int16 x) { | |||
uint16 u = x; | |||
u >>= 15; | |||
return -(int) u; | |||
/* alternative with gcc -fwrapv: */ | |||
/* x>>15 compiles to CPU's arithmetic right shift */ | |||
} | |||
/* ----- arithmetic mod 3 */ | |||
typedef int8 small; | |||
/* F3 is always represented as -1,0,1 */ | |||
/* ----- arithmetic mod q */ | |||
#define q12 ((q-1)/2) | |||
typedef int16 Fq; | |||
/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ | |||
/* assumes twos complement; use, e.g., gcc -fwrapv */ | |||
static Fq Fq_freeze(int32 x) { | |||
x -= q * ((q18 * x) >> 18); | |||
x -= q * ((q27 * x + 67108864) >> 27); | |||
return x; | |||
} | |||
/* works for all uint32 x */ | |||
static Fq Fq_bigfreeze(uint32 x) { | |||
x -= q * ((x * (uint64)q31) >> 31); | |||
x -= q * ((x * (uint64)q31) >> 31); | |||
x -= q; | |||
x += (-(x >> 31)) & (uint32)q; | |||
return x; | |||
} | |||
/* ----- Top and Right */ | |||
static int8 Top(Fq C) { | |||
return (tau1 * (int32)(C + tau0) + 16384) >> 15; | |||
} | |||
static Fq Right(int8 T) { | |||
return Fq_freeze(tau3 * (int32)T - tau2); | |||
} | |||
/* ----- polynomials mod q */ | |||
/* h = h*g in the ring Rq */ | |||
static void Rq_mult_small(Fq *h, const small *g) { | |||
crypto_encode_pxint16((unsigned char *) h, h); | |||
crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); | |||
crypto_decode_pxint16(h, (const unsigned char *) h); | |||
} | |||
/* ----- sorting to generate short polynomial */ | |||
static void Short_fromlist(small *out, const uint32 *in) { | |||
uint32 L[ppadsort]; | |||
int i; | |||
for (i = 0; i < w; ++i) { | |||
L[i] = in[i] & (uint32) - 2; | |||
} | |||
for (i = w; i < p; ++i) { | |||
L[i] = (in[i] & (uint32) - 3) | 1; | |||
} | |||
for (i = p; i < ppadsort; ++i) { | |||
L[i] = 0xffffffff; | |||
} | |||
PQCLEAN_NTRULPR653_AVX2_crypto_sort_uint32(L, ppadsort); | |||
for (i = 0; i < p; ++i) { | |||
out[i] = (L[i] & 3) - 1; | |||
} | |||
} | |||
/* ----- underlying hash function */ | |||
#define Hash_bytes 32 | |||
static void Hash(unsigned char *out, const unsigned char *in, int inlen) { | |||
unsigned char h[64]; | |||
int i; | |||
sha512(h, in, inlen); | |||
for (i = 0; i < 32; ++i) { | |||
out[i] = h[i]; | |||
} | |||
} | |||
/* ----- higher-level randomness */ | |||
static void Short_random(small *out) { | |||
uint32 L[p]; | |||
randombytes((unsigned char *) L, sizeof L); | |||
crypto_decode_pxint32(L, (unsigned char *) L); | |||
Short_fromlist(out, L); | |||
} | |||
/* ----- Inputs, Generator */ | |||
typedef int8 Inputs[I]; /* passed by reference */ | |||
static const unsigned char aes_nonce[16] = {0}; | |||
/* G = Generator(pk) */ | |||
static void Generator(Fq *G, const unsigned char *pk) { | |||
uint32 L[p]; | |||
int i; | |||
PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); | |||
crypto_decode_pxint32(L, (unsigned char *) L); | |||
for (i = 0; i < p; ++i) { | |||
G[i] = Fq_bigfreeze(L[i]) - q12; | |||
} | |||
} | |||
/* ----- NTRU LPRime */ | |||
#define Seeds_bytes 32 | |||
#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) | |||
#define SecretKeys_bytes Small_bytes | |||
#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) | |||
#define Confirm_bytes 32 | |||
/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ | |||
static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { | |||
small b[p]; | |||
int i; | |||
Inputs_encode(r_enc + 1, r); | |||
{ | |||
unsigned char h[Hash_bytes]; | |||
uint32 L[p]; | |||
{ | |||
unsigned char s[1 + Inputs_bytes]; | |||
Inputs_encode(s + 1, r); | |||
s[0] = 5; | |||
Hash(h, s, sizeof s); | |||
} | |||
PQCLEAN_NTRULPR653_AVX2_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); | |||
crypto_decode_pxint32(L, (unsigned char *) L); | |||
Short_fromlist(b, L); | |||
} | |||
{ | |||
Fq bG[p]; | |||
Generator(bG, pk); | |||
Rq_mult_small(bG, b); | |||
Round_and_encode(c, bG); | |||
c += Rounded_bytes; | |||
} | |||
{ | |||
Fq bA[p]; | |||
int8 T[I]; | |||
Rounded_decode(bA, pk + Seeds_bytes); | |||
Rq_mult_small(bA, b); | |||
for (i = 0; i < I; ++i) { | |||
T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); | |||
} | |||
Top_encode(c, T); | |||
c += Top_bytes; | |||
} | |||
{ | |||
unsigned char x[1 + Inputs_bytes + Hash_bytes]; | |||
for (i = 0; i < Inputs_bytes; ++i) { | |||
x[1 + i] = r_enc[1 + i]; | |||
} | |||
for (i = 0; i < Hash_bytes; ++i) { | |||
x[1 + Inputs_bytes + i] = cache[i]; | |||
} | |||
x[0] = 2; | |||
Hash(c, x, sizeof x); | |||
} | |||
} | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
Fq aG[p]; | |||
int i; | |||
randombytes(pk, Seeds_bytes); | |||
Generator(aG, pk); | |||
{ | |||
small a[p]; | |||
Short_random(a); | |||
Rq_mult_small(aG, a); | |||
Small_encode(sk, a); | |||
} | |||
Round_and_encode(pk + Seeds_bytes, aG); | |||
{ | |||
unsigned char sksave = sk[SecretKeys_bytes - 1]; | |||
for (i = 0; i < PublicKeys_bytes; ++i) { | |||
sk[SecretKeys_bytes + i] = pk[i]; | |||
} | |||
sk[SecretKeys_bytes - 1] = 4; | |||
Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); | |||
sk[SecretKeys_bytes - 1] = sksave; | |||
randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); | |||
} | |||
return 0; | |||
} | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { | |||
int i; | |||
unsigned char cache[Hash_bytes]; | |||
{ | |||
unsigned char y[1 + PublicKeys_bytes]; | |||
for (i = 0; i < PublicKeys_bytes; ++i) { | |||
y[1 + i] = pk[i]; | |||
} | |||
y[0] = 4; | |||
Hash(cache, y, sizeof y); | |||
} | |||
Inputs r; | |||
{ | |||
unsigned char s[Inputs_bytes]; | |||
randombytes(s, sizeof s); | |||
Inputs_decode(r, s); | |||
} | |||
{ | |||
unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; | |||
Hide(c, x, r, pk, cache); | |||
for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { | |||
x[1 + Inputs_bytes + i] = c[i]; | |||
} | |||
x[0] = 1; | |||
Hash(k, x, sizeof x); | |||
} | |||
return 0; | |||
} | |||
int PQCLEAN_NTRULPR653_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { | |||
const unsigned char *pk = sk + SecretKeys_bytes; | |||
const unsigned char *rho = pk + PublicKeys_bytes; | |||
const unsigned char *cache = rho + Inputs_bytes; | |||
Inputs r; | |||
int i; | |||
{ | |||
Fq aB[p]; | |||
Rounded_decode(aB, c); | |||
{ | |||
small a[p]; | |||
Small_decode(a, sk); | |||
Rq_mult_small(aB, a); | |||
} | |||
{ | |||
int8 T[I]; | |||
Top_decode(T, c + Rounded_bytes); | |||
for (i = 0; i < I; ++i) { | |||
r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); | |||
} | |||
} | |||
} | |||
{ | |||
unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; | |||
int mask; | |||
unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; | |||
Hide(cnew, x, r, pk, cache); | |||
mask = crypto_verify_clen(c, cnew); | |||
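/* mask is 0 if the re-encryption matches c and -1 otherwise; the loop below
   keeps the recovered r on success or substitutes the stored random rho on
   failure (implicit rejection), and x[0] = 1 + mask separates the two cases
   before hashing the session key */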
for (i = 0; i < Inputs_bytes; ++i) { | |||
x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); | |||
} | |||
for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { | |||
x[1 + Inputs_bytes + i] = c[i]; | |||
} | |||
x[0] = 1 + mask; | |||
Hash(k, x, sizeof x); | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,61 @@ | |||
#ifndef params_H | |||
#define params_H | |||
#include "crypto_core_multsntrup653.h" | |||
#include "crypto_decode_256x16.h" | |||
#include "crypto_decode_256x2.h" | |||
#include "crypto_decode_653x1541.h" | |||
#include "crypto_decode_653x3.h" | |||
#include "crypto_decode_653xint16.h" | |||
#include "crypto_decode_653xint32.h" | |||
#include "crypto_encode_256x16.h" | |||
#include "crypto_encode_256x2.h" | |||
#include "crypto_encode_653x1541.h" | |||
#include "crypto_encode_653x1541round.h" | |||
#include "crypto_encode_653x3.h" | |||
#include "crypto_encode_653xint16.h" | |||
#include "crypto_verify_1025.h" | |||
#define p 653 | |||
#define q 4621 | |||
#define w 252 | |||
#define tau0 2175 | |||
#define tau1 113 | |||
#define tau2 2031 | |||
#define tau3 290 | |||
#define I 256 | |||
#define ppadsort 653 | |||
#define q18 57 /* round(2^18/q) */ | |||
#define q27 29045 /* round(2^27/q) */ | |||
#define q31 464722 /* floor(2^31/q) */ | |||
#define crypto_verify_clen PQCLEAN_NTRULPR653_AVX2_crypto_verify_1025 | |||
#define Rounded_bytes PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541_STRBYTES | |||
#define Rounded_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x1541 | |||
#define Round_and_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x1541round | |||
#define Small_bytes PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3_STRBYTES | |||
#define Small_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_653x3 | |||
#define Small_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_653x3 | |||
#define Top_bytes PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16_STRBYTES | |||
#define Top_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x16 | |||
#define Top_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x16 | |||
#define Inputs_bytes PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2_STRBYTES | |||
#define Inputs_encode PQCLEAN_NTRULPR653_AVX2_crypto_encode_256x2 | |||
#define Inputs_decode PQCLEAN_NTRULPR653_AVX2_crypto_decode_256x2 | |||
#define crypto_decode_pxint32 PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint32 | |||
#define crypto_decode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_decode_653xint16 | |||
#define crypto_encode_pxint16 PQCLEAN_NTRULPR653_AVX2_crypto_encode_653xint16 | |||
#define crypto_core_mult PQCLEAN_NTRULPR653_AVX2_crypto_core_multsntrup653 | |||
#endif |
@@ -0,0 +1 @@ | |||
Public Domain |
@@ -0,0 +1,19 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libntrulpr653_clean.a | |||
HEADERS=api.h crypto_core_multsntrup653.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_653x1541.h crypto_decode_653x3.h crypto_decode_653xint16.h crypto_decode_653xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_653x1541.h crypto_encode_653x1541round.h crypto_encode_653x3.h crypto_encode_653xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1025.h params.h | |||
OBJECTS=crypto_core_multsntrup653.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_653x1541.o crypto_decode_653x3.o crypto_decode_653xint16.o crypto_decode_653xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_653x1541.o crypto_encode_653x1541round.o crypto_encode_653x3.o crypto_encode_653xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1025.o kem.o | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) |
@@ -0,0 +1,19 @@ | |||
# This Makefile can be used with Microsoft Visual Studio's nmake using the command: | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libntrulpr653_clean.lib | |||
OBJECTS=crypto_core_multsntrup653.obj crypto_decode_256x16.obj crypto_decode_256x2.obj crypto_decode_653x1541.obj crypto_decode_653x3.obj crypto_decode_653xint16.obj crypto_decode_653xint32.obj crypto_encode_256x16.obj crypto_encode_256x2.obj crypto_encode_653x1541.obj crypto_encode_653x1541round.obj crypto_encode_653x3.obj crypto_encode_653xint16.obj crypto_sort_int32.obj crypto_sort_uint32.obj crypto_stream_aes256ctr.obj crypto_verify_1025.obj kem.obj | |||
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX | |||
all: $(LIBRARY) | |||
# Make sure objects are recompiled if headers change. | |||
$(OBJECTS): *.h | |||
$(LIBRARY): $(OBJECTS) | |||
LIB.EXE /NOLOGO /WX /OUT:$@ $** | |||
clean: | |||
-DEL $(OBJECTS) | |||
-DEL $(LIBRARY) |
@@ -0,0 +1,16 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_API_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_API_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ALGNAME "ntrulpr653" | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SECRETKEYBYTES 1125 | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_PUBLICKEYBYTES 897 | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CIPHERTEXTBYTES 1025 | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_BYTES 32 | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); | |||
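/*
 * illustrative usage sketch (not part of the library; the randombytes
 * provider and error handling are assumed to be supplied by the caller):
 *
 *   unsigned char pk[PQCLEAN_NTRULPR653_CLEAN_CRYPTO_PUBLICKEYBYTES];
 *   unsigned char sk[PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SECRETKEYBYTES];
 *   unsigned char ct[PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CIPHERTEXTBYTES];
 *   unsigned char k1[PQCLEAN_NTRULPR653_CLEAN_CRYPTO_BYTES];
 *   unsigned char k2[PQCLEAN_NTRULPR653_CLEAN_CRYPTO_BYTES];
 *
 *   PQCLEAN_NTRULPR653_CLEAN_crypto_kem_keypair(pk, sk);
 *   PQCLEAN_NTRULPR653_CLEAN_crypto_kem_enc(ct, k1, pk);
 *   PQCLEAN_NTRULPR653_CLEAN_crypto_kem_dec(k2, ct, sk);
 *   ...k1 and k2 now hold the same 32-byte shared secret...
 */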
#endif |
@@ -0,0 +1,60 @@ | |||
#include "crypto_core_multsntrup653.h" | |||
#include "params.h" | |||
#define int8 int8_t | |||
#define int16 int16_t | |||
#define int32 int32_t | |||
typedef int8 small; | |||
typedef int16 Fq; | |||
/* always represented as -(q-1)/2...(q-1)/2 */ | |||
/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ | |||
static Fq Fq_freeze(int32 x) { | |||
x -= q * ((q18 * x) >> 18); | |||
x -= q * ((q27 * x + 67108864) >> 27); | |||
return x; | |||
} | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { | |||
Fq f[p]; | |||
small g[p]; | |||
Fq fg[p + p - 1]; | |||
int32 result; | |||
int i, j; | |||
crypto_decode_pxint16(f, inbytes); | |||
for (i = 0; i < p; ++i) { | |||
f[i] = Fq_freeze(f[i]); | |||
} | |||
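/* each key byte encodes one coefficient of g in {-1,0,1}: bytes 0, 1 and 255
   map to 0, 1 and -1 without branching (assuming the usual two's-complement
   narrowing to int8) */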
for (i = 0; i < p; ++i) { | |||
small gi = kbytes[i]; | |||
small gi0 = gi & 1; | |||
g[i] = gi0 - (gi & (gi0 << 1)); | |||
} | |||
for (i = 0; i < p; ++i) { | |||
result = 0; | |||
for (j = 0; j <= i; ++j) { | |||
result += f[j] * (int32)g[i - j]; | |||
} | |||
fg[i] = Fq_freeze(result); | |||
} | |||
for (i = p; i < p + p - 1; ++i) { | |||
result = 0; | |||
for (j = i - p + 1; j < p; ++j) { | |||
result += f[j] * (int32)g[i - j]; | |||
} | |||
fg[i] = Fq_freeze(result); | |||
} | |||
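/* reduce the degree-(2p-2) product modulo x^p - x - 1: every coefficient
   fg[i] with i >= p is folded into fg[i-p] (from the constant term of x^p)
   and fg[i-p+1] (from the x term) */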
for (i = p + p - 2; i >= p; --i) { | |||
fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); | |||
fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); | |||
} | |||
crypto_encode_pxint16(outbytes, fg); | |||
return 0; | |||
} |
@@ -0,0 +1,11 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CORE_MULTSNTRUP653_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_CORE_MULTSNTRUP653_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_OUTPUTBYTES 1306 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_INPUTBYTES 1306 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_KEYBYTES 653 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653_CONSTBYTES 0 | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#include "crypto_decode_256x16.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s) { | |||
unsigned char *T = v; | |||
int i; | |||
for (i = 0; i < 128; ++i) { | |||
T[2 * i] = s[i] & 15; | |||
T[2 * i + 1] = s[i] >> 4; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X16_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16_STRBYTES 128 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,10 @@ | |||
#include "crypto_decode_256x2.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s) { | |||
unsigned char *r = v; | |||
int i; | |||
for (i = 0; i < 256; ++i) { | |||
r[i] = 1 & (s[i >> 3] >> (i & 7)); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X2_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_256X2_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2_STRBYTES 32 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,200 @@ | |||
#include "crypto_decode_653x1541.h" | |||
/* auto-generated; do not edit */ | |||
#define int16 int16_t | |||
#define uint16 uint16_t | |||
#define uint32 uint32_t | |||
#define uint64 uint64_t | |||
/* | |||
CPU division instruction typically takes time depending on x. | |||
This software is designed to take time independent of x. | |||
Time still varies depending on m; user must ensure that m is constant. | |||
Time also varies on CPUs where multiplication is variable-time. | |||
There could be more CPU issues. | |||
There could also be compiler issues. | |||
*/ | |||
static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { | |||
uint32 v = 0x80000000; | |||
uint32 qpart; | |||
uint32 mask; | |||
v /= m; | |||
/* caller guarantees m > 0 */ | |||
/* caller guarantees m < 16384 */ | |||
/* vm <= 2^31 <= vm+m-1 */ | |||
/* xvm <= 2^31 x <= xvm+x(m-1) */ | |||
*q = 0; | |||
qpart = (x * (uint64)v) >> 31; | |||
/* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ | |||
/* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ | |||
/* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ | |||
/* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ | |||
/* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ | |||
/* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ | |||
x -= qpart * m; | |||
*q += qpart; | |||
/* x <= 49146 */ | |||
qpart = (x * (uint64)v) >> 31; | |||
/* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ | |||
/* 0 <= newx <= m + 49146(2^14-1)/2^31 */ | |||
/* 0 <= newx <= m + 0.4 */ | |||
/* 0 <= newx <= m */ | |||
x -= qpart * m; | |||
*q += qpart; | |||
/* x <= m */ | |||
x -= m; | |||
*q += 1; | |||
mask = -(x >> 31); | |||
x += mask & (uint32)m; | |||
*q += mask; | |||
/* x < m */ | |||
*r = x; | |||
} | |||
static uint16 uint32_mod_uint14(uint32 x, uint16 m) { | |||
uint32 q; | |||
uint16 r; | |||
uint32_divmod_uint14(&q, &r, x, m); | |||
return r; | |||
} | |||
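/* the decoder below reverses the mixed-radix packing of
   crypto_encode_653x1541: working from the end of the byte string it
   rebuilds the intermediate limbs, splitting each accumulated value by the
   same radix the encoder used at that level, and finally maps every
   base-1541 digit back to a coefficient 3*digit - 2310; the extra
   uint32_mod_uint14 calls only matter for out-of-range (invalid) encodings */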
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541(void *v, const unsigned char *s) { | |||
int16 *R0 = v; | |||
uint16 R1[327], R2[164], R3[82], R4[41], R5[21], R6[11], R7[6], R8[3], R9[2], R10[1]; | |||
long long i; | |||
uint16 r0; | |||
uint32 r1, r2; | |||
s += PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_STRBYTES; | |||
r1 = 0; | |||
r1 = (r1 << 8) | *--s; | |||
r1 = (r1 << 8) | *--s; | |||
r1 = uint32_mod_uint14(r1, 2608); /* needed only for invalid inputs */ | |||
R10[0] = r1; | |||
r2 = R10[0]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 71); | |||
R9[0] = r0; | |||
r1 = uint32_mod_uint14(r1, 9402); /* needed only for invalid inputs */ | |||
R9[1] = r1; | |||
R8[2] = R9[1]; | |||
r2 = R9[0]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 134); | |||
R8[0] = r0; | |||
r1 = uint32_mod_uint14(r1, 134); /* needed only for invalid inputs */ | |||
R8[1] = r1; | |||
r2 = R8[2]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 2953); | |||
R7[4] = r0; | |||
r1 = uint32_mod_uint14(r1, 815); /* needed only for invalid inputs */ | |||
R7[5] = r1; | |||
for (i = 1; i >= 0; --i) { | |||
r2 = R8[i]; | |||
r2 = (r2 << 8) | *--s; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 2953); | |||
R7[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 2953); /* needed only for invalid inputs */ | |||
R7[2 * i + 1] = r1; | |||
} | |||
R6[10] = R7[5]; | |||
for (i = 4; i >= 0; --i) { | |||
r2 = R7[i]; | |||
r2 = (r2 << 8) | *--s; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 13910); | |||
R6[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 13910); /* needed only for invalid inputs */ | |||
R6[2 * i + 1] = r1; | |||
} | |||
R5[20] = R6[10]; | |||
for (i = 9; i >= 0; --i) { | |||
r2 = R6[i]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 1887); | |||
R5[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 1887); /* needed only for invalid inputs */ | |||
R5[2 * i + 1] = r1; | |||
} | |||
R4[40] = R5[20]; | |||
for (i = 19; i >= 0; --i) { | |||
r2 = R5[i]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 695); | |||
R4[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 695); /* needed only for invalid inputs */ | |||
R4[2 * i + 1] = r1; | |||
} | |||
r2 = R4[40]; | |||
r2 = (r2 << 8) | *--s; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 6745); | |||
R3[80] = r0; | |||
r1 = uint32_mod_uint14(r1, 7910); /* needed only for invalid inputs */ | |||
R3[81] = r1; | |||
for (i = 39; i >= 0; --i) { | |||
r2 = R4[i]; | |||
r2 = (r2 << 8) | *--s; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 6745); | |||
R3[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 6745); /* needed only for invalid inputs */ | |||
R3[2 * i + 1] = r1; | |||
} | |||
r2 = R3[81]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 1314); | |||
R2[162] = r0; | |||
r1 = uint32_mod_uint14(r1, 1541); /* needed only for invalid inputs */ | |||
R2[163] = r1; | |||
for (i = 80; i >= 0; --i) { | |||
r2 = R3[i]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 1314); | |||
R2[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 1314); /* needed only for invalid inputs */ | |||
R2[2 * i + 1] = r1; | |||
} | |||
R1[326] = R2[163]; | |||
for (i = 162; i >= 0; --i) { | |||
r2 = R2[i]; | |||
r2 = (r2 << 8) | *--s; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 9277); | |||
R1[2 * i] = r0; | |||
r1 = uint32_mod_uint14(r1, 9277); /* needed only for invalid inputs */ | |||
R1[2 * i + 1] = r1; | |||
} | |||
R0[652] = 3 * R1[326] - 2310; | |||
for (i = 325; i >= 0; --i) { | |||
r2 = R1[i]; | |||
r2 = (r2 << 8) | *--s; | |||
uint32_divmod_uint14(&r1, &r0, r2, 1541); | |||
R0[2 * i] = 3 * r0 - 2310; | |||
r1 = uint32_mod_uint14(r1, 1541); /* needed only for invalid inputs */ | |||
R0[2 * i + 1] = 3 * r1 - 2310; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X1541_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X1541_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_STRBYTES 865 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,24 @@ | |||
#include "crypto_decode_653x3.h" | |||
#define uint8 uint8_t | |||
#define p 653 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3(void *v, const unsigned char *s) { | |||
uint8 *f = v; | |||
uint8 x; | |||
int i; | |||
for (i = 0; i < p / 4; ++i) { | |||
x = *s++; | |||
*f++ = ((uint8)(x & 3)) - 1; | |||
x >>= 2; | |||
*f++ = ((uint8)(x & 3)) - 1; | |||
x >>= 2; | |||
*f++ = ((uint8)(x & 3)) - 1; | |||
x >>= 2; | |||
*f++ = ((uint8)(x & 3)) - 1; | |||
} | |||
x = *s++; | |||
*f++ = ((uint8)(x & 3)) - 1; | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X3_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653X3_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3_STRBYTES 164 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,16 @@ | |||
#include "crypto_decode_653xint16.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16(void *v, const unsigned char *s) { | |||
uint16_t *x = v; | |||
int i; | |||
for (i = 0; i < 653; ++i) { | |||
uint16_t u0 = s[0]; | |||
uint16_t u1 = s[1]; | |||
u1 <<= 8; | |||
*x = u0 | u1; | |||
x += 1; | |||
s += 2; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT16_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16_STRBYTES 1306 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16_ITEMBYTES 2 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16_ITEMS 653 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,20 @@ | |||
#include "crypto_decode_653xint32.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32(void *v, const unsigned char *s) { | |||
uint32_t *x = v; | |||
int i; | |||
for (i = 0; i < 653; ++i) { | |||
uint32_t u0 = s[0]; | |||
uint32_t u1 = s[1]; | |||
uint32_t u2 = s[2]; | |||
uint32_t u3 = s[3]; | |||
u1 <<= 8; | |||
u2 <<= 16; | |||
u3 <<= 24; | |||
*x = u0 | u1 | u2 | u3; | |||
x += 1; | |||
s += 4; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT32_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_DECODE_653XINT32_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32_STRBYTES 2612 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32_ITEMBYTES 4 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32_ITEMS 653 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,10 @@ | |||
#include "crypto_encode_256x16.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v) { | |||
const unsigned char *T = v; | |||
int i; | |||
for (i = 0; i < 128; ++i) { | |||
s[i] = T[2 * i] + (T[2 * i + 1] << 4); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X16_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_STRBYTES 128 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,13 @@ | |||
#include "crypto_encode_256x2.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v) { | |||
const unsigned char *r = v; | |||
int i; | |||
for (i = 0; i < 32; ++i) { | |||
s[i] = 0; | |||
} | |||
for (i = 0; i < 256; ++i) { | |||
s[i >> 3] |= (r[i] & 1) << (i & 7); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X2_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_256X2_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_STRBYTES 32 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_ITEMS 256 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,127 @@ | |||
#include "crypto_encode_653x1541.h" | |||
/* auto-generated; do not edit */ | |||
#define int16 int16_t | |||
#define uint16 uint16_t | |||
#define uint32 uint32_t | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541(unsigned char *out, const void *v) { | |||
const int16 *R0 = v; | |||
/* XXX: caller could overlap R with input */ | |||
uint16 R[327]; | |||
long i; | |||
uint16 r0, r1; | |||
uint32 r2; | |||
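/* each coefficient is a multiple of 3 in -2310..2310; (x+2310)*10923 >> 15
   computes (x+2310)/3, giving a digit in 0..1540, and adjacent digits are
   then repeatedly merged in mixed radix (1541, 9277, 1314, ...) with the low
   bytes streamed to the output */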
for (i = 0; i < 326; ++i) { | |||
r0 = (((R0[2 * i] + 2310) & 16383) * 10923) >> 15; | |||
r1 = (((R0[2 * i + 1] + 2310) & 16383) * 10923) >> 15; | |||
r2 = r0 + r1 * (uint32)1541; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[326] = (((R0[652] + 2310) & 16383) * 10923) >> 15; | |||
for (i = 0; i < 163; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)9277; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[163] = R[326]; | |||
for (i = 0; i < 82; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)1314; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
for (i = 0; i < 41; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)6745; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
for (i = 0; i < 20; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)695; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[20] = R[40]; | |||
for (i = 0; i < 10; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)1887; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[10] = R[20]; | |||
for (i = 0; i < 5; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)13910; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
R[5] = R[10]; | |||
for (i = 0; i < 2; ++i) { | |||
r0 = R[2 * i]; | |||
r1 = R[2 * i + 1]; | |||
r2 = r0 + r1 * (uint32)2953; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[i] = r2; | |||
} | |||
r0 = R[4]; | |||
r1 = R[5]; | |||
r2 = r0 + r1 * (uint32)2953; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[2] = r2; | |||
r0 = R[0]; | |||
r1 = R[1]; | |||
r2 = r0 + r1 * (uint32)134; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[0] = r2; | |||
R[1] = R[2]; | |||
r0 = R[0]; | |||
r1 = R[1]; | |||
r2 = r0 + r1 * (uint32)71; | |||
*out++ = r2; | |||
r2 >>= 8; | |||
R[0] = r2; | |||
r0 = R[0]; | |||
*out++ = r0; | |||
r0 >>= 8; | |||
*out++ = r0; /*clang-analyzer-deadcode.DeadStores*/ /*r0 >>= 8;*/ | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541_STRBYTES 865 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541(unsigned char *out, const void *v); | |||
#endif |
@@ -0,0 +1,17 @@ | |||
#include "crypto_encode_653x1541.h" | |||
#include "crypto_encode_653x1541round.h" | |||
#define int16 int16_t | |||
#define p 653 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round(unsigned char *out, const void *v) { | |||
const int16 *a = v; | |||
int16 x[p]; | |||
int i; | |||
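/* (10923*a + 16384) >> 15 is round(a/3) for |a| <= (q-1)/2, so each
   coefficient is rounded to the nearest multiple of 3 before encoding */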
for (i = 0; i < p; ++i) { | |||
x[i] = 3 * ((10923 * a[i] + 16384) >> 15); | |||
} | |||
PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541(out, x); | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541ROUND_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X1541ROUND_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round_STRBYTES 865 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round(unsigned char *out, const void *v); | |||
#endif |
@@ -0,0 +1,21 @@ | |||
#include "crypto_encode_653x3.h" | |||
#define uint8 uint8_t | |||
#define p 653 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3(unsigned char *s, const void *v) { | |||
const uint8 *f = v; | |||
uint8 x; | |||
int i; | |||
for (i = 0; i < p / 4; ++i) { | |||
x = *f++ + 1; | |||
x += (*f++ + 1) << 2; | |||
x += (*f++ + 1) << 4; | |||
x += (*f++ + 1) << 6; | |||
*s++ = x; | |||
} | |||
x = *f++ + 1; | |||
*s++ = x; | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X3_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653X3_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_STRBYTES 164 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_ITEMS 653 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,13 @@ | |||
#include "crypto_encode_653xint16.h" | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16(unsigned char *s, const void *v) { | |||
const uint16_t *x = v; | |||
int i; | |||
for (i = 0; i < 653; ++i) { | |||
uint16_t u = *x++; | |||
*s++ = u; | |||
*s++ = u >> 8; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653XINT16_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_ENCODE_653XINT16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16_STRBYTES 1306 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16_ITEMBYTES 2 | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16_ITEMS 653 | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16(unsigned char *s, const void *v); | |||
#endif |
@@ -0,0 +1,86 @@ | |||
#include "crypto_sort_int32.h" | |||
#include <stdint.h> | |||
// Based on supercop-20190110/crypto_sort/int32/x86 | |||
#define int32 int32_t | |||
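/* constant-time compare-and-swap: c is computed so that its sign bit is set
   exactly when b < a, even when b - a overflows int32; the arithmetic shift
   turns that into an all-zeros or all-ones mask, and the masked xors either
   leave a,b unchanged or swap them, with no data-dependent branch */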
#define int32_MINMAX(a,b) \ | |||
do { \ | |||
int32_t ab = (b) ^ (a); \ | |||
int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ | |||
c ^= ab & (c ^ (b)); \ | |||
c >>= 31; \ | |||
c &= ab; \ | |||
(a) ^= c; \ | |||
(b) ^= c; \ | |||
} while(0) | |||
/* assume 2 <= n <= 0x40000000 */ | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_int32(int32 *array, size_t n) { | |||
size_t top, p, q, r, i, j; | |||
int32 *x = array; | |||
top = 1; | |||
while (top < n - top) { | |||
top += top; | |||
} | |||
for (p = top; p >= 1; p >>= 1) { | |||
i = 0; | |||
while (i + 2 * p <= n) { | |||
for (j = i; j < i + p; ++j) { | |||
int32_MINMAX(x[j], x[j + p]); | |||
} | |||
i += 2 * p; | |||
} | |||
for (j = i; j < n - p; ++j) { | |||
int32_MINMAX(x[j], x[j + p]); | |||
} | |||
i = 0; | |||
j = 0; | |||
for (q = top; q > p; q >>= 1) { | |||
if (j != i) { | |||
for (;;) { | |||
if (j == n - q) { | |||
goto done; | |||
} | |||
int32 a = x[j + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[j + r]); | |||
} | |||
x[j + p] = a; | |||
++j; | |||
if (j == i + p) { | |||
i += 2 * p; | |||
break; | |||
} | |||
} | |||
} | |||
while (i + p <= n - q) { | |||
for (j = i; j < i + p; ++j) { | |||
int32 a = x[j + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[j + r]); | |||
} | |||
x[j + p] = a; | |||
} | |||
i += 2 * p; | |||
} | |||
/* now i + p > n - q */ | |||
j = i; | |||
while (j < n - q) { | |||
int32 a = x[j + p]; | |||
for (r = q; r > p; r >>= 1) { | |||
int32_MINMAX(a, x[j + r]); | |||
} | |||
x[j + p] = a; | |||
++j; | |||
} | |||
done: | |||
; | |||
} | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_INT32_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_INT32_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_int32(int32_t *array, size_t n); | |||
#endif |
@@ -0,0 +1,20 @@ | |||
#include "crypto_sort_int32.h" | |||
#include "crypto_sort_uint32.h" | |||
#include <stdint.h> | |||
#define uint32 uint32_t | |||
/* can save time by vectorizing xor loops */ | |||
/* can save time by integrating xor loops with int32_sort */ | |||
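/* flipping the top bit maps unsigned order onto signed two's-complement
   order, so the unsigned sort can reuse the int32 sorter and flip the bit
   back afterwards */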
void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n) { | |||
uint32 *x = array; | |||
size_t j; | |||
for (j = 0; j < n; ++j) { | |||
x[j] ^= 0x80000000; | |||
} | |||
PQCLEAN_NTRULPR653_CLEAN_crypto_sort_int32((int32_t *)array, n); | |||
for (j = 0; j < n; ++j) { | |||
x[j] ^= 0x80000000; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_UINT32_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_SORT_UINT32_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
void PQCLEAN_NTRULPR653_CLEAN_crypto_sort_uint32(uint32_t *array, size_t n); | |||
#endif |
@@ -0,0 +1,15 @@ | |||
#include "crypto_stream_aes256ctr.h" | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]) { | |||
aes256ctx state; | |||
aes256_ctr_keyexp(&state, key); | |||
aes256_ctr(out, outlen, nonce, &state); | |||
aes256_ctx_release(&state); | |||
return 0; | |||
} |
@@ -0,0 +1,15 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_STREAM_AES256CTR_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_STREAM_AES256CTR_H | |||
#include "aes.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr( | |||
uint8_t *out, | |||
size_t outlen, | |||
const uint8_t nonce[AESCTR_NONCEBYTES], | |||
const uint8_t key[AES256_KEYBYTES]); | |||
#endif |
@@ -0,0 +1,13 @@ | |||
#include "crypto_verify_1025.h" | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025(const unsigned char *x, const unsigned char *y) { | |||
unsigned int differentbits = 0; | |||
int i; | |||
for (i = 0; i < PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025_BYTES; ++i) { | |||
differentbits |= x[i] ^ y[i]; | |||
} | |||
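/* differentbits is 0 exactly when x and y are equal, so the expression below
   returns 0 for equal inputs and -1 otherwise, without data-dependent
   branches */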
return (int) (1 & ((differentbits - 1) >> 8)) - 1; | |||
} |
@@ -0,0 +1,8 @@ | |||
#ifndef PQCLEAN_NTRULPR653_CLEAN_CRYPTO_VERIFY_1025_H | |||
#define PQCLEAN_NTRULPR653_CLEAN_CRYPTO_VERIFY_1025_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025_BYTES 1025 | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025(const unsigned char *x, const unsigned char *y); | |||
#endif |
@@ -0,0 +1,287 @@ | |||
#include "api.h" | |||
#include "crypto_sort_uint32.h" | |||
#include "crypto_stream_aes256ctr.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "sha2.h" | |||
#define int8 int8_t | |||
#define int16 int16_t | |||
#define int32 int32_t | |||
#define uint16 uint16_t | |||
#define uint32 uint32_t | |||
#define uint64 uint64_t | |||
/* ----- masks */ | |||
/* return -1 if x<0; otherwise return 0 */ | |||
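/* e.g. int16_negative_mask(-5) == -1 and int16_negative_mask(7) == 0;
   the detour through uint16 keeps the shift well defined for negative x */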
static int int16_negative_mask(int16 x) { | |||
uint16 u = x; | |||
u >>= 15; | |||
return -(int) u; | |||
/* alternative with gcc -fwrapv: */ | |||
/* x>>15 compiles to CPU's arithmetic right shift */ | |||
} | |||
/* ----- arithmetic mod 3 */ | |||
typedef int8 small; | |||
/* F3 is always represented as -1,0,1 */ | |||
/* ----- arithmetic mod q */ | |||
#define q12 ((q-1)/2) | |||
typedef int16 Fq; | |||
/* works for -14000000 < x < 14000000 if q in 4591, 4621, 5167 */ | |||
/* assumes twos complement; use, e.g., gcc -fwrapv */ | |||
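/* two-step Barrett-style reduction: q18 and q27 are fixed-point
   approximations of 1/q, so the two multiply-and-subtract steps bring x into
   the centered range -(q-1)/2..(q-1)/2 for the stated input range */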
static Fq Fq_freeze(int32 x) { | |||
x -= q * ((q18 * x) >> 18); | |||
x -= q * ((q27 * x + 67108864) >> 27); | |||
return x; | |||
} | |||
/* works for all uint32 x */ | |||
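/* Barrett reduction for arbitrary 32-bit inputs: two steps with
   q31 = floor(2^31/q) bring x close to the target range, the subtraction of
   q may wrap around, and the final masked add (driven by the sign bit of the
   wrapped value) restores a result in 0..q-1 */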
static Fq Fq_bigfreeze(uint32 x) { | |||
x -= q * ((x * (uint64)q31) >> 31); | |||
x -= q * ((x * (uint64)q31) >> 31); | |||
x -= q; | |||
x += (-(x >> 31)) & (uint32)q; | |||
return x; | |||
} | |||
/* ----- Top and Right */ | |||
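/* Top compresses an Fq value into 4 bits via an affine map by tau1,tau0
   followed by a rounded shift; Right maps a 4-bit value back into Fq via
   tau3,tau2; the tau constants are chosen so that Right(Top(C)) stays close
   enough to C for the bit recovery in crypto_kem_dec */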
static int8 Top(Fq C) { | |||
return (tau1 * (int32)(C + tau0) + 16384) >> 15; | |||
} | |||
static Fq Right(int8 T) { | |||
return Fq_freeze(tau3 * (int32)T - tau2); | |||
} | |||
/* ----- polynomials mod q */ | |||
/* h = h*g in the ring Rq */ | |||
static void Rq_mult_small(Fq *h, const small *g) { | |||
crypto_encode_pxint16((unsigned char *) h, h); | |||
crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); | |||
crypto_decode_pxint16(h, (const unsigned char *) h); | |||
} | |||
/* ----- sorting to generate short polynomial */ | |||
static void Short_fromlist(small *out, const uint32 *in) { | |||
uint32 L[ppadsort]; | |||
int i; | |||
for (i = 0; i < w; ++i) { | |||
L[i] = in[i] & (uint32) - 2; | |||
} | |||
for (i = w; i < p; ++i) { | |||
L[i] = (in[i] & (uint32) - 3) | 1; | |||
} | |||
for (i = p; i < ppadsort; ++i) { | |||
L[i] = 0xffffffff; | |||
} | |||
PQCLEAN_NTRULPR653_CLEAN_crypto_sort_uint32(L, ppadsort); | |||
for (i = 0; i < p; ++i) { | |||
out[i] = (L[i] & 3) - 1; | |||
} | |||
} | |||
/* ----- underlying hash function */ | |||
#define Hash_bytes 32 | |||
static void Hash(unsigned char *out, const unsigned char *in, int inlen) { | |||
unsigned char h[64]; | |||
int i; | |||
sha512(h, in, inlen); | |||
for (i = 0; i < 32; ++i) { | |||
out[i] = h[i]; | |||
} | |||
} | |||
/* ----- higher-level randomness */ | |||
static void Short_random(small *out) { | |||
uint32 L[p]; | |||
randombytes((unsigned char *) L, sizeof L); | |||
crypto_decode_pxint32(L, (unsigned char *) L); | |||
Short_fromlist(out, L); | |||
} | |||
/* ----- Inputs, Generator */ | |||
typedef int8 Inputs[I]; /* passed by reference */ | |||
static const unsigned char aes_nonce[16] = {0}; | |||
/* G = Generator(pk) */ | |||
static void Generator(Fq *G, const unsigned char *pk) { | |||
uint32 L[p]; | |||
int i; | |||
PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, pk); | |||
crypto_decode_pxint32(L, (unsigned char *) L); | |||
for (i = 0; i < p; ++i) { | |||
G[i] = Fq_bigfreeze(L[i]) - q12; | |||
} | |||
} | |||
/* ----- NTRU LPRime */ | |||
#define Seeds_bytes 32 | |||
#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) | |||
#define SecretKeys_bytes Small_bytes | |||
#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) | |||
#define Confirm_bytes 32 | |||
/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ | |||
static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { | |||
small b[p]; | |||
int i; | |||
Inputs_encode(r_enc + 1, r); | |||
{ | |||
unsigned char h[Hash_bytes]; | |||
uint32 L[p]; | |||
{ | |||
unsigned char s[1 + Inputs_bytes]; | |||
Inputs_encode(s + 1, r); | |||
s[0] = 5; | |||
Hash(h, s, sizeof s); | |||
} | |||
PQCLEAN_NTRULPR653_CLEAN_crypto_stream_aes256ctr((unsigned char *) L, 4 * p, aes_nonce, h); | |||
crypto_decode_pxint32(L, (unsigned char *) L); | |||
Short_fromlist(b, L); | |||
} | |||
{ | |||
Fq bG[p]; | |||
Generator(bG, pk); | |||
Rq_mult_small(bG, b); | |||
Round_and_encode(c, bG); | |||
c += Rounded_bytes; | |||
} | |||
{ | |||
Fq bA[p]; | |||
int8 T[I]; | |||
Rounded_decode(bA, pk + Seeds_bytes); | |||
Rq_mult_small(bA, b); | |||
for (i = 0; i < I; ++i) { | |||
T[i] = Top(Fq_freeze(bA[i] + r[i] * q12)); | |||
} | |||
Top_encode(c, T); | |||
c += Top_bytes; | |||
} | |||
{ | |||
unsigned char x[1 + Inputs_bytes + Hash_bytes]; | |||
for (i = 0; i < Inputs_bytes; ++i) { | |||
x[1 + i] = r_enc[1 + i]; | |||
} | |||
for (i = 0; i < Hash_bytes; ++i) { | |||
x[1 + Inputs_bytes + i] = cache[i]; | |||
} | |||
x[0] = 2; | |||
Hash(c, x, sizeof x); | |||
} | |||
} | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
Fq aG[p]; | |||
int i; | |||
randombytes(pk, Seeds_bytes); | |||
Generator(aG, pk); | |||
{ | |||
small a[p]; | |||
Short_random(a); | |||
Rq_mult_small(aG, a); | |||
Small_encode(sk, a); | |||
} | |||
Round_and_encode(pk + Seeds_bytes, aG); | |||
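/* secret-key layout: Small_encode(a), a copy of pk, Inputs_bytes of random
   rho (used for implicit rejection), and the cached hash Hash4(pk); the byte
   just before the pk copy is temporarily overwritten with the prefix 4 so
   that (4 || pk) can be hashed in place */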
{ | |||
unsigned char sksave = sk[SecretKeys_bytes - 1]; | |||
for (i = 0; i < PublicKeys_bytes; ++i) { | |||
sk[SecretKeys_bytes + i] = pk[i]; | |||
} | |||
sk[SecretKeys_bytes - 1] = 4; | |||
Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Inputs_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); | |||
sk[SecretKeys_bytes - 1] = sksave; | |||
randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Inputs_bytes); | |||
} | |||
return 0; | |||
} | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { | |||
int i; | |||
unsigned char cache[Hash_bytes]; | |||
{ | |||
unsigned char y[1 + PublicKeys_bytes]; | |||
for (i = 0; i < PublicKeys_bytes; ++i) { | |||
y[1 + i] = pk[i]; | |||
} | |||
y[0] = 4; | |||
Hash(cache, y, sizeof y); | |||
} | |||
Inputs r; | |||
{ | |||
unsigned char s[Inputs_bytes]; | |||
randombytes(s, sizeof s); | |||
Inputs_decode(r, s); | |||
} | |||
{ | |||
unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; | |||
Hide(c, x, r, pk, cache); | |||
for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { | |||
x[1 + Inputs_bytes + i] = c[i]; | |||
} | |||
x[0] = 1; | |||
Hash(k, x, sizeof x); | |||
} | |||
return 0; | |||
} | |||
int PQCLEAN_NTRULPR653_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { | |||
const unsigned char *pk = sk + SecretKeys_bytes; | |||
const unsigned char *rho = pk + PublicKeys_bytes; | |||
const unsigned char *cache = rho + Inputs_bytes; | |||
Inputs r; | |||
int i; | |||
{ | |||
Fq aB[p]; | |||
Rounded_decode(aB, c); | |||
{ | |||
small a[p]; | |||
Small_decode(a, sk); | |||
Rq_mult_small(aB, a); | |||
} | |||
{ | |||
int8 T[I]; | |||
Top_decode(T, c + Rounded_bytes); | |||
for (i = 0; i < I; ++i) { | |||
r[i] = -int16_negative_mask(Fq_freeze(Right(T[i]) - aB[i] + 4 * w + 1)); | |||
} | |||
} | |||
} | |||
{ | |||
unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; | |||
int mask; | |||
unsigned char x[1 + Inputs_bytes + Ciphertexts_bytes + Confirm_bytes]; | |||
Hide(cnew, x, r, pk, cache); | |||
mask = crypto_verify_clen(c, cnew); | |||
for (i = 0; i < Inputs_bytes; ++i) { | |||
x[1 + i] ^= mask & (x[1 + i] ^ rho[i]); | |||
} | |||
for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { | |||
x[1 + Inputs_bytes + i] = c[i]; | |||
} | |||
x[0] = 1 + mask; | |||
Hash(k, x, sizeof x); | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,63 @@ | |||
#ifndef params_H | |||
#define params_H | |||
#include "crypto_core_multsntrup653.h" | |||
#include "crypto_decode_256x16.h" | |||
#include "crypto_decode_256x2.h" | |||
#include "crypto_decode_653x1541.h" | |||
#include "crypto_decode_653x3.h" | |||
#include "crypto_decode_653xint16.h" | |||
#include "crypto_decode_653xint32.h" | |||
#include "crypto_encode_256x16.h" | |||
#include "crypto_encode_256x2.h" | |||
#include "crypto_encode_653x1541.h" | |||
#include "crypto_encode_653x1541round.h" | |||
#include "crypto_encode_653x3.h" | |||
#include "crypto_encode_653xint16.h" | |||
#include "crypto_verify_1025.h" | |||
#define p 653 | |||
#define q 4621 | |||
#define w 252 | |||
#define tau0 2175 | |||
#define tau1 113 | |||
#define tau2 2031 | |||
#define tau3 290 | |||
#define I 256 | |||
#define ppadsort 653 | |||
#define q18 57 /* round(2^18/q) */ | |||
#define q27 29045 /* round(2^27/q) */ | |||
#define q31 464722 /* floor(2^31/q) */ | |||
#define crypto_verify_clen PQCLEAN_NTRULPR653_CLEAN_crypto_verify_1025 | |||
#define Rounded_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541_STRBYTES | |||
#define Rounded_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x1541 | |||
#define Round_and_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x1541round | |||
#define Small_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3_STRBYTES | |||
#define Small_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653x3 | |||
#define Small_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653x3 | |||
#define Top_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16_STRBYTES | |||
#define Top_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x16 | |||
#define Top_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x16 | |||
#define Inputs_bytes PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2_STRBYTES | |||
#define Inputs_encode PQCLEAN_NTRULPR653_CLEAN_crypto_encode_256x2 | |||
#define Inputs_decode PQCLEAN_NTRULPR653_CLEAN_crypto_decode_256x2 | |||
#define crypto_decode_pxint32 PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint32 | |||
#define crypto_decode_pxint16 PQCLEAN_NTRULPR653_CLEAN_crypto_decode_653xint16 | |||
#define crypto_encode_pxint16 PQCLEAN_NTRULPR653_CLEAN_crypto_encode_653xint16 | |||
#define crypto_core_mult PQCLEAN_NTRULPR653_CLEAN_crypto_core_multsntrup653 | |||
#endif |
@@ -0,0 +1,26 @@ | |||
name: ntrulpr761 | |||
type: kem | |||
claimed-nist-level: 3 | |||
claimed-security: IND-CCA2 | |||
length-public-key: 1039 | |||
length-secret-key: 1294 | |||
length-ciphertext: 1167 | |||
length-shared-secret: 32 | |||
nistkat-sha256: 212f68484864e927c674a656ea44ea0f47c048d0dd3518b102c98a9eacd16a72 | |||
principal-submitters: | |||
- Daniel J. Bernstein | |||
- Chitchanok Chuengsatiansup | |||
- Tanja Lange | |||
- Christine van Vredendaal | |||
implementations: | |||
- name: clean | |||
version: supercop-20200826 | |||
- name: avx2 | |||
version: supercop-20200826 | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- avx2 |
@@ -0,0 +1 @@ | |||
Public Domain |
@@ -0,0 +1,22 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libntrulpr761_avx2.a | |||
HEADERS=api.h crypto_core_multsntrup761.h crypto_core_multsntrup761_ntt.h crypto_decode_256x16.h crypto_decode_256x2.h crypto_decode_761x1531.h crypto_decode_761x3.h crypto_decode_761xint16.h crypto_decode_761xint32.h crypto_encode_256x16.h crypto_encode_256x2.h crypto_encode_761x1531.h crypto_encode_761x1531round.h crypto_encode_761x3.h crypto_encode_761xint16.h crypto_sort_int32.h crypto_sort_uint32.h crypto_stream_aes256ctr.h crypto_verify_1167.h params.h | |||
OBJECTS=crypto_core_multsntrup761.o crypto_core_multsntrup761_ntt.o crypto_decode_256x16.o crypto_decode_256x2.o crypto_decode_761x1531.o crypto_decode_761x3.o crypto_decode_761xint16.o crypto_decode_761xint32.o crypto_encode_256x16.o crypto_encode_256x2.o crypto_encode_761x1531.o crypto_encode_761x1531round.o crypto_encode_761x3.o crypto_encode_761xint16.o crypto_sort_int32.o crypto_sort_uint32.o crypto_stream_aes256ctr.o crypto_verify_1167.o kem.o | |||
CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.s $(HEADERS) | |||
$(AS) -o $@ $< | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
clean: | |||
$(RM) $(OBJECTS) | |||
$(RM) $(LIB) |
@@ -0,0 +1,16 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_API_H | |||
#define PQCLEAN_NTRULPR761_AVX2_API_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_ALGNAME "ntrulpr761" | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_SECRETKEYBYTES 1294 | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_PUBLICKEYBYTES 1039 | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_CIPHERTEXTBYTES 1167 | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_BYTES 32 | |||
int PQCLEAN_NTRULPR761_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
int PQCLEAN_NTRULPR761_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); | |||
int PQCLEAN_NTRULPR761_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); | |||
#endif |
@@ -0,0 +1,314 @@ | |||
#include "crypto_core_multsntrup761.h" | |||
#include "crypto_core_multsntrup761_ntt.h" | |||
#include "crypto_decode_761xint16.h" | |||
#include "crypto_encode_761xint16.h" | |||
#include <immintrin.h> | |||
typedef int8_t int8; | |||
typedef int16_t int16; | |||
#define int16x16 __m256i | |||
#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) | |||
#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) | |||
#define const_x16 _mm256_set1_epi16 | |||
#define add_x16 _mm256_add_epi16 | |||
#define sub_x16 _mm256_sub_epi16 | |||
#define mullo_x16 _mm256_mullo_epi16 | |||
#define mulhi_x16 _mm256_mulhi_epi16 | |||
#define mulhrs_x16 _mm256_mulhrs_epi16 | |||
#define signmask_x16(x) _mm256_srai_epi16((x),15) | |||
typedef union { | |||
int16 v[3][512]; | |||
int16x16 _dummy; | |||
} vec3x512; | |||
typedef union { | |||
int16 v[768]; | |||
int16x16 _dummy; | |||
} vec768; | |||
typedef union { | |||
int16 v[3 * 512]; | |||
int16x16 _dummy; | |||
} vec1536; | |||
static inline int16x16 squeeze_4591_x16(int16x16 x) { | |||
return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4591))); | |||
} | |||
static inline int16x16 squeeze_7681_x16(int16x16 x) { | |||
return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); | |||
} | |||
static inline int16x16 squeeze_10753_x16(int16x16 x) { | |||
return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); | |||
} | |||
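/* the mulmod_*_x16 routines are signed Montgomery multiplications: yqinv is
   y times the inverse of the modulus mod 2^16, and the result is congruent
   to x*y/2^16 modulo the respective modulus; the extra 2^-16 factors are
   compensated by the scaling constants used elsewhere in the NTT and in the
   final recombination */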
static inline int16x16 mulmod_4591_x16(int16x16 x, int16x16 y) { | |||
int16x16 yqinv = mullo_x16(y, const_x16(15631)); /* XXX: precompute */ | |||
int16x16 b = mulhi_x16(x, y); | |||
int16x16 d = mullo_x16(x, yqinv); | |||
int16x16 e = mulhi_x16(d, const_x16(4591)); | |||
return sub_x16(b, e); | |||
} | |||
static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { | |||
int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ | |||
int16x16 b = mulhi_x16(x, y); | |||
int16x16 d = mullo_x16(x, yqinv); | |||
int16x16 e = mulhi_x16(d, const_x16(7681)); | |||
return sub_x16(b, e); | |||
} | |||
static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { | |||
int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ | |||
int16x16 b = mulhi_x16(x, y); | |||
int16x16 d = mullo_x16(x, yqinv); | |||
int16x16 e = mulhi_x16(d, const_x16(10753)); | |||
return sub_x16(b, e); | |||
} | |||
#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) | |||
#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) | |||
#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) | |||
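/* good() spreads the 768 input coefficients across three length-512 arrays
   (Good's trick for 1536 = 3*512), so the large product can be computed with
   three 512-point NTTs per prime; ungood() inverts the reordering; the masks
   above select every third 16-bit lane */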
static void good(int16 fpad[3][512], const int16 f[768]) { | |||
int j; | |||
int16x16 f0, f1; | |||
j = 0; | |||
for (;;) { | |||
f0 = load_x16(f + j); | |||
f1 = load_x16(f + 512 + j); | |||
store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); | |||
store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); | |||
store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); | |||
j += 16; | |||
if (j == 256) { | |||
break; | |||
} | |||
f0 = load_x16(f + j); | |||
f1 = load_x16(f + 512 + j); | |||
store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); | |||
store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); | |||
store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); | |||
j += 16; | |||
f0 = load_x16(f + j); | |||
f1 = load_x16(f + 512 + j); | |||
store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); | |||
store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); | |||
store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); | |||
j += 16; | |||
} | |||
for (;;) { | |||
f0 = load_x16(f + j); | |||
store_x16(&fpad[0][j], f0 & mask2); | |||
store_x16(&fpad[1][j], f0 & mask0); | |||
store_x16(&fpad[2][j], f0 & mask1); | |||
j += 16; | |||
if (j == 512) { | |||
break; | |||
} | |||
f0 = load_x16(f + j); | |||
store_x16(&fpad[0][j], f0 & mask1); | |||
store_x16(&fpad[1][j], f0 & mask2); | |||
store_x16(&fpad[2][j], f0 & mask0); | |||
j += 16; | |||
f0 = load_x16(f + j); | |||
store_x16(&fpad[0][j], f0 & mask0); | |||
store_x16(&fpad[1][j], f0 & mask1); | |||
store_x16(&fpad[2][j], f0 & mask2); | |||
j += 16; | |||
} | |||
} | |||
static void ungood(int16 f[1536], const int16 fpad[3][512]) { | |||
int j; | |||
int16x16 f0, f1, f2, g0, g1, g2; | |||
j = 0; | |||
for (;;) { | |||
f0 = load_x16(&fpad[0][j]); | |||
f1 = load_x16(&fpad[1][j]); | |||
f2 = load_x16(&fpad[2][j]); | |||
g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); | |||
g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); | |||
g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ | |||
store_x16(f + 0 + j, g0); | |||
store_x16(f + 512 + j, g1); | |||
store_x16(f + 1024 + j, g2); | |||
j += 16; | |||
f0 = load_x16(&fpad[0][j]); | |||
f1 = load_x16(&fpad[1][j]); | |||
f2 = load_x16(&fpad[2][j]); | |||
g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); | |||
g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); | |||
g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ | |||
store_x16(f + 0 + j, g0); | |||
store_x16(f + 512 + j, g1); | |||
store_x16(f + 1024 + j, g2); | |||
j += 16; | |||
if (j == 512) { | |||
break; | |||
} | |||
f0 = load_x16(&fpad[0][j]); | |||
f1 = load_x16(&fpad[1][j]); | |||
f2 = load_x16(&fpad[2][j]); | |||
g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); | |||
g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); | |||
g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ | |||
store_x16(f + 0 + j, g0); | |||
store_x16(f + 512 + j, g1); | |||
store_x16(f + 1024 + j, g2); | |||
j += 16; | |||
} | |||
} | |||
static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { | |||
vec3x512 x1, x2; | |||
vec1536 x3, x4; | |||
#define fpad (x1.v) | |||
#define gpad (x2.v) | |||
#define hpad fpad | |||
#define h_7681 (x3.v) | |||
#define h_10753 (x4.v) | |||
int i; | |||
good(fpad, f); | |||
PQCLEAN_NTRULPR761_AVX2_ntt512_7681(fpad[0], 3); | |||
good(gpad, g); | |||
PQCLEAN_NTRULPR761_AVX2_ntt512_7681(gpad[0], 3); | |||
for (i = 0; i < 512; i += 16) { | |||
int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); | |||
int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); | |||
int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); | |||
int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); | |||
int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); | |||
int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); | |||
int16x16 d0 = mulmod_7681_x16(f0, g0); | |||
int16x16 d1 = mulmod_7681_x16(f1, g1); | |||
int16x16 d2 = mulmod_7681_x16(f2, g2); | |||
int16x16 dsum = add_x16(add_x16(d0, d1), d2); | |||
int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); | |||
int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); | |||
int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); | |||
store_x16(&hpad[0][i], squeeze_7681_x16(h0)); | |||
store_x16(&hpad[1][i], squeeze_7681_x16(h1)); | |||
store_x16(&hpad[2][i], squeeze_7681_x16(h2)); | |||
} | |||
PQCLEAN_NTRULPR761_AVX2_invntt512_7681(hpad[0], 3); | |||
ungood(h_7681, (const int16(*)[512]) hpad); | |||
good(fpad, f); | |||
PQCLEAN_NTRULPR761_AVX2_ntt512_10753(fpad[0], 3); | |||
good(gpad, g); | |||
PQCLEAN_NTRULPR761_AVX2_ntt512_10753(gpad[0], 3); | |||
for (i = 0; i < 512; i += 16) { | |||
int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); | |||
int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); | |||
int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); | |||
int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); | |||
int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); | |||
int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); | |||
int16x16 d0 = mulmod_10753_x16(f0, g0); | |||
int16x16 d1 = mulmod_10753_x16(f1, g1); | |||
int16x16 d2 = mulmod_10753_x16(f2, g2); | |||
int16x16 dsum = add_x16(add_x16(d0, d1), d2); | |||
int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2))); | |||
int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1))); | |||
int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0))); | |||
store_x16(&hpad[0][i], squeeze_10753_x16(h0)); | |||
store_x16(&hpad[1][i], squeeze_10753_x16(h1)); | |||
store_x16(&hpad[2][i], squeeze_10753_x16(h2)); | |||
} | |||
PQCLEAN_NTRULPR761_AVX2_invntt512_10753(hpad[0], 3); | |||
ungood(h_10753, (const int16(*)[512]) hpad); | |||
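/* CRT recombination: each product coefficient is recovered from its residues
   mod 7681 and mod 10753 (their product comfortably covers the true
   coefficient range) and is reduced mod q = 4591 on the fly; 1268, 956,
   -2539 and -710 are the precomputed scaling constants for this step */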
for (i = 0; i < 1536; i += 16) { | |||
int16x16 u1 = load_x16(&h_10753[i]); | |||
int16x16 u2 = load_x16(&h_7681[i]); | |||
int16x16 t; | |||
u1 = mulmod_10753_x16(u1, const_x16(1268)); | |||
u2 = mulmod_7681_x16(u2, const_x16(956)); | |||
t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); | |||
t = add_x16(u1, mulmod_4591_x16(t, const_x16(-710))); | |||
store_x16(&h[i], t); | |||
} | |||
} | |||
#define crypto_decode_pxint16 PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16 | |||
#define crypto_encode_pxint16 PQCLEAN_NTRULPR761_AVX2_crypto_encode_761xint16 | |||
#define p 761 | |||
#define q 4591 | |||
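/* conditional reduction to the centered range: add q if the input is
   negative, then subtract q unless the result is below (q+1)/2, leaving a
   representative in -(q-1)/2..(q-1)/2 */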
static inline int16x16 freeze_4591_x16(int16x16 x) { | |||
int16x16 mask, xq; | |||
x = add_x16(x, const_x16(q)&signmask_x16(x)); | |||
mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); | |||
xq = sub_x16(x, const_x16(q)); | |||
x = _mm256_blendv_epi8(xq, x, mask); | |||
return x; | |||
} | |||
int PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { | |||
vec768 x1, x2; | |||
vec1536 x3; | |||
#define f (x1.v) | |||
#define g (x2.v) | |||
#define fg (x3.v) | |||
#define h f | |||
int i; | |||
int16x16 x; | |||
x = const_x16(0); | |||
for (i = p & ~15; i < 768; i += 16) { | |||
store_x16(&f[i], x); | |||
} | |||
for (i = p & ~15; i < 768; i += 16) { | |||
store_x16(&g[i], x); | |||
} | |||
crypto_decode_pxint16(f, inbytes); | |||
for (i = 0; i < 768; i += 16) { | |||
x = load_x16(&f[i]); | |||
x = freeze_4591_x16(squeeze_4591_x16(x)); | |||
store_x16(&f[i], x); | |||
} | |||
for (i = 0; i < p; ++i) { | |||
int8 gi = kbytes[i]; | |||
int8 gi0 = gi & 1; | |||
g[i] = gi0 - (gi & (gi0 << 1)); | |||
} | |||
mult768(fg, f, g); | |||
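/* reduce the 1536-coefficient product modulo x^p - x - 1: coefficient i >= p
   folds into positions i-p and i-p+1, so the vector loop below adds fg[i+p]
   and fg[i+p-1] to fg[i]; the fg[0] -= fg[p-1] adjustment cancels the term
   that the uniform loop would otherwise add spuriously at i = 0 */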
fg[0] -= fg[p - 1]; | |||
for (i = 0; i < 768; i += 16) { | |||
int16x16 fgi = load_x16(&fg[i]); | |||
int16x16 fgip = load_x16(&fg[i + p]); | |||
int16x16 fgip1 = load_x16(&fg[i + p - 1]); | |||
x = add_x16(fgi, add_x16(fgip, fgip1)); | |||
x = freeze_4591_x16(squeeze_4591_x16(x)); | |||
store_x16(&h[i], x); | |||
} | |||
crypto_encode_pxint16(outbytes, h); | |||
return 0; | |||
} |
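/*
 * Illustrative reference, not part of the upstream file: multiplication in
 * Z_4591[x]/(x^761 - x - 1) the slow way.  It makes the wrap-around used above
 * explicit: after a plain product fg of degree <= 1520, x^(761+j) == x^(j+1) + x^j,
 * which is exactly h[i] = fg[i] + fg[i+761] + fg[i+760] together with the
 * correction fg[0] -= fg[760].  The vectorized code stores the centered
 * representative from freeze_4591_x16; this sketch returns the canonical one.
 */
static inline void mult761_reference_sketch(int16 *h, const int16 *f, const int8 *g) {
    int32_t fg[2 * 761 - 1] = {0};
    int i, j;
    for (i = 0; i < 761; ++i) {
        for (j = 0; j < 761; ++j) {
            fg[i + j] = (fg[i + j] + f[i] * g[j]) % 4591;
        }
    }
    for (i = 2 * 761 - 2; i >= 761; --i) {   /* fold x^(761+j) down onto x^(j+1) and x^j */
        fg[i - 761] = (fg[i - 761] + fg[i]) % 4591;
        fg[i - 760] = (fg[i - 760] + fg[i]) % 4591;
    }
    for (i = 0; i < 761; ++i) {
        int32_t t = fg[i] % 4591;
        if (t < 0) {
            t += 4591;                       /* canonical residue in [0, 4590] */
        }
        h[i] = (int16) t;
    }
}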
@@ -0,0 +1,11 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_OUTPUTBYTES 1522 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_INPUTBYTES 1522 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_KEYBYTES 761 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761_CONSTBYTES 0 | |||
int PQCLEAN_NTRULPR761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); | |||
#endif |
@@ -0,0 +1,927 @@ | |||
#include "crypto_core_multsntrup761.h" | |||
#include "crypto_core_multsntrup761_ntt.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/* auto-generated; do not edit */ | |||
typedef int8_t int8; | |||
typedef int16_t int16; | |||
#define zeta(n,i) (((__m256i *) zeta_##n)[(i)]) | |||
#define zeta_x4(n,i) (((__m256i *) zeta_x4_##n)[(i)]) | |||
#define zeta_qinv(n,i) (((__m256i *) qinvzeta_##n)[(i)]) | |||
#define zeta_x4_qinv(n,i) (((__m256i *) qinvzeta_x4_##n)[(i)]) | |||
#define zetainv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_##n+(n)/2+1-16*((i)+1))) | |||
#define zetainv_x4(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) zeta_x4_##n+2*(n)+4-16*((i)+1))) | |||
#define zetainv_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_##n+(n)/2+1-16*((i)+1))) | |||
#define zetainv_x4_qinv(n,i) _mm256_loadu_reverse16((__m256i *) ((int16 *) qinvzeta_x4_##n+2*(n)+4-16*((i)+1))) | |||
typedef union { | |||
int16 data[93 * 16]; | |||
__m256i _dummy; | |||
} vec1488; | |||
static const vec1488 qdata_7681 = { .data = { | |||
#define q_x16 (qdata[0]) | |||
7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, | |||
#define qrecip_x16 (qdata[1]) | |||
17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, 17474, | |||
#define qshift_x16 (qdata[2]) | |||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, | |||
#define zeta4_x16 (qdata[3]) | |||
-3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, | |||
#define zeta4_x16_qinv (qdata[4]) | |||
-28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, | |||
#define zeta8_x16 (qdata[5]) | |||
-3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, | |||
#define zeta8_x16_qinv (qdata[6]) | |||
-16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, | |||
#define zetainv8_x16 (qdata[7]) | |||
-3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, | |||
#define zetainv8_x16_qinv (qdata[8]) | |||
-10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, | |||
#define zeta_x4_16 (qdata+9) | |||
-3593, -3593, -3593, -3593, -2194, -2194, -2194, -2194, -3625, -3625, -3625, -3625, 1100, 1100, 1100, 1100, | |||
-3777, -3777, -3777, -3777, -2456, -2456, -2456, -2456, 3182, 3182, 3182, 3182, 3696, 3696, 3696, 3696, | |||
3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_x4_16 (qdata+12) | |||
-9, -9, -9, -9, 4974, 4974, 4974, 4974, -16425, -16425, -16425, -16425, 7244, 7244, 7244, 7244, | |||
-28865, -28865, -28865, -28865, -14744, -14744, -14744, -14744, 10350, 10350, 10350, 10350, -4496, -4496, -4496, -4496, | |||
9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_x4_32 (qdata+15) | |||
-3593, -3593, -3593, -3593, 1414, 1414, 1414, 1414, -2194, -2194, -2194, -2194, -2495, -2495, -2495, -2495, | |||
-3625, -3625, -3625, -3625, 2876, 2876, 2876, 2876, 1100, 1100, 1100, 1100, -2250, -2250, -2250, -2250, | |||
-3777, -3777, -3777, -3777, -1701, -1701, -1701, -1701, -2456, -2456, -2456, -2456, 834, 834, 834, 834, | |||
3182, 3182, 3182, 3182, -2319, -2319, -2319, -2319, 3696, 3696, 3696, 3696, 121, 121, 121, 121, | |||
3593, 3593, 3593, 3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_x4_32 (qdata+20) | |||
-9, -9, -9, -9, 20870, 20870, 20870, 20870, 4974, 4974, 4974, 4974, 22593, 22593, 22593, 22593, | |||
-16425, -16425, -16425, -16425, 828, 828, 828, 828, 7244, 7244, 7244, 7244, -23754, -23754, -23754, -23754, | |||
-28865, -28865, -28865, -28865, 20315, 20315, 20315, 20315, -14744, -14744, -14744, -14744, 18242, 18242, 18242, 18242, | |||
10350, 10350, 10350, 10350, -18191, -18191, -18191, -18191, -4496, -4496, -4496, -4496, -11655, -11655, -11655, -11655, | |||
9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_64 (qdata+25) | |||
-3593, -617, 1414, 3706, -2194, -1296, -2495, -2237, -3625, 2830, 2876, -1599, 1100, 1525, -2250, 2816, | |||
-3777, 1921, -1701, 2006, -2456, 1483, 834, -1986, 3182, 3364, -2319, -1993, 3696, -2557, 121, 2088, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_64 (qdata+28) | |||
-9, 19351, 20870, -15750, 4974, -9488, 22593, 7491, -16425, 26382, 828, 23489, 7244, 20469, -23754, 2816, | |||
-28865, -5759, 20315, -3114, -14744, 15307, 18242, -19394, 10350, -10972, -18191, -31177, -4496, -25597, -11655, 22568, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_128 (qdata+31) | |||
-3593, -2804, -617, -396, 1414, -549, 3706, 810, -2194, -1321, -1296, 438, -2495, -2535, -2237, -3689, | |||
-3625, 2043, 2830, -1881, 2876, 3153, -1599, 7, 1100, -514, 1525, -1760, -2250, -2440, 2816, 3600, | |||
-3777, 103, 1921, -3174, -1701, 1535, 2006, -1887, -2456, 1399, 1483, -679, 834, 3772, -1986, 1738, | |||
3182, -1431, 3364, -3555, -2319, -2310, -1993, 638, 3696, -2956, -2557, -1305, 121, 2555, 2088, -3266, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_128 (qdata+36) | |||
-9, -29428, 19351, 26228, 20870, 21467, -15750, 5930, 4974, -14121, -9488, -21066, 22593, 2073, 7491, 16279, | |||
-16425, -25093, 26382, 26279, 828, -29103, 23489, 11783, 7244, 14846, 20469, 14624, -23754, -6536, 2816, 11792, | |||
-28865, -4505, -5759, -6246, 20315, 9215, -3114, 6817, -14744, 4983, 15307, -28839, 18242, 1724, -19394, 23242, | |||
10350, -21399, -10972, -29667, -18191, -21766, -31177, 15998, -4496, 23668, -25597, -5913, -11655, -24581, 22568, -20674, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_256 (qdata+41) | |||
-3593, 2665, -2804, -2572, -617, 727, -396, 3417, 1414, 2579, -549, 373, 3706, 3750, 810, -1054, | |||
-2194, -2133, -1321, 1681, -1296, -1386, 438, -2732, -2495, 1919, -2535, -2391, -2237, 2835, -3689, 2, | |||
-3625, -783, 2043, 3145, 2830, 1533, -1881, 2789, 2876, 2649, 3153, 3692, -1599, -1390, 7, -1166, | |||
1100, 3310, -514, 2224, 1525, -2743, -1760, 2385, -2250, -486, -2440, -1756, 2816, -3816, 3600, -3831, | |||
-3777, -1799, 103, 1497, 1921, 1521, -3174, -194, -1701, -859, 1535, 2175, 2006, -2762, -1887, -1698, | |||
-2456, -3480, 1399, 2883, 1483, -3428, -679, -2113, 834, 1532, 3772, -660, -1986, -2764, 1738, -915, | |||
3182, 1056, -1431, 1350, 3364, 1464, -3555, 2919, -2319, -2160, -2310, 730, -1993, -1598, 638, 3456, | |||
3696, -1168, -2956, -3588, -2557, -921, -1305, 3405, 121, -404, 2555, -3135, 2088, 2233, -3266, -2426, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_256 (qdata+50) | |||
-9, -17303, -29428, 24052, 19351, -12073, 26228, -24743, 20870, -12269, 21467, 19317, -15750, -25946, 5930, 32738, | |||
4974, -4693, -14121, 2193, -9488, 26262, -21066, 7508, 22593, 9599, 2073, 10409, 7491, -12013, 16279, -15358, | |||
-16425, -16655, -25093, 32329, 26382, 24573, 26279, 13541, 828, -25511, -29103, 26220, 23489, -8558, 11783, -24718, | |||
7244, 10478, 14846, 26800, 20469, 26441, 14624, -29871, -23754, -3558, -6536, -16092, 2816, 8472, 11792, -7415, | |||
-28865, -13575, -4505, -26663, -5759, -14351, -6246, -17602, 20315, -22875, 9215, 9855, -3114, -24266, 6817, -2722, | |||
-14744, -15768, 4983, 12611, 15307, -21860, -28839, -27201, 18242, 32252, 1724, 21868, -19394, -8908, 23242, 13933, | |||
10350, 17440, -21399, -11962, -10972, 30136, -29667, -1689, -18191, 6032, -21766, 30426, -31177, 15810, 15998, 3456, | |||
-4496, -9360, 23668, 27132, -25597, -5529, -5913, 1869, -11655, 22124, -24581, 21953, 22568, 23225, -20674, 17030, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define zeta_512 (qdata+59) | |||
-3593, 2005, 2665, 2891, -2804, 2345, -2572, 1121, -617, -188, 727, 2786, -396, -3208, 3417, -17, | |||
1414, -3752, 2579, 2815, -549, 1837, 373, 151, 3706, -1012, 3750, -1509, 810, -3214, -1054, 3177, | |||
-2194, -1403, -2133, -3314, -1321, 83, 1681, -658, -1296, 2070, -1386, -3547, 438, 3781, -2732, 2230, | |||
-2495, -1669, 1919, 2589, -2535, -3312, -2391, -3542, -2237, -1441, 2835, -3568, -3689, -402, 2, -1070, | |||
-3625, 3763, -783, -3550, 2043, -2303, 3145, -436, 2830, -893, 1533, 1712, -1881, 124, 2789, -2001, | |||
2876, -2460, 2649, 3770, 3153, 2965, 3692, -1203, -1599, 2874, -1390, -1407, 7, -3745, -1166, 1649, | |||
1100, 2937, 3310, 3461, -514, -1526, 2224, 715, 1525, -1689, -2743, 434, -1760, -3163, 2385, -929, | |||
-2250, -2167, -486, -1144, -2440, -370, -1756, 2378, 2816, -1084, -3816, -1586, 3600, 1931, -3831, -1242, | |||
-3777, 592, -1799, 2340, 103, -1338, 1497, -2071, 1921, 1519, 1521, 451, -3174, 589, -194, -3744, | |||
-1701, 3677, -859, -1295, 1535, 642, 2175, -3794, 2006, 2130, -2762, 2918, -1887, 3334, -1698, 2072, | |||
-2456, 509, -3480, 2998, 1399, -3408, 2883, 1476, 1483, -2262, -3428, -1779, -679, 2258, -2113, 1348, | |||
834, -692, 1532, 2247, 3772, 2083, -660, -226, -1986, 2532, -2764, -3693, 1738, -429, -915, -2059, | |||
3182, 2812, 1056, 3434, -1431, -2515, 1350, -236, 3364, -2386, 1464, 222, -3555, -2963, 2919, -2422, | |||
-2319, -3657, -2160, 3450, -2310, -791, 730, 1181, -1993, -1404, -1598, 2339, 638, -3366, 3456, 2161, | |||
3696, -3343, -1168, 2719, -2956, -826, -3588, -670, -2557, 777, -921, 1151, -1305, -796, 3405, -1278, | |||
121, -3287, -404, 1072, 2555, 293, -3135, 2767, 2088, -3335, 2233, 3581, -3266, 3723, -2426, -179, | |||
3593, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
#define qinvzeta_512 (qdata+76) | |||
-9, 4565, -17303, 16715, -29428, 15145, 24052, -22943, 19351, 1860, -12073, -28958, 26228, -7304, -24743, -529, | |||
20870, -24232, -12269, 10495, 21467, -16083, 19317, 20119, -15750, -27636, -25946, -12261, 5930, -26766, 32738, -16791, | |||
4974, 25733, -4693, 20238, -14121, 18003, 2193, 6510, -9488, 29718, 26262, -25563, -21066, -1851, 7508, -19274, | |||
22593, -28805, 9599, -23523, 2073, 4880, 10409, 1578, 7491, -10145, -12013, 4624, 16279, 6766, -15358, 24530, | |||
-16425, 5299, -16655, -2526, -25093, -9983, 32329, 5708, 26382, -23933, 24573, 26288, 26279, 30844, 13541, 30255, | |||
828, 15972, -25511, 17082, -29103, -27243, 26220, -2739, 23489, 16186, -8558, -9087, 11783, -12449, -24718, -14223, | |||
7244, -8839, 10478, 30597, 14846, -12790, 26800, 14539, 20469, -6297, 26441, 9650, 14624, -25179, -29871, -9633, | |||
-23754, -5751, -3558, 2952, -6536, 23182, -16092, 23882, 2816, 964, 8472, -10802, 11792, -17013, -7415, -30938, | |||
-28865, -23984, -13575, -11996, -4505, -14650, -26663, -22039, -5759, 1007, -14351, 10179, -6246, -947, -17602, -20128, | |||
20315, 10333, -22875, -17167, 9215, -14718, 9855, -29394, -3114, 27730, -24266, 5990, 6817, 22790, -2722, 14360, | |||
-14744, 23549, -15768, -18506, 4983, 21168, 12611, 3524, 15307, 2858, -21860, 29453, -28839, 27858, -27201, 3396, | |||
18242, 5452, 32252, -18745, 1724, -4573, 21868, 31518, -19394, 20964, -8908, -18541, 23242, 17491, 13933, 16885, | |||
10350, -32004, 17440, -24214, -21399, -20435, -11962, -22764, -10972, -27986, 30136, -802, -29667, 11885, -1689, -13686, | |||
-18191, 32695, 6032, -16006, -21766, -20759, 30426, -24931, -31177, -32124, 15810, -4317, 15998, 26330, 3456, -13711, | |||
-4496, -19215, -9360, 26783, 23668, -14138, 27132, -32414, -25597, -2807, -5529, 8831, -5913, 17636, 1869, -16638, | |||
-11655, 9513, 22124, 25648, -24581, -21723, 21953, -14129, 22568, -15111, 23225, 26621, -20674, -15221, 17030, -1715, | |||
9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
} | |||
}; | |||
static const vec1488 qdata_10753 = { .data = { | |||
10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, | |||
24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, 24964, | |||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, | |||
223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, | |||
27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, | |||
4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, | |||
-1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, | |||
3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, | |||
-408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, | |||
1018, 1018, 1018, 1018, 2413, 2413, 2413, 2413, 4188, 4188, 4188, 4188, 357, 357, 357, 357, | |||
223, 223, 223, 223, -3686, -3686, -3686, -3686, -3688, -3688, -3688, -3688, -376, -376, -376, -376, | |||
-1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -6, -6, -6, 10093, 10093, 10093, 10093, -1956, -1956, -1956, -1956, 28517, 28517, 28517, 28517, | |||
27359, 27359, 27359, 27359, -21094, -21094, -21094, -21094, 408, 408, 408, 408, -20856, -20856, -20856, -20856, | |||
6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, 1018, 1018, 1018, -2695, -2695, -2695, -2695, 2413, 2413, 2413, 2413, 425, 425, 425, 425, | |||
4188, 4188, 4188, 4188, -4855, -4855, -4855, -4855, 357, 357, 357, 357, -3364, -3364, -3364, -3364, | |||
223, 223, 223, 223, 730, 730, 730, 730, -3686, -3686, -3686, -3686, -4544, -4544, -4544, -4544, | |||
-3688, -3688, -3688, -3688, -2236, -2236, -2236, -2236, -376, -376, -376, -376, 3784, 3784, 3784, 3784, | |||
-1018, -1018, -1018, -1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -6, -6, -6, 7033, 7033, 7033, 7033, 10093, 10093, 10093, 10093, 18345, 18345, 18345, 18345, | |||
-1956, -1956, -1956, -1956, 29449, 29449, 29449, 29449, 28517, 28517, 28517, 28517, -9508, -9508, -9508, -9508, | |||
27359, 27359, 27359, 27359, 16090, 16090, 16090, 16090, -21094, -21094, -21094, -21094, 28224, 28224, 28224, 28224, | |||
408, 408, 408, 408, -12476, -12476, -12476, -12476, -20856, -20856, -20856, -20856, 16072, 16072, 16072, 16072, | |||
6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, -1520, -2695, 1341, 2413, 918, 425, 5175, 4188, -4035, -4855, 341, 357, 4347, -3364, 5213, | |||
223, -4875, 730, 1931, -3686, -2503, -4544, -4095, -3688, 5063, -2236, -3823, -376, 3012, 3784, -2629, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, 23056, 7033, 829, 10093, 26518, 18345, 3639, -1956, -4547, 29449, 3925, 28517, -7429, -9508, -11683, | |||
27359, -17675, 16090, 14731, -21094, -25543, 28224, -14847, 408, 28103, -12476, 10001, -20856, -7228, 16072, 18363, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, -2935, -1520, -4744, -2695, -205, 1341, 1299, 2413, 4, 918, -4379, 425, -4616, 5175, -544, | |||
4188, 4129, -4035, 4102, -4855, -1287, 341, -2388, 357, 1284, 4347, 2984, -3364, 2178, 5213, -2576, | |||
223, 2790, -4875, 4876, 730, -4513, 1931, -3085, -3686, 3550, -2503, 847, -4544, 193, -4095, 1085, | |||
-3688, 3091, 5063, -4742, -2236, 2982, -3823, -1009, -376, -268, 3012, 3062, 3784, -2565, -2629, 4189, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, 31369, 23056, 15736, 7033, -24269, 829, -6381, 10093, 22532, 26518, 23781, 18345, 15864, 3639, 15840, | |||
-1956, -23007, -4547, 5126, 29449, 8441, 3925, -16724, 28517, 23812, -7429, 31656, -9508, -19326, -11683, -27152, | |||
27359, 20198, -17675, 6924, 16090, 22623, 14731, 5619, -21094, -24098, -25543, 3407, 28224, 22209, -14847, 573, | |||
408, -4589, 28103, -5766, -12476, -12378, 10001, -31217, -20856, -2316, -7228, -20490, 16072, -14341, 18363, -12707, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, -4734, -2935, -400, -1520, 4977, -4744, -2973, -2695, 512, -205, -779, 1341, -1356, 1299, 635, | |||
2413, 567, 4, -4286, 918, -5114, -4379, -1586, 425, 1615, -4616, -336, 5175, -1841, -544, 2234, | |||
4188, -3441, 4129, 636, -4035, -4580, 4102, 2684, -4855, 3057, -1287, -2740, 341, -5156, -2388, -472, | |||
357, -794, 1284, 578, 4347, 3615, 2984, -3715, -3364, 2271, 2178, -326, 5213, 454, -2576, -3337, | |||
223, 2998, 2790, -151, -4875, 2981, 4876, 1324, 730, 2774, -4513, 2206, 1931, 886, -3085, -970, | |||
-3686, 3198, 3550, 2737, -2503, -909, 847, 1068, -4544, -2213, 193, 2884, -4095, -4808, 1085, 4123, | |||
-3688, 5341, 3091, 5294, 5063, -116, -4742, -5116, -2236, -2045, 2982, -1572, -3823, 4828, -1009, 467, | |||
-376, 5023, -268, -3169, 3012, -1458, 3062, -1268, 3784, -675, -2565, 1006, -2629, 5064, 4189, 864, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -26238, 31369, -24976, 23056, -30351, 15736, -18845, 7033, 512, -24269, -13579, 829, 29364, -6381, -11141, | |||
10093, -969, 22532, 6978, 26518, -4090, 23781, 11726, 18345, 4175, 15864, 7856, 3639, 719, 15840, -31558, | |||
-1956, 31887, -23007, -21892, -4547, 22044, 5126, -19844, 29449, -32271, 8441, 32076, 3925, -11300, -16724, 28200, | |||
28517, 16614, 23812, 11842, -7429, -2017, 31656, 28541, -9508, 29407, -19326, 31418, -11683, -31290, -27152, 27895, | |||
27359, 12214, 20198, -14999, -17675, -1627, 6924, -13012, 16090, -4394, 22623, 7326, 14731, -22666, 5619, 8246, | |||
-21094, 24702, -24098, 177, -25543, 7795, 3407, -13268, 28224, 2395, 22209, -7356, -14847, -17096, 573, -24037, | |||
408, -11555, -4589, -30546, 28103, 1932, -5766, 17412, -12476, 31235, -12378, -7716, 10001, -1316, -31217, 25555, | |||
-20856, -609, -2316, -8801, -7228, 11854, -20490, 780, 16072, -17571, -14341, -2066, 18363, 17352, -12707, 17248, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1018, 3453, -4734, 4519, -2935, 2118, -400, -554, -1520, 2196, 4977, 1893, -4744, -1409, -2973, -1053, | |||
-2695, 4601, 512, 279, -205, -3241, -779, 4889, 1341, 3524, -1356, -1663, 1299, 2283, 635, 73, | |||
2413, 2428, 567, 624, 4, -1930, -4286, 3419, 918, -2062, -5114, 5068, -4379, -97, -1586, 1782, | |||
425, 4621, 1615, 355, -4616, 1349, -336, 825, 5175, 3135, -1841, 1160, -544, 4408, 2234, -2605, | |||
4188, 854, -3441, -1056, 4129, 2439, 636, 4967, -4035, -4782, -4580, -5268, 4102, -663, 2684, -4670, | |||
-4855, 3760, 3057, 3535, -1287, 2680, -2740, -569, 341, 2139, -5156, 3827, -2388, 1639, -472, 1927, | |||
357, 5172, -794, -4003, 1284, 4144, 578, 693, 4347, 4784, 3615, 3125, 2984, 1122, -3715, 2113, | |||
-3364, -573, 2271, -4328, 2178, 2909, -326, -4000, 5213, -4447, 454, -3995, -2576, -4428, -3337, 2529, | |||
223, 5309, 2998, 5120, 2790, -2050, -151, 2963, -4875, 2657, 2981, -2807, 4876, 2237, 1324, -4403, | |||
730, 2624, 2774, -5083, -4513, 40, 2206, 152, 1931, -1573, 886, 2625, -3085, -778, -970, -5107, | |||
-3686, 4250, 3198, -5356, 3550, -3148, 2737, -3360, -2503, -2015, -909, 3096, 847, 5313, 1068, 834, | |||
-4544, -1132, -2213, -2151, 193, -1722, 2884, -4393, -4095, 2662, -4808, -2788, 1085, -1992, 4123, 5334, | |||
-3688, 5215, 5341, -1689, 3091, -2117, 5294, 4859, 5063, 3410, -116, 2205, -4742, -2374, -5116, -4720, | |||
-2236, 3570, -2045, 2813, 2982, 2087, -1572, -4973, -3823, 458, 4828, 3891, -1009, -2419, 467, -4891, | |||
-376, -1381, 5023, 1204, -268, 274, -3169, -3260, 3012, -1635, -1458, 4540, 3062, -4254, -1268, -1111, | |||
3784, 2230, -675, -2279, -2565, -4359, 1006, -1510, -2629, 5015, 5064, -2449, 4189, -5005, 864, 2487, | |||
-1018, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
-6, -29827, -26238, -21593, 31369, -29626, -24976, -7722, 23056, -16236, -30351, 30053, 15736, 9343, -18845, -16925, | |||
7033, 14329, 512, 15127, -24269, -21161, -13579, -1767, 829, -6716, 29364, -12415, -6381, 31467, -11141, 1609, | |||
10093, -20100, -969, -23952, 22532, -25482, 6978, 8027, 26518, 17394, -4090, -25652, 23781, -5729, 11726, -21770, | |||
18345, -4083, 4175, -15517, 15864, -19643, 7856, -22215, 3639, -18881, 719, -19320, 15840, -7880, -31558, 22483, | |||
-1956, -6314, 31887, 15328, -23007, -7289, -21892, 11623, -4547, 31058, 22044, 13164, 5126, -15511, -19844, 6594, | |||
29449, 11952, -32271, 6095, 8441, 23160, 32076, 22471, 3925, 6747, -11300, 12531, -16724, 8295, 28200, -7801, | |||
28517, -29644, 16614, -20899, 23812, 12336, 11842, 20661, -7429, 12976, -2017, 23093, 31656, -3998, 28541, 24129, | |||
-9508, -61, 29407, -232, -19326, -13987, 31418, 12384, -11683, -31583, -31290, 24165, -27152, 26292, 27895, 8161, | |||
27359, 4797, 12214, 5120, 20198, 19454, -14999, -4717, -17675, 8289, -1627, 31497, 6924, 1725, -13012, 19661, | |||
16090, -30144, -4394, -9691, 22623, 28712, 7326, 4248, 14731, 3035, -22666, 24641, 5619, -24330, 8246, -13811, | |||
-21094, -13158, 24702, -23788, -24098, 27572, 177, 13024, -25543, -29151, 7795, 7192, 3407, 27329, -13268, 12098, | |||
28224, -19564, 2395, -8807, 22209, 32070, -7356, -22313, -14847, 20070, -17096, 23836, 573, -14280, -24037, -1834, | |||
408, 32351, -11555, 4967, -4589, 18875, -30546, -6917, 28103, -26286, 1932, 18077, -5766, 29370, 17412, 19856, | |||
-12476, 23026, 31235, -30467, -12378, -24025, -7716, -12653, 10001, -8758, -1316, -20173, -31217, -11123, 25555, 23269, | |||
-20856, -29541, -609, 31924, -2316, 3346, -8801, -13500, -7228, 14237, 11854, 14780, -20490, -9374, 780, 16809, | |||
16072, 11446, -17571, -8935, -14341, 5369, -2066, -18918, 18363, 19863, 17352, -16273, -12707, 3699, 17248, 951, | |||
6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
} | |||
}; | |||
static inline __m256i sub_x16(__m256i a, __m256i b) { | |||
//__asm__("vpsubw %1,%0,%0" : "+x"(a),"+x"(b)); | |||
return _mm256_sub_epi16(a, b); | |||
} | |||
static inline __m256i add_x16(__m256i a, __m256i b) { | |||
return _mm256_add_epi16(a, b); | |||
} | |||
static inline __m256i reduce_x16(const __m256i *qdata, __m256i x) { | |||
__m256i y = _mm256_mulhi_epi16(x, qrecip_x16); | |||
y = _mm256_mulhrs_epi16(y, qshift_x16); | |||
y = _mm256_mullo_epi16(y, q_x16); | |||
return sub_x16(x, y); | |||
} | |||
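/*
 * Scalar sketch of reduce_x16 (assumption: exposition only, not part of the
 * upstream file).  The table constants satisfy qrecip*qshift ~= 2^31/q
 * (17474*16 for q=7681, 24964*8 for q=10753), so the mulhi/mulhrs pair
 * computes roughly round(x/q) and the result is a centered representative of
 * x mod q.
 */
static inline int16 reduce_sketch(int16 x, int16 q, int16 qrecip, int16 qshift) {
    int16 y = (int16) (((int32_t) x * qrecip) >> 16);          /* vpmulhw                 */
    y = (int16) (((((int32_t) y * qshift) >> 14) + 1) >> 1);   /* vpmulhrsw: ~round(x/q)  */
    return (int16) (x - y * q);                                /* ~= x - q*round(x/q)     */
}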
static inline __m256i mulmod_x16_scaled(const __m256i *qdata, __m256i x, __m256i y, __m256i yqinv) { | |||
__m256i b = _mm256_mulhi_epi16(x, y); | |||
__m256i d = _mm256_mullo_epi16(x, yqinv); | |||
__m256i e = _mm256_mulhi_epi16(d, q_x16); | |||
return sub_x16(b, e); | |||
} | |||
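/*
 * Scalar sketch of mulmod_x16_scaled (assumption: exposition only, not part of
 * the upstream file).  y is a precomputed constant and yqinv = y * q^-1 mod 2^16;
 * because x*y - d*q is a multiple of 2^16, the result equals (x*y - d*q)/2^16,
 * i.e. x*y*2^-16 mod q, with absolute value below q for |y| < q.  This is why
 * the constant tables carry an extra factor of 2^16 mod q.
 */
static inline int16 mulmod_scaled_sketch(int16 x, int16 y, int16 yqinv, int16 q) {
    int16 b = (int16) (((int32_t) x * y) >> 16);   /* high half of x*y     */
    int16 d = (int16) (x * yqinv);                 /* low half of x*yqinv  */
    int16 e = (int16) (((int32_t) d * q) >> 16);   /* high half of d*q     */
    return (int16) (b - e);                        /* (x*y - d*q) / 2^16   */
}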
typedef union { | |||
int8 data[32]; | |||
__m256i _dummy; | |||
} byte32; | |||
static const byte32 shuffle_buf = { .data = { | |||
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, | |||
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, | |||
} | |||
}; | |||
#define shuffle (*(__m256i *) shuffle_buf.data) | |||
static inline __m256i _mm256_loadu_reverse16(const __m256i *p) { | |||
__m256i x = _mm256_loadu_si256(p); | |||
x = _mm256_permute2x128_si256(x, x, 1); | |||
x = _mm256_shuffle_epi8(x, shuffle); | |||
return x; | |||
} | |||
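/* Note: the 128-bit lane swap plus per-lane byte shuffle above reverses all 16
   int16 lanes, so the zetainv(n,i) macros read the zeta tables backwards from
   their top end. */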
static void ntt128(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; | |||
int16 *origf = f; | |||
int rep; | |||
__m256i zetainv_128_0 = zetainv(128, 0); | |||
__m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); | |||
__m256i zetainv_x4_32_0 = zetainv_x4(32, 0); | |||
__m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); | |||
__m256i zetainv_128_1 = zetainv(128, 1); | |||
__m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); | |||
__m256i zetainv_x4_32_1 = zetainv_x4(32, 1); | |||
__m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); | |||
for (rep = 0; rep < reps; ++rep) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_0, zetainv_qinv_128_0); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 0), zeta_qinv(128, 0)); | |||
g2 = _mm256_unpacklo_epi16(f2, f3); | |||
g3 = _mm256_unpackhi_epi16(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 0), zeta_qinv(64, 0)); | |||
f0 = reduce_x16(qdata, f0); | |||
g0 = _mm256_unpacklo_epi16(f0, f1); | |||
h0 = _mm256_unpacklo_epi32(g0, g2); | |||
h1 = _mm256_unpackhi_epi32(g0, g2); | |||
g1 = _mm256_unpackhi_epi16(f0, f1); | |||
h2 = _mm256_unpacklo_epi32(g1, g3); | |||
h3 = _mm256_unpackhi_epi32(g1, g3); | |||
f0 = _mm256_permute2x128_si256(h0, h1, 0x20); | |||
f2 = _mm256_permute2x128_si256(h0, h1, 0x31); | |||
f1 = _mm256_permute2x128_si256(h2, h3, 0x20); | |||
f3 = _mm256_permute2x128_si256(h2, h3, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f3); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_128_1, zetainv_qinv_128_1); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta(128, 1), zeta_qinv(128, 1)); | |||
g2 = _mm256_unpacklo_epi16(f2, f3); | |||
g3 = _mm256_unpackhi_epi16(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta(64, 1), zeta_qinv(64, 1)); | |||
f0 = reduce_x16(qdata, f0); | |||
g0 = _mm256_unpacklo_epi16(f0, f1); | |||
h0 = _mm256_unpacklo_epi32(g0, g2); | |||
h1 = _mm256_unpackhi_epi32(g0, g2); | |||
g1 = _mm256_unpackhi_epi16(f0, f1); | |||
h2 = _mm256_unpacklo_epi32(g1, g3); | |||
h3 = _mm256_unpackhi_epi32(g1, g3); | |||
f0 = _mm256_permute2x128_si256(h0, h1, 0x20); | |||
f2 = _mm256_permute2x128_si256(h0, h1, 0x31); | |||
f1 = _mm256_permute2x128_si256(h2, h3, 0x20); | |||
f3 = _mm256_permute2x128_si256(h2, h3, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_0, zetainv_x4_qinv_32_0); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); | |||
g2 = _mm256_unpacklo_epi64(f2, f3); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 0), zeta_x4_qinv(16, 0)); | |||
f0 = reduce_x16(qdata, f0); | |||
g1 = _mm256_unpackhi_epi64(f0, f1); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
f1 = _mm256_permute2x128_si256(g1, g3, 0x20); | |||
f3 = _mm256_permute2x128_si256(g1, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g2, 0x20); | |||
f2 = _mm256_permute2x128_si256(g0, g2, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f2); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_x4_32_1, zetainv_x4_qinv_32_1); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); | |||
g2 = _mm256_unpacklo_epi64(f2, f3); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta_x4(16, 1), zeta_x4_qinv(16, 1)); | |||
f0 = reduce_x16(qdata, f0); | |||
g1 = _mm256_unpackhi_epi64(f0, f1); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
f1 = _mm256_permute2x128_si256(g1, g3, 0x20); | |||
f3 = _mm256_permute2x128_si256(g1, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g2, 0x20); | |||
f2 = _mm256_permute2x128_si256(g0, g2, 0x31); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f2); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f2 = add_x16(g2, g3); | |||
f3 = sub_x16(g2, g3); | |||
f2 = reduce_x16(qdata, f2); | |||
f3 = reduce_x16(qdata, f3); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f0 = reduce_x16(qdata, f0); | |||
h0 = f0; | |||
h1 = f1; | |||
h2 = f2; | |||
h3 = f3; | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv8_x16, zetainv8_x16_qinv); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta8_x16, zeta8_x16_qinv); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta4_x16, zeta4_x16_qinv); | |||
f0 = reduce_x16(qdata, f0); | |||
g0 = add_x16(h0, f0); | |||
g1 = add_x16(h1, f1); | |||
g2 = add_x16(h2, f2); | |||
g3 = add_x16(h3, f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), g0); | |||
_mm256_storeu_si256((__m256i *) (f + 16), g1); | |||
_mm256_storeu_si256((__m256i *) (f + 32), g2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), g3); | |||
g0 = sub_x16(h0, f0); | |||
g1 = sub_x16(h1, f1); | |||
g2 = sub_x16(h2, f2); | |||
g3 = sub_x16(h3, f3); | |||
_mm256_storeu_si256((__m256i *) (f + 64), g0); | |||
_mm256_storeu_si256((__m256i *) (f + 80), g1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), g2); | |||
_mm256_storeu_si256((__m256i *) (f + 112), g3); | |||
f += 128; | |||
} | |||
} | |||
static void ntt512(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ | |||
int16 *origf = f; | |||
int rep; | |||
__m256i zetainv_512[8]; | |||
__m256i zetainv_qinv_512[8]; | |||
int i; | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_512[i] = zetainv(512, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_qinv_512[i] = zetainv_qinv(512, i); | |||
} | |||
for (rep = 0; rep < reps; ++rep) { | |||
for (i = 0; i < 8; ++i) { | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); | |||
g3 = sub_x16(f1, f3); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f1, f3); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); | |||
g2 = sub_x16(f0, f2); | |||
g0 = add_x16(f0, f2); | |||
f3 = sub_x16(g3, g2); | |||
f2 = add_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zetainv_512[i], zetainv_qinv_512[i]); | |||
f2 = mulmod_x16_scaled(qdata, f2, zeta(512, i), zeta_qinv(512, i)); | |||
f1 = sub_x16(g0, g1); | |||
f0 = add_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zeta(256, i), zeta_qinv(256, i)); | |||
f0 = reduce_x16(qdata, f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i), f0); | |||
} | |||
f += 512; | |||
} | |||
f = origf; | |||
ntt128(f, reps * 4, qdata); | |||
} | |||
void PQCLEAN_NTRULPR761_AVX2_ntt512_7681(int16 *f, int reps) { | |||
ntt512(f, reps, (const __m256i *) qdata_7681.data); | |||
} | |||
void PQCLEAN_NTRULPR761_AVX2_ntt512_10753(int16 *f, int reps) { | |||
ntt512(f, reps, (const __m256i *) qdata_10753.data); | |||
} | |||
static void invntt128(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3, h0, h1, h2, h3; | |||
int16 *origf = f; | |||
int rep; | |||
__m256i zetainv_x4_16_0 = zetainv_x4(16, 0); | |||
__m256i zetainv_x4_qinv_16_0 = zetainv_x4_qinv(16, 0); | |||
__m256i zetainv_x4_32_0 = zetainv_x4(32, 0); | |||
__m256i zetainv_x4_qinv_32_0 = zetainv_x4_qinv(32, 0); | |||
__m256i zetainv_64_0 = zetainv(64, 0); | |||
__m256i zetainv_qinv_64_0 = zetainv_qinv(64, 0); | |||
__m256i zetainv_128_0 = zetainv(128, 0); | |||
__m256i zetainv_qinv_128_0 = zetainv_qinv(128, 0); | |||
__m256i zetainv_x4_16_1 = zetainv_x4(16, 1); | |||
__m256i zetainv_x4_qinv_16_1 = zetainv_x4_qinv(16, 1); | |||
__m256i zetainv_x4_32_1 = zetainv_x4(32, 1); | |||
__m256i zetainv_x4_qinv_32_1 = zetainv_x4_qinv(32, 1); | |||
__m256i zetainv_64_1 = zetainv(64, 1); | |||
__m256i zetainv_qinv_64_1 = zetainv_qinv(64, 1); | |||
__m256i zetainv_128_1 = zetainv(128, 1); | |||
__m256i zetainv_qinv_128_1 = zetainv_qinv(128, 1); | |||
for (rep = 0; rep < reps; ++rep) { | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g0 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
g1 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g2 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
g3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
h1 = sub_x16(f0, f1); | |||
h1 = reduce_x16(qdata, h1); | |||
h0 = add_x16(f0, f1); | |||
h3 = sub_x16(f2, f3); | |||
h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); | |||
h2 = add_x16(f2, f3); | |||
f1 = sub_x16(g0, g1); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv8_x16, zetainv8_x16_qinv); | |||
f0 = add_x16(g0, g1); | |||
f3 = sub_x16(g2, g3); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta8_x16, zeta8_x16_qinv); | |||
f2 = add_x16(g2, g3); | |||
g0 = add_x16(h0, h2); | |||
g0 = reduce_x16(qdata, g0); | |||
g2 = sub_x16(h0, h2); | |||
g2 = reduce_x16(qdata, g2); | |||
g1 = sub_x16(h1, h3); | |||
g3 = add_x16(h1, h3); | |||
h2 = sub_x16(f0, f2); | |||
h2 = mulmod_x16_scaled(qdata, h2, zeta4_x16, zeta4_x16_qinv); | |||
h0 = add_x16(f0, f2); | |||
h3 = add_x16(f1, f3); | |||
h3 = mulmod_x16_scaled(qdata, h3, zeta4_x16, zeta4_x16_qinv); | |||
h1 = sub_x16(f1, f3); | |||
f0 = add_x16(g0, h0); | |||
g0 = sub_x16(g0, h0); | |||
f1 = add_x16(g1, h1); | |||
g1 = sub_x16(g1, h1); | |||
f2 = sub_x16(g2, h2); | |||
g2 = add_x16(g2, h2); | |||
f3 = sub_x16(g3, h3); | |||
g3 = add_x16(g3, h3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 32), g0); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), g1); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), g2); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 112), g3); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
g1 = _mm256_unpacklo_epi64(f2, f3); | |||
g2 = _mm256_unpackhi_epi64(f0, f1); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f2 = _mm256_permute2x128_si256(g0, g1, 0x31); | |||
f3 = _mm256_permute2x128_si256(g2, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g1, 0x20); | |||
f1 = _mm256_permute2x128_si256(g2, g3, 0x20); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_0, zetainv_x4_qinv_32_0); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 0), zeta_x4_qinv(32, 0)); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g2 = sub_x16(f3, f2); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_0, zetainv_x4_qinv_16_0); | |||
g1 = add_x16(f0, f1); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f2); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g0 = _mm256_unpacklo_epi64(f0, f1); | |||
g1 = _mm256_unpacklo_epi64(f2, f3); | |||
g2 = _mm256_unpackhi_epi64(f0, f1); | |||
g3 = _mm256_unpackhi_epi64(f2, f3); | |||
f2 = _mm256_permute2x128_si256(g0, g1, 0x31); | |||
f3 = _mm256_permute2x128_si256(g2, g3, 0x31); | |||
f0 = _mm256_permute2x128_si256(g0, g1, 0x20); | |||
f1 = _mm256_permute2x128_si256(g2, g3, 0x20); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_x4_32_1, zetainv_x4_qinv_32_1); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta_x4(32, 1), zeta_x4_qinv(32, 1)); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g2 = sub_x16(f3, f2); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_x4_16_1, zetainv_x4_qinv_16_1); | |||
g1 = add_x16(f0, f1); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f2); | |||
f += 128; | |||
} | |||
f = origf; | |||
for (rep = 0; rep < reps; ++rep) { | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 0)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 64)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 32)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 96)); | |||
g0 = _mm256_permute2x128_si256(f0, f2, 0x20); | |||
g2 = _mm256_permute2x128_si256(f0, f2, 0x31); | |||
f0 = _mm256_unpacklo_epi16(g0, g2); | |||
f2 = _mm256_unpackhi_epi16(g0, g2); | |||
g1 = _mm256_permute2x128_si256(f1, f3, 0x20); | |||
g3 = _mm256_permute2x128_si256(f1, f3, 0x31); | |||
f1 = _mm256_unpacklo_epi16(g1, g3); | |||
f3 = _mm256_unpackhi_epi16(g1, g3); | |||
g1 = _mm256_unpackhi_epi16(f0, f2); | |||
g0 = _mm256_unpacklo_epi16(f0, f2); | |||
g3 = _mm256_unpackhi_epi16(f1, f3); | |||
g2 = _mm256_unpacklo_epi16(f1, f3); | |||
f2 = _mm256_unpacklo_epi64(g1, g3); | |||
f3 = _mm256_unpackhi_epi64(g1, g3); | |||
f0 = _mm256_unpacklo_epi64(g0, g2); | |||
f1 = _mm256_unpackhi_epi64(g0, g2); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_0, zetainv_qinv_128_0); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 0), zeta_qinv(128, 0)); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_0, zetainv_qinv_64_0); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f0, f1); | |||
g2 = sub_x16(f3, f2); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f2); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16)); | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 80)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 48)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 112)); | |||
g0 = _mm256_permute2x128_si256(f0, f2, 0x20); | |||
g2 = _mm256_permute2x128_si256(f0, f2, 0x31); | |||
f0 = _mm256_unpacklo_epi16(g0, g2); | |||
f2 = _mm256_unpackhi_epi16(g0, g2); | |||
g1 = _mm256_permute2x128_si256(f1, f3, 0x20); | |||
g3 = _mm256_permute2x128_si256(f1, f3, 0x31); | |||
f1 = _mm256_unpacklo_epi16(g1, g3); | |||
f3 = _mm256_unpackhi_epi16(g1, g3); | |||
g1 = _mm256_unpackhi_epi16(f0, f2); | |||
g0 = _mm256_unpacklo_epi16(f0, f2); | |||
g3 = _mm256_unpackhi_epi16(f1, f3); | |||
g2 = _mm256_unpacklo_epi16(f1, f3); | |||
f2 = _mm256_unpacklo_epi64(g1, g3); | |||
f3 = _mm256_unpackhi_epi64(g1, g3); | |||
f0 = _mm256_unpacklo_epi64(g0, g2); | |||
f1 = _mm256_unpackhi_epi64(g0, g2); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_128_1, zetainv_qinv_128_1); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta(128, 1), zeta_qinv(128, 1)); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_64_1, zetainv_qinv_64_1); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g1 = add_x16(f0, f1); | |||
g2 = sub_x16(f3, f2); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 48), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 112), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 16), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 80), f2); | |||
f += 128; | |||
} | |||
} | |||
static void invntt512(int16 *f, int reps, const __m256i *qdata) { | |||
__m256i f0, f1, f2, f3, g0, g1, g2, g3; /* [-Werror=unused-variable] */ /* ,h0,h1,h2,h3; */ | |||
/* [-Werror=unused-variable] */ /* int16 *origf = f; */ | |||
int rep; | |||
__m256i zetainv_512[8]; | |||
__m256i zetainv_qinv_512[8]; | |||
__m256i zetainv_256[8]; | |||
__m256i zetainv_qinv_256[8]; | |||
int i; | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_512[i] = zetainv(512, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_qinv_512[i] = zetainv_qinv(512, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_256[i] = zetainv(256, i); | |||
} | |||
for (i = 0; i < 8; ++i) { | |||
zetainv_qinv_256[i] = zetainv_qinv(256, i); | |||
} | |||
invntt128(f, 4 * reps, qdata); | |||
for (rep = 0; rep < reps; ++rep) { | |||
for (i = 0; i < 8; ++i) { | |||
f2 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 256)); | |||
f3 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 384)); | |||
f2 = mulmod_x16_scaled(qdata, f2, zetainv_512[i], zetainv_qinv_512[i]); | |||
f3 = mulmod_x16_scaled(qdata, f3, zeta(512, i), zeta_qinv(512, i)); | |||
g3 = add_x16(f3, f2); | |||
g3 = mulmod_x16_scaled(qdata, g3, zeta4_x16, zeta4_x16_qinv); | |||
g2 = sub_x16(f3, f2); | |||
f0 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 0)); | |||
f1 = _mm256_loadu_si256((__m256i *) (f + 16 * i + 128)); | |||
f0 = reduce_x16(qdata, f0); | |||
f1 = mulmod_x16_scaled(qdata, f1, zetainv_256[i], zetainv_qinv_256[i]); | |||
g1 = add_x16(f0, f1); | |||
g0 = sub_x16(f0, f1); | |||
f1 = add_x16(g1, g3); | |||
f3 = sub_x16(g1, g3); | |||
f0 = add_x16(g0, g2); | |||
f2 = sub_x16(g0, g2); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 128), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 384), f3); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 16 * i + 256), f2); | |||
} | |||
f += 512; | |||
} | |||
} | |||
void PQCLEAN_NTRULPR761_AVX2_invntt512_7681(int16 *f, int reps) { | |||
invntt512(f, reps, (const __m256i *) qdata_7681.data); | |||
} | |||
void PQCLEAN_NTRULPR761_AVX2_invntt512_10753(int16 *f, int reps) { | |||
invntt512(f, reps, (const __m256i *) qdata_10753.data); | |||
} |
@@ -0,0 +1,13 @@ | |||
#ifndef ntt_H | |||
#define ntt_H | |||
#include <stdint.h> | |||
extern void PQCLEAN_NTRULPR761_AVX2_ntt512_7681(int16_t *f, int reps); | |||
extern void PQCLEAN_NTRULPR761_AVX2_ntt512_10753(int16_t *f, int reps); | |||
extern void PQCLEAN_NTRULPR761_AVX2_invntt512_7681(int16_t *f, int reps); | |||
extern void PQCLEAN_NTRULPR761_AVX2_invntt512_10753(int16_t *f, int reps); | |||
#endif |
@@ -0,0 +1,11 @@ | |||
#include "crypto_decode_256x16.h" | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16(void *v, const unsigned char *s) { | |||
unsigned char *T = v; | |||
int i; | |||
for (i = 0; i < 128; ++i) { | |||
T[2 * i] = s[i] & 15; | |||
T[2 * i + 1] = s[i] >> 4; | |||
} | |||
} |
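/*
 * The decoder above splits each of the 128 input bytes into two 4-bit values,
 * low nibble first.  A matching scalar packer, so that pack(decode(s)) == s
 * (hypothetical helper for exposition; the real inverse is crypto_encode_256x16):
 */
static inline void pack_256x16_sketch(unsigned char *s, const unsigned char *T) {
    int i;
    for (i = 0; i < 128; ++i) {
        s[i] = (unsigned char) ((T[2 * i] & 15) | ((T[2 * i + 1] & 15) << 4));
    }
}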
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X16_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16_STRBYTES 128 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16_ITEMS 256 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x16(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,27 @@ | |||
#include "crypto_decode_256x2.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define COPY _mm256_set_epi64x(0x0303030303030303,0x0202020202020202,0x0101010101010101,0x0000000000000000) | |||
#define MASK _mm256_set1_epi64x(0x8040201008040201) | |||
#define MASK2 _mm256_set1_epi64x(0x0101010101010101) | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2(void *v, const unsigned char *s) { | |||
__m256i *r = v; | |||
int i; | |||
for (i = 0; i < 8; ++i) { | |||
/* bytes s0 s1 s2 s3 */ | |||
__m256i x = _mm256_set1_epi32(*(int32_t *) s); | |||
/* s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 s0 s1 s2 s3 */ | |||
x = _mm256_shuffle_epi8(x, COPY); | |||
/* s0 s0 s0 s0 s0 s0 s0 s0 s1 s1 s1 s1 s1 s1 s1 s1 s2 s2 s2 s2 s2 s2 s2 s2 s3 s3 s3 s3 s3 s3 s3 s3 */ | |||
x = _mm256_andnot_si256(x, MASK); | |||
x = _mm256_cmpeq_epi8(x, _mm256_setzero_si256()); | |||
x &= MASK2; | |||
_mm256_storeu_si256(r, x); | |||
s += 4; | |||
r += 1; | |||
} | |||
} |
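/*
 * Scalar equivalent of the AVX2 routine above (assumption: exposition only,
 * not part of the upstream file): every input byte expands to 8 output bytes
 * holding one bit each, least significant bit first, i.e.
 * r[8*k + j] = (s[k] >> j) & 1.
 */
static inline void decode_256x2_sketch(unsigned char *r, const unsigned char *s) {
    int i;
    for (i = 0; i < 256; ++i) {
        r[i] = (unsigned char) ((s[i >> 3] >> (i & 7)) & 1);
    }
}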
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X2_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_256X2_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2_STRBYTES 32 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2_ITEMS 256 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_256x2(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,436 @@ | |||
#include "crypto_decode_761x1531.h" | |||
#include <immintrin.h> | |||
/* auto-generated; do not edit */ | |||
#define int16 int16_t | |||
#define int32 int32_t | |||
static inline int16 mullo(int16 x, int16 y) { | |||
return x * y; | |||
} | |||
static inline int16 mulhi(int16 x, int16 y) { | |||
return (x * (int32)y) >> 16; | |||
} | |||
static inline __m256i add(__m256i x, __m256i y) { | |||
return _mm256_add_epi16(x, y); | |||
} | |||
static inline __m256i sub(__m256i x, __m256i y) { | |||
return _mm256_sub_epi16(x, y); | |||
} | |||
static inline __m256i shiftleftconst(__m256i x, int16 y) { | |||
return _mm256_slli_epi16(x, y); | |||
} | |||
static inline __m256i signedshiftrightconst(__m256i x, int16 y) { | |||
return _mm256_srai_epi16(x, y); | |||
} | |||
static inline __m256i addconst(__m256i x, int16 y) { | |||
return add(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i subconst(__m256i x, int16 y) { | |||
return sub(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i mulloconst(__m256i x, int16 y) { | |||
return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i mulhiconst(__m256i x, int16 y) { | |||
return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); | |||
} | |||
static inline __m256i ifgesubconst(__m256i x, int16 y) { | |||
__m256i y16 = _mm256_set1_epi16(y); | |||
__m256i top16 = _mm256_set1_epi16((int16)(y - 1)); | |||
return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); | |||
} | |||
static inline __m256i ifnegaddconst(__m256i x, int16 y) { | |||
return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); | |||
} | |||
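/*
 * Scalar versions of the two branchless range fixups above (assumption:
 * exposition only, not part of the upstream file).  ifnegaddconst adds y
 * exactly when x is negative, via the arithmetic-shift sign mask;
 * ifgesubconst subtracts y exactly when x >= y, via the compare-against-(y-1)
 * mask.
 */
static inline int16 ifnegaddconst_sketch(int16 x, int16 y) {
    return (int16) (x + ((x >> 15) & y));           /* x>>15 is 0 or all-ones */
}
static inline int16 ifgesubconst_sketch(int16 x, int16 y) {
    return (int16) (x - ((x > (int16) (y - 1)) ? y : 0));
}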
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s) { | |||
int16 *R0 = v; | |||
int16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; | |||
long long i; | |||
int16 a0, a1, a2; | |||
__m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; | |||
s += PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_STRBYTES; | |||
a1 = 0; | |||
a1 += *--s; /* 0...255 */ | |||
a1 = mulhi(a1, -84) - mulhi(mullo(a1, -4828), 3475); | |||
a1 += *--s; /* -1738...1992 */ | |||
a1 += (a1 >> 15) & 3475; /* 0...3474 */ | |||
R10[0] = a1; | |||
/* R10 ------> R9: reconstruct mod 1*[593]+[1500] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R10[0]; | |||
a0 = mulhi(a0, 60) - mulhi(mullo(a0, -28292), 593); /* -297...311 */ | |||
a0 += s[1 * i + 0]; /* -297...566 */ | |||
a0 += (a0 >> 15) & 593; /* 0...592 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, -31055); | |||
/* invalid inputs might need reduction mod 1500 */ | |||
a1 -= 1500; | |||
a1 += (a1 >> 15) & 1500; | |||
R9[0] = a0; | |||
R9[1] = a1; | |||
s -= 0; | |||
/* R9 ------> R8: reconstruct mod 2*[6232]+[1500] */ | |||
R8[2] = R9[1]; | |||
s -= 2; | |||
for (i = 0; i >= 0; --i) { | |||
a2 = a0 = R9[i]; | |||
a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3116...3284 */ | |||
a0 += s[2 * i + 1]; /* -3116...3539 */ | |||
a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3148...3152 */ | |||
a0 += s[2 * i + 0]; /* -3148...3407 */ | |||
a0 += (a0 >> 15) & 6232; /* 0...6231 */ | |||
a1 = (a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3); | |||
a1 = mullo(a1, 12451); | |||
/* invalid inputs might need reduction mod 6232 */ | |||
a1 -= 6232; | |||
a1 += (a1 >> 15) & 6232; | |||
R8[2 * i] = a0; | |||
R8[2 * i + 1] = a1; | |||
} | |||
/* R8 ------> R7: reconstruct mod 5*[1263]+[304] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R8[2]; | |||
a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ | |||
a0 += s[1 * i + 0]; /* -751...886 */ | |||
a0 += (a0 >> 15) & 1263; /* 0...1262 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, -22001); | |||
/* invalid inputs might need reduction mod 304 */ | |||
a1 -= 304; | |||
a1 += (a1 >> 15) & 304; | |||
R7[4] = a0; | |||
R7[5] = a1; | |||
s -= 2; | |||
for (i = 1; i >= 0; --i) { | |||
a2 = a0 = R8[i]; | |||
a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ | |||
a0 += s[1 * i + 0]; /* -751...886 */ | |||
a0 += (a0 >> 15) & 1263; /* 0...1262 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, -22001); | |||
/* invalid inputs might need reduction mod 1263 */ | |||
a1 -= 1263; | |||
a1 += (a1 >> 15) & 1263; | |||
R7[2 * i] = a0; | |||
R7[2 * i + 1] = a1; | |||
} | |||
/* R7 ------> R6: reconstruct mod 11*[9097]+[2188] */ | |||
i = 0; | |||
s -= 2; | |||
a0 = R7[5]; | |||
a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ | |||
a0 += s[2 * i + 1]; /* -4549...5390 */ | |||
a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ | |||
a0 += s[2 * i + 0]; /* -4712...4996 */ | |||
a0 += (a0 >> 15) & 9097; /* 0...9096 */ | |||
a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; | |||
a1 = mullo(a1, 17081); | |||
/* invalid inputs might need reduction mod 2188 */ | |||
a1 -= 2188; | |||
a1 += (a1 >> 15) & 2188; | |||
R6[10] = a0; | |||
R6[11] = a1; | |||
s -= 10; | |||
for (i = 4; i >= 0; --i) { | |||
a0 = R7[i]; | |||
a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ | |||
a0 += s[2 * i + 1]; /* -4549...5390 */ | |||
a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ | |||
a0 += s[2 * i + 0]; /* -4712...4996 */ | |||
a0 += (a0 >> 15) & 9097; /* 0...9096 */ | |||
a1 = (s[2 * i + 1] << 8) + s[2 * i] - a0; | |||
a1 = mullo(a1, 17081); | |||
/* invalid inputs might need reduction mod 9097 */ | |||
a1 -= 9097; | |||
a1 += (a1 >> 15) & 9097; | |||
R6[2 * i] = a0; | |||
R6[2 * i + 1] = a1; | |||
} | |||
/* R6 ------> R5: reconstruct mod 23*[1526]+[367] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R6[11]; | |||
a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ | |||
a0 += s[1 * i + 0]; /* -763...1111 */ | |||
a0 += (a0 >> 15) & 1526; /* 0...1525 */ | |||
a1 = (a2 << 7) + ((s[i] - a0) >> 1); | |||
a1 = mullo(a1, -18381); | |||
/* invalid inputs might need reduction mod 367 */ | |||
a1 -= 367; | |||
a1 += (a1 >> 15) & 367; | |||
R5[22] = a0; | |||
R5[23] = a1; | |||
s -= 11; | |||
for (i = 10; i >= 0; --i) { | |||
a2 = a0 = R6[i]; | |||
a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ | |||
a0 += s[1 * i + 0]; /* -763...1111 */ | |||
a0 += (a0 >> 15) & 1526; /* 0...1525 */ | |||
a1 = (a2 << 7) + ((s[i] - a0) >> 1); | |||
a1 = mullo(a1, -18381); | |||
/* invalid inputs might need reduction mod 1526 */ | |||
a1 -= 1526; | |||
a1 += (a1 >> 15) & 1526; | |||
R5[2 * i] = a0; | |||
R5[2 * i + 1] = a1; | |||
} | |||
/* R5 ------> R4: reconstruct mod 47*[625]+[150] */ | |||
i = 0; | |||
s -= 1; | |||
a2 = a0 = R5[23]; | |||
a0 = mulhi(a0, -284) - mulhi(mullo(a0, -26844), 625); /* -384...312 */ | |||
a0 += s[1 * i + 0]; /* -384...567 */ | |||
a0 += (a0 >> 15) & 625; /* 0...624 */ | |||
a1 = (a2 << 8) + s[i] - a0; | |||
a1 = mullo(a1, 32401); | |||
/* invalid inputs might need reduction mod 150 */ | |||
a1 -= 150; | |||
a1 += (a1 >> 15) & 150; | |||
R4[46] = a0; | |||
R4[47] = a1; | |||
s -= 23; | |||
i = 7; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); | |||
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); | |||
A0 = sub(mulhiconst(A0, -284), mulhiconst(mulloconst(A0, -26844), 625)); /* -384...312 */ | |||
A0 = add(A0, S0); /* -384...567 */ | |||
A0 = ifnegaddconst(A0, 625); /* 0...624 */ | |||
A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, 32401); | |||
/* invalid inputs might need reduction mod 625 */ | |||
A1 = ifgesubconst(A1, 625); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
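/*
 * The vectorized loops below perform the same arithmetic as the scalar code,
 * 16 limbs at a time: A0/A1 hold the even/odd outputs, the unpacklo/unpackhi
 * + permute2x128 sequence re-interleaves them into natural order, and the
 * index update i = -16 - ((~15) & -i) steps i down to the next multiple of
 * 16 (so the first block may overlap the second; overlapping limbs are
 * simply recomputed with the same values), ending with the block at i = 0.
 */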
/* R4 ------> R3: reconstruct mod 95*[6400]+[1531] */ | |||
i = 0; | |||
s -= 2; | |||
a2 = a0 = R4[47]; | |||
a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3200...3904 */ | |||
a0 += s[2 * i + 1]; /* -3200...4159 */ | |||
a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3338...3378 */ | |||
a0 += s[2 * i + 0]; /* -3338...3633 */ | |||
a0 += (a0 >> 15) & 6400; /* 0...6399 */ | |||
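/* here m = 6400 = 2^8 * 25: two bytes are absorbed per limb, the power-of-two
   factor is divided out by the arithmetic shift below, and 23593 = 25^-1
   mod 2^16 recovers the quotient */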
a1 = (a2 << 8) + s[2 * i + 1] + ((s[2 * i] - a0) >> 8); | |||
a1 = mullo(a1, 23593); | |||
/* invalid inputs might need reduction mod 1531 */ | |||
a1 -= 1531; | |||
a1 += (a1 >> 15) & 1531; | |||
R3[94] = a0; | |||
R3[95] = a1; | |||
s -= 94; | |||
i = 31; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); | |||
S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); | |||
S1 = _mm256_srli_epi16(S0, 8); | |||
S0 &= _mm256_set1_epi16(255); | |||
A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3200...3904 */ | |||
A0 = add(A0, S1); /* -3200...4159 */ | |||
A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3338...3378 */ | |||
A0 = add(A0, S0); /* -3338...3633 */ | |||
A0 = ifnegaddconst(A0, 6400); /* 0...6399 */ | |||
A1 = add(add(shiftleftconst(A2, 8), S1), signedshiftrightconst(sub(S0, A0), 8)); | |||
A1 = mulloconst(A1, 23593); | |||
/* invalid inputs might need reduction mod 6400 */ | |||
A1 = ifgesubconst(A1, 6400); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R3 ------> R2: reconstruct mod 190*[1280]+[1531] */ | |||
R2[190] = R3[95]; | |||
s -= 95; | |||
i = 79; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); | |||
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); | |||
A0 = sub(mulhiconst(A0, 256), mulhiconst(mulloconst(A0, -13107), 1280)); /* -640...704 */ | |||
A0 = add(A0, S0); /* -640...959 */ | |||
A0 = ifnegaddconst(A0, 1280); /* 0...1279 */ | |||
A1 = add(A2, signedshiftrightconst(sub(S0, A0), 8)); | |||
A1 = mulloconst(A1, -13107); | |||
/* invalid inputs might need reduction mod 1280 */ | |||
A1 = ifgesubconst(A1, 1280); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R2 ------> R1: reconstruct mod 380*[9157]+[1531] */ | |||
R1[380] = R2[190]; | |||
s -= 380; | |||
i = 174; | |||
for (;;) { | |||
A0 = _mm256_loadu_si256((__m256i *) &R2[i]); | |||
S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); | |||
S1 = _mm256_srli_epi16(S0, 8); | |||
S0 &= _mm256_set1_epi16(255); | |||
A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* -4579...4976 */ | |||
A0 = add(A0, S1); /* -4579...5231 */ | |||
A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* -4690...4705 */ | |||
A0 = add(A0, S0); /* -4690...4960 */ | |||
A0 = ifnegaddconst(A0, 9157); /* 0...9156 */ | |||
A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, 25357); | |||
/* invalid inputs might need reduction mod 9157 */ | |||
A1 = ifgesubconst(A1, 9157); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
/* R1 ------> R0: reconstruct mod 761*[1531] */ | |||
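/* final recentering: with the ntrulpr761 modulus q = 4591, each value r in
   [0,1531) is mapped to 3*r - 2295, i.e. to the multiples of 3 in
   [-(q-1)/2, (q-1)/2] used by the rounded coefficients */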
R0[760] = 3 * R1[380] - 2295; | |||
s -= 380; | |||
i = 364; | |||
for (;;) { | |||
A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); | |||
S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); | |||
A0 = sub(mulhiconst(A0, 518), mulhiconst(mulloconst(A0, -10958), 1531)); /* -766...895 */ | |||
A0 = add(A0, S0); /* -766...1150 */ | |||
A0 = ifnegaddconst(A0, 1531); /* 0...1530 */ | |||
A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); | |||
A1 = mulloconst(A1, 15667); | |||
/* invalid inputs might need reduction mod 1531 */ | |||
A1 = ifgesubconst(A1, 1531); | |||
A0 = mulloconst(A0, 3); | |||
A1 = mulloconst(A1, 3); | |||
A0 = subconst(A0, 2295); | |||
A1 = subconst(A1, 2295); | |||
/* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ | |||
/* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ | |||
B0 = _mm256_unpacklo_epi16(A0, A1); | |||
B1 = _mm256_unpackhi_epi16(A0, A1); | |||
/* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ | |||
/* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ | |||
C0 = _mm256_permute2x128_si256(B0, B1, 0x20); | |||
C1 = _mm256_permute2x128_si256(B0, B1, 0x31); | |||
/* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ | |||
/* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ | |||
_mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); | |||
_mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); | |||
if (!i) { | |||
break; | |||
} | |||
i = -16 - ((~15) & -i); | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X1531_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X1531_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_STRBYTES 1007 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_ITEMS 761 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531_ITEMBYTES 2 | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,65 @@ | |||
#include "crypto_decode_761x3.h" | |||
#include <immintrin.h> | |||
#define uint8 uint8_t | |||
#define p 761 | |||
#define loops 6 | |||
#define overshoot 2 | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s) { | |||
uint8 *f = v; | |||
int loop; | |||
uint8 *nextf = f + 128 - 4 * overshoot; | |||
const unsigned char *nexts = s + 32 - overshoot; | |||
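/* each input byte carries four coefficients, two bits each (low bits first),
   so a 32-byte load yields 128 coefficients.  The first block overlaps the
   second by `overshoot` bytes so that the six blocks plus the final byte
   below cover exactly 761 coefficients, and the 2-bit digits {0,1,2} are
   shifted to {-1,0,1} by the add of -1. */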
for (loop = loops; loop > 0; --loop) { | |||
__m256i s0 = _mm256_loadu_si256((const __m256i *) s); | |||
s = nexts; | |||
nexts += 32; | |||
__m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); | |||
s0 &= _mm256_set1_epi8(15); | |||
__m256i a0 = _mm256_unpacklo_epi8(s0, s1); | |||
/* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ | |||
/* 16 16>>4 ... */ | |||
__m256i a1 = _mm256_unpackhi_epi8(s0, s1); | |||
/* 8 8>>4 9 9>>4 10 10>>4 ... */ | |||
/* 24 24>>4 ... */ | |||
__m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); | |||
__m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); | |||
a0 &= _mm256_set1_epi8(3); | |||
a1 &= _mm256_set1_epi8(3); | |||
__m256i b0 = _mm256_unpacklo_epi8(a0, a2); | |||
/* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ | |||
/* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>>6 */ | |||
/* 16 16>>2 16>>4 16>>6 ... */ | |||
__m256i b2 = _mm256_unpackhi_epi8(a0, a2); | |||
/* 4 4>>2 ... */ | |||
__m256i b1 = _mm256_unpacklo_epi8(a1, a3); | |||
/* 8 8>>2 ... */ | |||
__m256i b3 = _mm256_unpackhi_epi8(a1, a3); | |||
/* 12 12>>2 ... */ | |||
__m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); | |||
__m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); | |||
__m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); | |||
__m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); | |||
f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); | |||
f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); | |||
f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); | |||
f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); | |||
_mm256_storeu_si256((__m256i *) (f + 0), f0); | |||
_mm256_storeu_si256((__m256i *) (f + 32), f1); | |||
_mm256_storeu_si256((__m256i *) (f + 64), f2); | |||
_mm256_storeu_si256((__m256i *) (f + 96), f3); | |||
f = nextf; | |||
nextf += 128; | |||
} | |||
*f = ((uint8)(*s & 3)) - 1; | |||
} |
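/*
 * Illustrative scalar equivalent (a sketch for readers; the function name is
 * made up and this is not part of the package or its build): the vector code
 * above amounts to the following per-byte unpacking.
 */
static void decode_761x3_scalar_sketch(int8_t *f, const unsigned char *s) {
    int i;
    for (i = 0; i + 4 <= 760; i += 4) {
        unsigned char b = *s++;
        f[i + 0] = (int8_t) ((b & 3) - 1);         /* bits 1..0 */
        f[i + 1] = (int8_t) (((b >> 2) & 3) - 1);  /* bits 3..2 */
        f[i + 2] = (int8_t) (((b >> 4) & 3) - 1);  /* bits 5..4 */
        f[i + 3] = (int8_t) (((b >> 6) & 3) - 1);  /* bits 7..6 */
    }
    f[760] = (int8_t) ((*s & 3) - 1);              /* last coefficient */
}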
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X3_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761X3_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3_STRBYTES 191 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3_ITEMS 761 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3_ITEMBYTES 1 | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s); | |||
#endif |
@@ -0,0 +1,16 @@ | |||
#include "crypto_decode_761xint16.h" | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s) { | |||
uint16_t *x = v; | |||
int i; | |||
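/* each 16-bit item is read little-endian: low byte first, then high byte */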
for (i = 0; i < 761; ++i) { | |||
uint16_t u0 = s[0]; | |||
uint16_t u1 = s[1]; | |||
u1 <<= 8; | |||
*x = u0 | u1; | |||
x += 1; | |||
s += 2; | |||
} | |||
} |
@@ -0,0 +1,10 @@ | |||
#ifndef PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761XINT16_H | |||
#define PQCLEAN_NTRULPR761_AVX2_CRYPTO_DECODE_761XINT16_H | |||
#include <stdint.h> | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16_STRBYTES 1522 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16_ITEMBYTES 2 | |||
#define PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16_ITEMS 761 | |||
void PQCLEAN_NTRULPR761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s); | |||
#endif |