Browse Source

Update Kyber from upstream

Makes Kyber-AVX run on MacOS (#251)
kyber
Thom Wiggers 4 years ago
committed by Kris Kwiatkowski
parent
commit
c0f56ccdc2
100 changed files with 4736 additions and 3362 deletions
  1. +1
    -0
      crypto_kem/kyber1024-90s/META.yml
  2. +1
    -11
      crypto_kem/kyber1024-90s/avx2/LICENSE
  3. +35
    -7
      crypto_kem/kyber1024-90s/avx2/Makefile
  4. +57
    -88
      crypto_kem/kyber1024-90s/avx2/aes256ctr.c
  5. +7
    -12
      crypto_kem/kyber1024-90s/avx2/aes256ctr.h
  6. +22
    -0
      crypto_kem/kyber1024-90s/avx2/align.h
  7. +65
    -17
      crypto_kem/kyber1024-90s/avx2/basemul.S
  8. +7
    -7
      crypto_kem/kyber1024-90s/avx2/cbd.c
  9. +6
    -3
      crypto_kem/kyber1024-90s/avx2/cbd.h
  10. +30
    -0
      crypto_kem/kyber1024-90s/avx2/cdecl.inc
  11. +149
    -28
      crypto_kem/kyber1024-90s/avx2/consts.c
  12. +12
    -16
      crypto_kem/kyber1024-90s/avx2/consts.h
  13. +129
    -0
      crypto_kem/kyber1024-90s/avx2/fq.S
  14. +7
    -4
      crypto_kem/kyber1024-90s/avx2/fq.inc
  15. +165
    -112
      crypto_kem/kyber1024-90s/avx2/indcpa.c
  16. +9
    -14
      crypto_kem/kyber1024-90s/avx2/indcpa.h
  17. +39
    -31
      crypto_kem/kyber1024-90s/avx2/invntt.S
  18. +0
    -217
      crypto_kem/kyber1024-90s/avx2/invntt.s
  19. +68
    -44
      crypto_kem/kyber1024-90s/avx2/kem.c
  20. +19
    -0
      crypto_kem/kyber1024-90s/avx2/kem.h
  21. +220
    -0
      crypto_kem/kyber1024-90s/avx2/ntt.S
  22. +20
    -12
      crypto_kem/kyber1024-90s/avx2/ntt.h
  23. +10
    -10
      crypto_kem/kyber1024-90s/avx2/params.h
  24. +208
    -219
      crypto_kem/kyber1024-90s/avx2/poly.c
  25. +28
    -14
      crypto_kem/kyber1024-90s/avx2/poly.h
  26. +107
    -75
      crypto_kem/kyber1024-90s/avx2/polyvec.c
  27. +21
    -9
      crypto_kem/kyber1024-90s/avx2/polyvec.h
  28. +9
    -3
      crypto_kem/kyber1024-90s/avx2/reduce.h
  29. +325
    -351
      crypto_kem/kyber1024-90s/avx2/rejsample.c
  30. +4
    -5
      crypto_kem/kyber1024-90s/avx2/rejsample.h
  31. +255
    -0
      crypto_kem/kyber1024-90s/avx2/shuffle.S
  32. +2
    -0
      crypto_kem/kyber1024-90s/avx2/shuffle.inc
  33. +14
    -10
      crypto_kem/kyber1024-90s/avx2/symmetric.h
  34. +20
    -21
      crypto_kem/kyber1024-90s/avx2/verify.c
  35. +6
    -3
      crypto_kem/kyber1024-90s/avx2/verify.h
  36. +1
    -11
      crypto_kem/kyber1024-90s/clean/LICENSE
  37. +23
    -2
      crypto_kem/kyber1024-90s/clean/Makefile
  38. +1
    -1
      crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake
  39. +13
    -15
      crypto_kem/kyber1024-90s/clean/cbd.c
  40. +6
    -3
      crypto_kem/kyber1024-90s/clean/cbd.h
  41. +118
    -83
      crypto_kem/kyber1024-90s/clean/indcpa.c
  42. +9
    -14
      crypto_kem/kyber1024-90s/clean/indcpa.h
  43. +60
    -34
      crypto_kem/kyber1024-90s/clean/kem.c
  44. +19
    -0
      crypto_kem/kyber1024-90s/clean/kem.h
  45. +61
    -58
      crypto_kem/kyber1024-90s/clean/ntt.c
  46. +14
    -5
      crypto_kem/kyber1024-90s/clean/ntt.h
  47. +10
    -10
      crypto_kem/kyber1024-90s/clean/params.h
  48. +146
    -132
      crypto_kem/kyber1024-90s/clean/poly.c
  49. +28
    -13
      crypto_kem/kyber1024-90s/clean/poly.h
  50. +100
    -71
      crypto_kem/kyber1024-90s/clean/polyvec.c
  51. +21
    -9
      crypto_kem/kyber1024-90s/clean/polyvec.h
  52. +15
    -16
      crypto_kem/kyber1024-90s/clean/reduce.c
  53. +8
    -4
      crypto_kem/kyber1024-90s/clean/reduce.h
  54. +2
    -3
      crypto_kem/kyber1024-90s/clean/symmetric-aes.c
  55. +0
    -0
      crypto_kem/kyber1024-90s/clean/symmetric-aes.h
  56. +8
    -6
      crypto_kem/kyber1024-90s/clean/symmetric.h
  57. +10
    -13
      crypto_kem/kyber1024-90s/clean/verify.c
  58. +6
    -3
      crypto_kem/kyber1024-90s/clean/verify.h
  59. +1
    -0
      crypto_kem/kyber1024/META.yml
  60. +1
    -11
      crypto_kem/kyber1024/avx2/LICENSE
  61. +40
    -8
      crypto_kem/kyber1024/avx2/Makefile
  62. +22
    -0
      crypto_kem/kyber1024/avx2/align.h
  63. +65
    -17
      crypto_kem/kyber1024/avx2/basemul.S
  64. +7
    -7
      crypto_kem/kyber1024/avx2/cbd.c
  65. +6
    -3
      crypto_kem/kyber1024/avx2/cbd.h
  66. +30
    -0
      crypto_kem/kyber1024/avx2/cdecl.inc
  67. +149
    -28
      crypto_kem/kyber1024/avx2/consts.c
  68. +12
    -16
      crypto_kem/kyber1024/avx2/consts.h
  69. +140
    -181
      crypto_kem/kyber1024/avx2/fips202x4.c
  70. +14
    -26
      crypto_kem/kyber1024/avx2/fips202x4.h
  71. +55
    -38
      crypto_kem/kyber1024/avx2/fq.S
  72. +7
    -4
      crypto_kem/kyber1024/avx2/fq.inc
  73. +174
    -106
      crypto_kem/kyber1024/avx2/indcpa.c
  74. +9
    -14
      crypto_kem/kyber1024/avx2/indcpa.h
  75. +39
    -31
      crypto_kem/kyber1024/avx2/invntt.S
  76. +68
    -44
      crypto_kem/kyber1024/avx2/kem.c
  77. +19
    -0
      crypto_kem/kyber1024/avx2/kem.h
  78. +38
    -27
      crypto_kem/kyber1024/avx2/ntt.S
  79. +20
    -12
      crypto_kem/kyber1024/avx2/ntt.h
  80. +10
    -10
      crypto_kem/kyber1024/avx2/params.h
  81. +231
    -229
      crypto_kem/kyber1024/avx2/poly.c
  82. +29
    -14
      crypto_kem/kyber1024/avx2/poly.h
  83. +107
    -75
      crypto_kem/kyber1024/avx2/polyvec.c
  84. +21
    -9
      crypto_kem/kyber1024/avx2/polyvec.h
  85. +9
    -3
      crypto_kem/kyber1024/avx2/reduce.h
  86. +325
    -351
      crypto_kem/kyber1024/avx2/rejsample.c
  87. +4
    -5
      crypto_kem/kyber1024/avx2/rejsample.h
  88. +80
    -31
      crypto_kem/kyber1024/avx2/shuffle.S
  89. +2
    -0
      crypto_kem/kyber1024/avx2/shuffle.inc
  90. +0
    -63
      crypto_kem/kyber1024/avx2/symmetric-fips202.c
  91. +60
    -0
      crypto_kem/kyber1024/avx2/symmetric-shake.c
  92. +19
    -11
      crypto_kem/kyber1024/avx2/symmetric.h
  93. +20
    -21
      crypto_kem/kyber1024/avx2/verify.c
  94. +6
    -3
      crypto_kem/kyber1024/avx2/verify.h
  95. +1
    -11
      crypto_kem/kyber1024/clean/LICENSE
  96. +2
    -2
      crypto_kem/kyber1024/clean/Makefile
  97. +1
    -1
      crypto_kem/kyber1024/clean/Makefile.Microsoft_nmake
  98. +13
    -15
      crypto_kem/kyber1024/clean/cbd.c
  99. +6
    -3
      crypto_kem/kyber1024/clean/cbd.h
  100. +118
    -83
      crypto_kem/kyber1024/clean/indcpa.c

+ 1
- 0
crypto_kem/kyber1024-90s/META.yml View File

@@ -28,6 +28,7 @@ implementations:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- aes
- avx2


+ 1
- 11
crypto_kem/kyber1024-90s/avx2/LICENSE View File

@@ -1,14 +1,4 @@
kyber-20170627
Public Domain
Authors: Joppe Bos,
Léo Ducas,
Eike Kiltz ,
Tancrède Lepoint,
Vadim Lyubashevsky,
John Schanck,
Peter Schwabe,
Gregor Seiler,
Damien Stehlé
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in


+ 35
- 7
crypto_kem/kyber1024-90s/avx2/Makefile View File

@@ -1,9 +1,40 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libkyber1024-90s_avx2.a
HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h aes256ctr.h
OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \
verify.o indcpa.o rejsample.o aes256ctr.o
HEADERS= \
aes256ctr.h \
align.h \
api.h \
cbd.h \
cdecl.inc \
consts.h \
fq.inc \
indcpa.h \
kem.h \
ntt.h \
params.h \
poly.h \
polyvec.h \
reduce.h \
rejsample.h \
shuffle.inc \
symmetric.h \
verify.h
OBJECTS= \
aes256ctr.o \
basemul.o \
cbd.o \
consts.o \
fq.o \
indcpa.o \
invntt.o \
kem.o \
ntt.o \
poly.o \
polyvec.o \
rejsample.o \
shuffle.o \
verify.o

CFLAGS=-mavx2 -maes -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \
-Wmissing-prototypes -Wredundant-decls -std=c99 \
@@ -14,11 +45,8 @@ all: $(LIB)
%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.s $(HEADERS)
$(AS) -o $@ $<

%.o: %.S $(HEADERS)
$(AS) -c -o $@ $<
$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS)
$(AR) -r $@ $(OBJECTS)


+ 57
- 88
crypto_kem/kyber1024-90s/avx2/aes256ctr.c View File

@@ -1,94 +1,68 @@
/*
crypto_stream_aes256ctr
based heavily on public-domain code by Romain Dolbeau
Based heavily on public-domain code by Romain Dolbeau
Different handling of nonce+counter than original version
using separated 96-bit nonce and internal 32-bit counter, starting from zero
using separated 64-bit nonce and internal 64-bit counter, starting from zero
Public Domain
*/

#include "aes256ctr.h"

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static inline void aesni_encrypt8(uint8_t *out,
static inline void aesni_encrypt4(uint8_t out[64],
__m128i *n,
const __m128i rkeys[16]) {
__m128i nv0;
__m128i nv1;
__m128i nv2;
__m128i nv3;
__m128i nv4;
__m128i nv5;
__m128i nv6;
__m128i nv7;
__m128i f, f0, f1, f2, f3, t;

/* Load current counter value */
__m128i nv0i = _mm_load_si128(n);

/* Increase counter in 8 consecutive blocks */
nv0 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(0, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv1 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(1, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv2 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(2, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv3 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(3, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv4 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(4, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv5 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(5, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv6 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(6, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
nv7 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(7, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));

/* Write counter for next iteration, increased by 8 */
_mm_store_si128(n, _mm_add_epi32(nv0i, _mm_set_epi64x(8, 0)));

/* Actual AES encryption, 8x interleaved */
__m128i temp0 = _mm_xor_si128(nv0, rkeys[0]);
__m128i temp1 = _mm_xor_si128(nv1, rkeys[0]);
__m128i temp2 = _mm_xor_si128(nv2, rkeys[0]);
__m128i temp3 = _mm_xor_si128(nv3, rkeys[0]);
__m128i temp4 = _mm_xor_si128(nv4, rkeys[0]);
__m128i temp5 = _mm_xor_si128(nv5, rkeys[0]);
__m128i temp6 = _mm_xor_si128(nv6, rkeys[0]);
__m128i temp7 = _mm_xor_si128(nv7, rkeys[0]);

for (uint8_t i = 1; i < 14; i++) {
temp0 = _mm_aesenc_si128(temp0, rkeys[i]);
temp1 = _mm_aesenc_si128(temp1, rkeys[i]);
temp2 = _mm_aesenc_si128(temp2, rkeys[i]);
temp3 = _mm_aesenc_si128(temp3, rkeys[i]);
temp4 = _mm_aesenc_si128(temp4, rkeys[i]);
temp5 = _mm_aesenc_si128(temp5, rkeys[i]);
temp6 = _mm_aesenc_si128(temp6, rkeys[i]);
temp7 = _mm_aesenc_si128(temp7, rkeys[i]);
f = _mm_load_si128(n);

/* Increase counter in 4 consecutive blocks */
t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t);
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t);
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t);
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t);

/* Write counter for next iteration, increased by 4 */
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0)));

/* Actual AES encryption, 4x interleaved */
t = _mm_load_si128(&rkeys[0]);
f0 = _mm_xor_si128(f0, t);
f1 = _mm_xor_si128(f1, t);
f2 = _mm_xor_si128(f2, t);
f3 = _mm_xor_si128(f3, t);

for (int i = 1; i < 14; i++) {
t = _mm_load_si128(&rkeys[i]);
f0 = _mm_aesenc_si128(f0, t);
f1 = _mm_aesenc_si128(f1, t);
f2 = _mm_aesenc_si128(f2, t);
f3 = _mm_aesenc_si128(f3, t);
}

temp0 = _mm_aesenclast_si128(temp0, rkeys[14]);
temp1 = _mm_aesenclast_si128(temp1, rkeys[14]);
temp2 = _mm_aesenclast_si128(temp2, rkeys[14]);
temp3 = _mm_aesenclast_si128(temp3, rkeys[14]);
temp4 = _mm_aesenclast_si128(temp4, rkeys[14]);
temp5 = _mm_aesenclast_si128(temp5, rkeys[14]);
temp6 = _mm_aesenclast_si128(temp6, rkeys[14]);
temp7 = _mm_aesenclast_si128(temp7, rkeys[14]);
t = _mm_load_si128(&rkeys[14]);
f0 = _mm_aesenclast_si128(f0, t);
f1 = _mm_aesenclast_si128(f1, t);
f2 = _mm_aesenclast_si128(f2, t);
f3 = _mm_aesenclast_si128(f3, t);

/* Write results */
_mm_storeu_si128((__m128i *)(out + 0), temp0);
_mm_storeu_si128((__m128i *)(out + 16), temp1);
_mm_storeu_si128((__m128i *)(out + 32), temp2);
_mm_storeu_si128((__m128i *)(out + 48), temp3);
_mm_storeu_si128((__m128i *)(out + 64), temp4);
_mm_storeu_si128((__m128i *)(out + 80), temp5);
_mm_storeu_si128((__m128i *)(out + 96), temp6);
_mm_storeu_si128((__m128i *)(out + 112), temp7);
_mm_storeu_si128((__m128i *)(out + 0), f0);
_mm_storeu_si128((__m128i *)(out + 16), f1);
_mm_storeu_si128((__m128i *)(out + 32), f2);
_mm_storeu_si128((__m128i *)(out + 48), f3);
}

void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state,
const uint8_t *key,
uint16_t nonce) {
__m128i key0 = _mm_loadu_si128((__m128i *)(key + 0));
__m128i key1 = _mm_loadu_si128((__m128i *)(key + 16));
__m128i temp0, temp1, temp2, temp4;
size_t idx = 0;
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) {
__m128i key0, key1, temp0, temp1, temp2, temp4;
int idx = 0;

state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48);
key0 = _mm_loadu_si128((__m128i *)(key + 0));
key1 = _mm_loadu_si128((__m128i *)(key + 16));
state->n = _mm_loadl_epi64((__m128i *)&nonce);

state->rkeys[idx++] = key0;
temp0 = key0;
@@ -137,38 +111,33 @@ void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state,
state->rkeys[idx++] = temp0;
}

void PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce) {
state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48);
}

void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out,
size_t nblocks,
aes256ctr_ctx *state) {
size_t i;

size_t i = 0;
for (i = 0; i < nblocks; i++) {
aesni_encrypt8(out, &state->n, state->rkeys);
out += 128;
aesni_encrypt4(out, &state->n, state->rkeys);
out += 64;
}
}

void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out,
size_t outlen,
const uint8_t *seed,
uint8_t nonce) {
size_t i;
uint8_t buf[128];
const uint8_t seed[32],
uint64_t nonce) {
unsigned int i = 0;
uint8_t buf[64];
aes256ctr_ctx state;

PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, (uint16_t)nonce << 8);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, nonce);

while (outlen >= 128) {
aesni_encrypt8(out, &state.n, state.rkeys);
outlen -= 128;
while (outlen >= 64) {
aesni_encrypt4(out, &state.n, state.rkeys);
outlen -= 64;
}

if (outlen) {
aesni_encrypt8(buf, &state.n, state.rkeys);
aesni_encrypt4(buf, &state.n, state.rkeys);
for (i = 0; i < outlen; i++) {
out[i] = buf[i];
}


+ 7
- 12
crypto_kem/kyber1024-90s/avx2/aes256ctr.h View File

@@ -5,22 +5,17 @@
#include <stddef.h>
#include <stdint.h>

#define AES256CTR_NAMESPACE(s) pqcrystals_aes256ctr_avx2##s

#define AES256CTR_BLOCKBYTES 64

typedef struct {
__m128i rkeys[16];
__m128i n;
} aes256ctr_ctx;

void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state,
const uint8_t *key,
uint16_t nonce);
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce);
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out,
size_t nblocks,
aes256ctr_ctx *state);

void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out,
size_t outlen,
const uint8_t *seed,
uint8_t nonce);
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce);
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state);
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t seed[32], uint64_t nonce);

#endif

+ 22
- 0
crypto_kem/kyber1024-90s/avx2/align.h View File

@@ -0,0 +1,22 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_ALIGN_H
#define PQCLEAN_KYBER102490S_AVX2_ALIGN_H
#include <immintrin.h>

/*
 * ALIGN16_TYPE(t): anonymous union type that overlays one object of type t
 * (member `orig`) with an __m128i (member `vec`).
 *
 * A union is aligned at least as strictly as its most strictly aligned
 * member, so `orig` inherits the alignment of __m128i (16 bytes with the
 * x86 compilers this code targets, hence the macro name).
 */
#define ALIGN16_TYPE(t) \
union { \
__m128i vec; \
t orig; \
}

/*
 * ALIGN32_ARRAY(t, s): anonymous union type holding an array `arr` of s
 * elements of type t, overlaid with a single __m256i (member `vec`).
 *
 * The `vec` member gives the whole union, and therefore `arr`, the
 * alignment of __m256i (32 bytes with the x86 compilers this code
 * targets, hence the macro name).
 */
#define ALIGN32_ARRAY(t, s) \
union { \
__m256i vec; \
t arr[(s)]; \
}

/*
 * ALIGN32_ARRAY_2D(t, n, m): two-dimensional counterpart of ALIGN32_ARRAY.
 * Anonymous union type holding `arr`, an n-by-m array of type t, overlaid
 * with a single __m256i (member `vec`) so that `arr` starts on an
 * __m256i-aligned address.
 *
 * NOTE(review): only the start of `arr` is aligned by the union; each row
 * arr[i] is aligned as well only when m * sizeof(t) is a multiple of the
 * __m256i alignment - confirm at the use sites.
 */
#define ALIGN32_ARRAY_2D(t, n, m) \
union { \
__m256i vec; \
t arr[(n)][(m)]; \
}
#endif

+ 65
- 17
crypto_kem/kyber1024-90s/avx2/basemul.S View File

@@ -1,4 +1,5 @@
#include "params.h"
#include "cdecl.inc"

.macro schoolbook off,sign
#load
@@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0
vpaddd %ymm7,%ymm8,%ymm7 # y1
.endm

.macro red a0,a1,b0,b1 x,y,z
.macro red a0,a1,b0,b1,x,y,z
#pack
vpxor %ymm\x,%ymm\x,%ymm\x
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y
@@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0
vpsubw %ymm\y,%ymm\b0,%ymm\b0
.endm

.global PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx:
#consts
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xqinv(%rip),%ymm1
vmovdqu (%rcx),%ymm2

.text
basemul64_acc_avx:
poly0.0:
schoolbook 0,0

@@ -117,7 +113,7 @@ vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6 7,8,9
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,(%rdi)
@@ -160,7 +156,7 @@ vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6 7,8,9
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,64(%rdi)
@@ -168,17 +164,40 @@ vmovdqa %ymm5,96(%rdi)

ret

.global PQCLEAN_KYBER102490S_AVX2_basemul_avx
PQCLEAN_KYBER102490S_AVX2_basemul_avx:
.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx):
#consts
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xqinv(%rip),%ymm1
vmovdqu (%rcx),%ymm2
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

ret

basemul64_avx:
schoolbook 0,0

#reduce
red 14,9,12,7 8,10,11
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,(%rdi)
@@ -187,10 +206,39 @@ vmovdqa %ymm12,32(%rdi)
schoolbook 64,1

#reduce
red 14,9,12,7 8,10,11
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,64(%rdi)
vmovdqa %ymm12,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

ret

+ 7
- 7
crypto_kem/kyber1024-90s/avx2/cbd.c View File

@@ -1,27 +1,27 @@
#include "cbd.h"
#include "params.h"
#include "cbd.h"
#include <immintrin.h>
#include <stdint.h>

/*************************************************
* Name: cbd
* Name: PQCLEAN_KYBER102490S_AVX2_cbd
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
* - const unsigned char *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t *buf) {
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) {
unsigned int i = 0;
__m256i vec0, vec1, vec2, vec3, tmp;
const __m256i mask55 = _mm256_set1_epi32(0x55555555);
const __m256i mask33 = _mm256_set1_epi32(0x33333333);
const __m256i mask03 = _mm256_set1_epi32(0x03030303);

for (size_t i = 0; i < KYBER_N / 64; i++) {
vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]);
for (i = 0; i < KYBER_N / 64; i++) {
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]);

vec1 = _mm256_srli_epi32(vec0, 1);
vec0 = _mm256_and_si256(mask55, vec0);


+ 6
- 3
crypto_kem/kyber1024-90s/avx2/cbd.h View File

@@ -1,8 +1,11 @@
#ifndef CBD_H
#define CBD_H
#ifndef PQCLEAN_KYBER102490S_AVX2_CBD_H
#define PQCLEAN_KYBER102490S_AVX2_CBD_H

#include "params.h"
#include "poly.h"
#include <stdint.h>

void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t *buf);

void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);

#endif

+ 30
- 0
crypto_kem/kyber1024-90s/avx2/cdecl.inc View File

@@ -0,0 +1,30 @@
/*
 * Preprocessor definitions shared by the assembly sources of this
 * implementation: offsets into the constants table and the cdecl()
 * symbol-decoration helper.
 *
 * The include guard carries this implementation's own namespace so it
 * cannot collide with the cdecl header of another scheme.
 */
#ifndef PQCLEAN_KYBER102490S_AVX2_CDECL
#define PQCLEAN_KYBER102490S_AVX2_CDECL

/*
 * Offsets into the constants table, counted in 16-bit elements. The
 * assembly multiplies them by 2 to form byte displacements, e.g.
 * _16XQ*2(%rcx) in basemul.S. They must stay in sync with the layout
 * of the table defined in consts.c.
 */
#define _16XQ 0
#define _16XQINV 16
#define _16XV 32
#define _16XFLO 48
#define _16XFHI 64
#define _16XMONTSQLO 80
#define _16XMONTSQHI 96
#define _16XMASK 112
#define _ZETAS_EXP 128
#define _ZETAS_INV_EXP 528


/* The C ABI on macOS exports all symbols with a leading
 * underscore, while assembly labels are emitted verbatim.
 * Without decoration, functions defined in assembly cannot be
 * found from C, and symbols defined in C (such as the tables
 * in consts.c) cannot be found from assembly.
 *
 * cdecl(s) prepends the underscore where it is required; the
 * same decoration is applied when __WIN32__ is defined.
 * Elsewhere it expands to s unchanged.
 */

#if defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif

+ 149
- 28
crypto_kem/kyber1024-90s/avx2/consts.c View File

@@ -1,34 +1,155 @@
#include "consts.h"
#include "params.h"

const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 
14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628};
const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 
48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932};
#include "consts.h"
#include <stdint.h>

#define Q KYBER_Q
#define MONT ((1U << 16) % KYBER_Q)
#define MONT ((1U << 16) % Q)
#define QINV 62209 // q^-1 mod 2^16
#define V ((1U << 26)/KYBER_Q + 1)
#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q)
#define FLO (FHI * QINV % 65536)
#define MONTSQHI (MONT * MONT % KYBER_Q)
#define MONTSQLO (MONTSQHI * QINV % 65536)
#define V (((1U << 26) + Q/2)/Q)
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q)
#define FLO (FHI*QINV % 65536)
#define MONTSQHI (MONT*MONT % Q)
#define MONTSQLO (MONTSQHI*QINV % 65536)
#define MASK 4095

const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}};
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}};

#undef Q
#undef QINV
#undef MONT
#undef V
#undef FLO
#undef FHI
#undef MONTSQLO
#undef MONTSQHI
#undef MASK

const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.as_arr = {
#define _16XQ 0
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,

#define _16XQINV 16
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

#define _16XV 32
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,

#define _16XFLO 48
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,

#define _16XFHI 64
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,

#define _16XMONTSQLO 80
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,

#define _16XMONTSQHI 96
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,

#define _16XMASK 112
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,

#define _ZETAS_EXP 128
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970,
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134,
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758,
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846,
3158, 3158, 3158, 3158, 622, 622, 622, 622,
1577, 1577, 1577, 1577, 182, 182, 182, 182,
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479,
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295,
573, 573, 2004, 2004, 264, 264, 383, 383,
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199,
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081,
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837,
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785,
516, 3321, 3009, 2663, 1711, 2167, 126, 1469,
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182,
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261,
2226, 555, 2078, 1550, 422, 177, 3038, 1574,
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173,
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493,
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918,
430, 843, 871, 105, 587, 3094, 2869, 1653,
778, 3182, 1483, 1119, 644, 349, 329, 3254,
788, 788, 1812, 1812, 28191, 28191, 28191, 28191,
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842,
48842, 48842, 48842, 48842, 287, 287, 287, 287,
287, 287, 287, 287, 202, 202, 202, 202,
202, 202, 202, 202, 10690, 10690, 10690, 10690,
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335,
31164, 31164, 31164, 31164, 962, 962, 962, 962,
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855,
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313,
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859,
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017,
732, 732, 608, 608, 1787, 1787, 411, 411,
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638,
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780,
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604,
448, 2264, 677, 2054, 34353, 25435, 58154, 24392,
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907,
31637, 28644, 23998, 48114, 817, 603, 1322, 1864,
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459,
3221, 996, 958, 1522, 20297, 2146, 15356, 33152,
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094,
41677, 45279, 7757, 23132, 1097, 610, 2044, 384,
3193, 1994, 220, 1670, 1799, 794, 2475, 478,
3021, 991, 1869, 1628, 0, 0, 0, 0,

#define _ZETAS_INV_EXP 528
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498,
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240,
1701, 1460, 2338, 308, 2851, 854, 2535, 1530,
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232,
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201,
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184,
1807, 2371, 2333, 108, 870, 1510, 1278, 1185,
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512,
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110,
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653,
1275, 2652, 1065, 2881, 725, 1508, 2368, 398,
951, 247, 1421, 3222, 2499, 271, 90, 853,
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110,
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073,
1571, 1571, 205, 205, 2918, 2918, 1542, 1542,
2721, 2721, 2597, 2597, 2312, 2312, 681, 681,
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202,
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847,
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474,
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367,
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695,
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346,
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127,
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042,
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437,
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406,
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685,
2210, 1846, 147, 2551, 1676, 460, 235, 2742,
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486,
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739,
45043, 32227, 11478, 335, 156, 2911, 872, 1590,
602, 777, 2170, 246, 1755, 291, 3152, 2907,
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402,
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565,
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618,
666, 320, 8, 2813, 1544, 282, 1838, 1293,
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098,
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361,
48173, 48173, 5828, 5828, 130, 130, 1602, 1602,
1871, 1871, 829, 829, 2946, 2946, 3065, 3065,
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691,
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779,
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147,
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707,
171, 171, 171, 171, 12403, 12403, 12403, 12403,
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012,
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907,
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836,
1836, 1836, 1836, 1836, 50791, 50791, 359, 359,
60300, 60300, 1932, 1932, 0, 0, 0, 0
}
};

+ 12
- 16
crypto_kem/kyber1024-90s/avx2/consts.h View File

@@ -1,24 +1,20 @@
#ifndef CONSTS_H
#define CONSTS_H
#ifndef PQCLEAN_KYBER102490S_AVX2_CONSTS_H
#define PQCLEAN_KYBER102490S_AVX2_CONSTS_H

#include "cdecl.inc"

#include "params.h"
#include <immintrin.h>
#include <stdint.h>

typedef union {
uint16_t as_arr[16];
__m256i as_vec;
} aligned_uint16_t;
#define ALIGNED_UINT16_T(N) \
union { \
__m256i as_vec; \
uint16_t as_arr[(N)]; \
}

extern const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_exp[396];
extern const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp[396];
typedef ALIGNED_UINT16_T(928) qdata_t;

extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xq;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xqinv;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xv;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xflo;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xfhi;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqlo;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqhi;
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmask;
extern const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata;

#endif

+ 129
- 0
crypto_kem/kyber1024-90s/avx2/fq.S View File

@@ -0,0 +1,129 @@
#include "cdecl.inc"
.include "fq.inc"

.text
# reduce128_avx: file-local helper (no .global directive) that applies the
# red16 macro from fq.inc to 128 consecutive 16-bit words at (%rdi), in place.
# The red16 macro reads %ymm0 and %ymm1, so the caller must load both
# constants before calling. Per word it takes the high half of word*%ymm1,
# shifts it right arithmetically by 10, multiplies by %ymm0 and subtracts
# the product from the word.
# Every access is vmovdqa, so %rdi must be 32-byte aligned.
# Overwrites %ymm2-%ymm15; %rdi is left unchanged.
reduce128_avx:
#load
vmovdqa (%rdi),%ymm2
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm4
vmovdqa 96(%rdi),%ymm5
vmovdqa 128(%rdi),%ymm6
vmovdqa 160(%rdi),%ymm7
vmovdqa 192(%rdi),%ymm8
vmovdqa 224(%rdi),%ymm9

# first operand is the data register, second is the scratch register
red16 2,10
red16 3,11
red16 4,12
red16 5,13
red16 6,14
red16 7,15
# scratch registers %ymm10 and %ymm11 are reused for the last two data registers
red16 8,10
red16 9,11

#store
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm4,64(%rdi)
vmovdqa %ymm5,96(%rdi)
vmovdqa %ymm6,128(%rdi)
vmovdqa %ymm7,160(%rdi)
vmovdqa %ymm8,192(%rdi)
vmovdqa %ymm9,224(%rdi)

ret

# PQCLEAN_KYBER102490S_AVX2_reduce_avx: exported entry point.
# Runs reduce128_avx over two adjacent 256-byte halves, i.e. 256 16-bit
# words starting at %rdi, in place.
#   %rdi: 32-byte aligned data buffer (512 bytes are read and written)
#   %rsi: base of the 16-bit constant table; _16XQ and _16XV are word
#         indices into it, hence the multiplication by 2 to get bytes
# NOTE(review): presumably %rsi points at PQCLEAN_KYBER102490S_AVX2_qdata;
# confirm against the C caller.
# On return %rdi has been advanced by 256 bytes.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XV*2(%rsi),%ymm1
# first 128 words
call reduce128_avx
# second 128 words
add $256,%rdi
call reduce128_avx
ret

# csubq128_avx: file-local helper (no .global directive) that applies the
# csubq macro from fq.inc to 128 consecutive 16-bit words at (%rdi), in place.
# The csubq macro subtracts %ymm0 from each word, builds a sign mask with an
# arithmetic shift by 15, and adds %ymm0 back to the words that went
# negative. The caller must load %ymm0 before calling.
# Every access is vmovdqa, so %rdi must be 32-byte aligned.
# Overwrites %ymm1-%ymm15; %rdi is left unchanged.
csubq128_avx:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm2
vmovdqa 64(%rdi),%ymm3
vmovdqa 96(%rdi),%ymm4
vmovdqa 128(%rdi),%ymm5
vmovdqa 160(%rdi),%ymm6
vmovdqa 192(%rdi),%ymm7
vmovdqa 224(%rdi),%ymm8

# first operand is the data register, second is the scratch register
csubq 1,9
csubq 2,10
csubq 3,11
csubq 4,12
csubq 5,13
csubq 6,14
csubq 7,15
# scratch register %ymm9 is reused for the last data register
csubq 8,9

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm2,32(%rdi)
vmovdqa %ymm3,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm6,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm8,224(%rdi)

ret

# PQCLEAN_KYBER102490S_AVX2_csubq_avx: exported entry point.
# Runs csubq128_avx over two adjacent 256-byte halves, i.e. 256 16-bit
# words starting at %rdi, in place.
#   %rdi: 32-byte aligned data buffer (512 bytes are read and written)
#   %rsi: base of the 16-bit constant table; _16XQ is a word index into it,
#         hence the multiplication by 2 to get bytes
# On return %rdi has been advanced by 256 bytes.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
# first 128 words
call csubq128_avx
# second 128 words
add $256,%rdi
call csubq128_avx
ret

# tomont128_avx: file-local helper (no .global directive) that runs the
# fqmulprecomp macro from fq.inc on 128 consecutive 16-bit words at (%rdi),
# in place. The macro is invoked with %ymm1 as the low-product multiplier
# and %ymm2 as the high-product multiplier, and it also reads %ymm0; the
# caller must load all three before calling.
# NOTE(review): the full body of fqmulprecomp lives in fq.inc; presumably it
# is a modular multiplication by a precomputed constant - confirm there.
# Every access is vmovdqa, so %rdi must be 32-byte aligned.
# Overwrites %ymm3-%ymm15; %rdi is left unchanged.
tomont128_avx:
#load
vmovdqa (%rdi),%ymm3
vmovdqa 32(%rdi),%ymm4
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm6
vmovdqa 128(%rdi),%ymm7
vmovdqa 160(%rdi),%ymm8
vmovdqa 192(%rdi),%ymm9
vmovdqa 224(%rdi),%ymm10

# operands: low multiplier, high multiplier, data register, scratch register
fqmulprecomp 1,2,3,11
fqmulprecomp 1,2,4,12
fqmulprecomp 1,2,5,13
fqmulprecomp 1,2,6,14
fqmulprecomp 1,2,7,15
# scratch registers %ymm11-%ymm13 are reused for the last three data registers
fqmulprecomp 1,2,8,11
fqmulprecomp 1,2,9,12
fqmulprecomp 1,2,10,13

#store
vmovdqa %ymm3,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm7,128(%rdi)
vmovdqa %ymm8,160(%rdi)
vmovdqa %ymm9,192(%rdi)
vmovdqa %ymm10,224(%rdi)

ret

# PQCLEAN_KYBER102490S_AVX2_tomont_avx: exported entry point.
# Runs tomont128_avx over two adjacent 256-byte halves, i.e. 256 16-bit
# words starting at %rdi, in place.
#   %rdi: 32-byte aligned data buffer (512 bytes are read and written)
#   %rsi: base of the 16-bit constant table; _16XQ, _16XMONTSQLO and
#         _16XMONTSQHI are word indices into it, hence the multiplication
#         by 2 to get bytes
# On return %rdi has been advanced by 256 bytes.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
vmovdqa _16XMONTSQHI*2(%rsi),%ymm2
# first 128 words
call tomont128_avx
# second 128 words
add $256,%rdi
call tomont128_avx
ret

+ 7
- 4
crypto_kem/kyber1024-90s/avx2/fq.inc View File

@@ -1,24 +1,27 @@
.macro red16 r x=12
.macro red16 r,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
vpsraw $10,%ymm\x,%ymm\x
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm

.macro csubq r x=12
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
#vpcmpgtw %ymm0,%ymm\r,%ymm\x
#vpand %ymm0,%ymm\x,%ymm\x
#vpsubw %ymm\x,%ymm\r,%ymm\r
.endm

.macro caddq r x=12
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm

.macro fqmulprecomp al,ah,b x=12
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x


+ 165
- 112
crypto_kem/kyber1024-90s/avx2/indcpa.c View File

@@ -1,26 +1,33 @@
#include "align.h"
#include "cbd.h"
#include "indcpa.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "rejsample.h"
#include "symmetric.h"
#include <stddef.h>
#include <stdint.h>

/*************************************************
* Name: pack_pk
*
* Description: Serialize the public key as concatenation of the
* compressed and serialized vector of polynomials pk
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* const poly *pk: pointer to the input public-key polynomial
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
polyvec *pk,
const uint8_t seed[KYBER_SYMBYTES]) {
size_t i = 0;
PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(r, pk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
r[i + KYBER_POLYVECBYTES] = seed[i];
}
}
@@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
/*************************************************
* Name: unpack_pk
*
* Description: De-serialize and decompress public key from a byte array;
* Description: De-serialize public key from a byte array;
* approximate inverse of pack_pk
*
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials
* - uint8_t *seed: pointer to output seed to generate matrix A
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector
* - uint8_t *seed: pointer to output seed to generate matrix A
* - const uint8_t *packedpk: pointer to input serialized public key
**************************************************/
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
static void unpack_pk(polyvec *pk,
uint8_t seed[KYBER_SYMBYTES],
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) {
size_t i = 0;
PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(pk, packedpk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
seed[i] = packedpk[i + KYBER_POLYVECBYTES];
}
}
@@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - const polyvec *sk: pointer to input vector of polynomials (secret key)
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t *r, polyvec *sk) {
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(r, sk);
}

@@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) {
* Description: De-serialize the secret key;
* inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* Arguments: - polyvec *sk: pointer to output vector of polynomials
* (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(sk, packedsk);
}

@@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* const poly *pk: pointer to the input vector of polynomials b
* const uint8_t *seed: pointer to the input polynomial v
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* polyvec *b: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
PQCLEAN_KYBER102490S_AVX2_polyvec_compress(r, b);
PQCLEAN_KYBER102490S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) {
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(b, c);
PQCLEAN_KYBER102490S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}

static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) {
unsigned int ctr, pos;
uint16_t val;
/*************************************************
* Name: rej_uniform
*
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
**************************************************/
static unsigned int rej_uniform(int16_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr = 0, pos = 0;
uint16_t val = 0;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;

if (val < 19 * KYBER_Q) {
@@ -116,46 +150,47 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t
return ctr;
}

#define gen_a(A,B) gen_matrix(A,B,0)
#define gen_at(A,B) gen_matrix(A,B,1)
#define gen_a(A,B) PQCLEAN_KYBER102490S_AVX2_gen_matrix(A,B,0)
#define gen_at(A,B) PQCLEAN_KYBER102490S_AVX2_gen_matrix(A,B,1)

/*************************************************
* Name: gen_matrix
* Name: PQCLEAN_KYBER102490S_AVX2_gen_matrix
*
* Description: Deterministically generate matrix A (or the transpose of A)
* from a seed. Entries of the matrix are polynomials that look
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - polyvec *a: pointer to output matrix A
* Arguments: - polyvec *a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
* - int transposed: boolean deciding whether A or A^T is generated
**************************************************/
#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
size_t ctr;
union {
uint8_t x[XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS];
__m256i _dummy;
} buf;
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) {
unsigned int ctr = 0, i = 0, j = 0;
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0};
ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf;
aes256ctr_ctx state;

PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, 0);

for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_K; j++) {
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_K; j++) {
if (transposed) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (i << 8) + j);
nonce.orig = (j << 8) | i;
} else {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (j << 8) + i);
nonce.orig = (i << 8) | j;
}

PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.x, GEN_MATRIX_MAXNBLOCKS, &state);
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf.x, GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES);
state.n = _mm_loadl_epi64(&nonce.vec);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state);
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr);

while (ctr < KYBER_N) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.x, 1, &state);
ctr += rej_uniform_ref(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.x, XOF_BLOCKBYTES);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr,
XOF_BLOCKBYTES);
}

PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(&a[i].vec[j]);
@@ -164,47 +199,53 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
}

/*************************************************
* Name: indcpa_keypair
* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_keypair
*
* Description: Generates public and private key for the CPA-secure
* public-key encryption scheme underlying Kyber
*
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
* Arguments: - uint8_t *pk: pointer to output public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
polyvec a[KYBER_K], skpv, e, pkpv;
uint8_t buf[2 * KYBER_SYMBYTES];
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
uint8_t nonce = 0;
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
unsigned int i = 0;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
const uint8_t *publicseed = buf.arr;
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES;
polyvec a[KYBER_K], e, pkpv, skpv;

randombytes(buf, KYBER_SYMBYTES);
hash_g(buf, buf, KYBER_SYMBYTES);
randombytes(buf.arr, KYBER_SYMBYTES);
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES);

gen_a(a, publicseed);

ALIGN16_TYPE(uint64_t) nonce = {.orig = 0};
aes256ctr_ctx state;
uint8_t coins[128];
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, 0);
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state);
PQCLEAN_KYBER102490S_AVX2_cbd(skpv.vec + i, coins);
ALIGN32_ARRAY(uint8_t, 128) coins;
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&skpv.vec[i], coins.arr);
}
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state);
PQCLEAN_KYBER102490S_AVX2_cbd(e.vec + i, coins);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&e.vec[i], coins.arr);
}

PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&skpv);
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&e);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv);
PQCLEAN_KYBER102490S_AVX2_poly_frommont(pkpv.vec + i);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_AVX2_poly_tomont(&pkpv.vec[i]);
}

PQCLEAN_KYBER102490S_AVX2_polyvec_add(&pkpv, &pkpv, &e);
@@ -215,58 +256,67 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
}

/*************************************************
* Name: indcpa_enc
* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_enc
*
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
* to deterministically generate all randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins) {
polyvec at[KYBER_K], pkpv, sp, ep, bp;
poly k, v, epp;
uint8_t seed[KYBER_SYMBYTES];
uint8_t nonce = 0;
unpack_pk(&pkpv, seed, pk);
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[KYBER_SYMBYTES]) {
unsigned int i = 0;
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
poly v, k, epp;
unpack_pk(&pkpv, seed.arr, pk);
PQCLEAN_KYBER102490S_AVX2_poly_frommsg(&k, m);
gen_at(at, seed);
gen_at(at, seed.arr);

ALIGN16_TYPE(uint64_t) nonce = {.orig = 0};
aes256ctr_ctx state;
uint8_t buf[128];
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, 0);
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state);
PQCLEAN_KYBER102490S_AVX2_cbd(sp.vec + i, buf);
ALIGN32_ARRAY(uint8_t, 128) buf;
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce.orig++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&sp.vec[i], buf.arr);
}
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state);
PQCLEAN_KYBER102490S_AVX2_cbd(ep.vec + i, buf);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&ep.vec[i], buf.arr);
}
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state);
PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf.arr);

PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&sp);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
}

PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp);
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(&bp);
PQCLEAN_KYBER102490S_AVX2_poly_invntt(&v);
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&v);

PQCLEAN_KYBER102490S_AVX2_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &epp);
@@ -278,18 +328,21 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t *c,
}

/*************************************************
* Name: indcpa_dec
* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_dec
*
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t *m,
const uint8_t *c,
const uint8_t *sk) {
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
poly v, mp;

@@ -297,8 +350,8 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t *m,
unpack_sk(&skpv, sk);

PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&bp);
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp);
PQCLEAN_KYBER102490S_AVX2_poly_invntt(&mp);
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&mp);

PQCLEAN_KYBER102490S_AVX2_poly_sub(&mp, &v, &mp);
PQCLEAN_KYBER102490S_AVX2_poly_reduce(&mp);


+ 9
- 14
crypto_kem/kyber1024-90s/avx2/indcpa.h View File

@@ -1,21 +1,16 @@
#ifndef INDCPA_H
#define INDCPA_H
#ifndef PQCLEAN_KYBER102490S_AVX2_INDCPA_H
#define PQCLEAN_KYBER102490S_AVX2_INDCPA_H

#include "params.h"
#include "polyvec.h"
#include <stdint.h>

void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(
uint8_t *pk,
uint8_t *sk);
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);

void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(
uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins);
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);

void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(
uint8_t *m,
const uint8_t *c,
const uint8_t *sk);
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]);

void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);

#endif

crypto_kem/kyber768/avx2/invntt.s → crypto_kem/kyber1024-90s/avx2/invntt.S View File

@@ -1,7 +1,8 @@
#include "cdecl.inc"
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2
#update & mul
vpsubw %ymm\rh0,%ymm\rl0,%ymm12
vpsubw %ymm\rh1,%ymm\rl1,%ymm13
@@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm

.global PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx
.p2align 5
PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx:
#consts
vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0

.text
invntt_levels0t5_avx:
level0:
#zetas
vmovdqu (%rsi),%ymm15
@@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly 4,5,8,9,6,7,10,11 15,3,1,2
butterfly 4,5,8,9,6,7,10,11,15,3,1,2

level1:
#zetas
vmovdqu 128(%rsi),%ymm3
vmovdqu 160(%rsi),%ymm2

butterfly 4,5,6,7,8,9,10,11 3,3,2,2
butterfly 4,5,6,7,8,9,10,11,3,3,2,2

shuffle1 4,5,3,5
shuffle1 6,7,4,7
@@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10
vmovdqu 224(%rsi),%ymm2

#consts
vmovdqa PQCLEAN_KYBER768_AVX2_16xv(%rip),%ymm1
vmovdqa _16XV*2(%rdx),%ymm1

butterfly 3,4,6,8,5,7,9,11 10,10,2,2
butterfly 3,4,6,8,5,7,9,11,10,10,2,2

red16 3

@@ -95,7 +92,7 @@ level3:
vmovdqu 256(%rsi),%ymm9
vmovdqu 288(%rsi),%ymm2

butterfly 10,3,6,5,4,8,7,11 9,9,2,2
butterfly 10,3,6,5,4,8,7,11,9,9,2,2

red16 10

@@ -109,7 +106,7 @@ level4:
vmovdqu 320(%rsi),%ymm7
vmovdqu 352(%rsi),%ymm2

butterfly 9,10,6,4,3,5,8,11 7,7,2,2
butterfly 9,10,6,4,3,5,8,11,7,7,2,2

red16 9

@@ -123,7 +120,7 @@ level5:
vpbroadcastd 384(%rsi),%ymm8
vpbroadcastd 388(%rsi),%ymm2

butterfly 7,9,6,3,10,4,5,11 8,8,2,2
butterfly 7,9,6,3,10,4,5,11,8,8,2,2

red16 7

@@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi)

ret

.global PQCLEAN_KYBER768_AVX2_invntt_level6_avx
PQCLEAN_KYBER768_AVX2_invntt_level6_avx:
#consts
vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0

invntt_level6_avx:
#zetas
vpbroadcastd (%rsi),%ymm1
vpbroadcastd 4(%rsi),%ymm2
@@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11
butterfly 4,5,6,7,8,9,10,11

#consts
vmovdqa PQCLEAN_KYBER768_AVX2_16xflo(%rip),%ymm12
vmovdqa PQCLEAN_KYBER768_AVX2_16xfhi(%rip),%ymm13
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,256(%rdi)
@@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)

fqmulprecomp 12,13,4 8
fqmulprecomp 12,13,5 9
fqmulprecomp 12,13,6 10
fqmulprecomp 12,13,7 11
fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,(%rdi)
@@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11
butterfly 4,5,6,7,8,9,10,11

#consts
vmovdqa PQCLEAN_KYBER768_AVX2_16xflo(%rip),%ymm12
vmovdqa PQCLEAN_KYBER768_AVX2_16xfhi(%rip),%ymm13
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,384(%rdi)
@@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi)
vmovdqa %ymm10,448(%rdi)
vmovdqa %ymm11,480(%rdi)

fqmulprecomp 12,13,4 8
fqmulprecomp 12,13,5 9
fqmulprecomp 12,13,6 10
fqmulprecomp 12,13,7 11
fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,128(%rdi)
@@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi)
vmovdqa %ymm7,224(%rdi)

ret

# PQCLEAN_KYBER102490S_AVX2_invntt_avx: exported entry point that drives the
# inverse NTT over one 512-byte buffer, in place.
#   %rdi: 32-byte aligned coefficient buffer (256 16-bit words)
#   %rsi: base of the 16-bit constant table; _16XQ and _ZETAS_INV_EXP are
#         word indices into it, hence the multiplication by 2 to get bytes
# The table base is kept in %rdx because the level routines read further
# constants (_16XV, _16XFLO, _16XFHI) relative to %rdx, while %rsi is turned
# into a cursor over the zeta section of the table.
# Each invntt_levels0t5_avx call reads 392 bytes of zetas (its last load is
# a 4-byte broadcast at offset 388), so the cursor is advanced by 392
# between calls.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
mov %rsi,%rdx
add $_ZETAS_INV_EXP*2,%rsi
# levels 0-5 on the first 256 bytes
call invntt_levels0t5_avx
# levels 0-5 on the second 256 bytes, with the next block of zetas
add $256,%rdi
add $392,%rsi
call invntt_levels0t5_avx
# level 6 spans both halves, so rewind %rdi to the start of the buffer
sub $256,%rdi
add $392,%rsi
call invntt_level6_avx
ret

+ 0
- 217
crypto_kem/kyber1024-90s/avx2/invntt.s View File

@@ -1,217 +0,0 @@
.include "shuffle.inc"
.include "fq.inc"

# butterfly: four inverse-NTT butterflies on register pairs (rlN, rhN).
# For each pair:
#   rlN <- rlN + rhN
#   rhN <- (rlN - rhN) multiplied by a zeta, reduced using %ymm0
# Arguments are ymm register numbers:
#   rl0-rl3, rh0-rh3: the four low/high data register pairs
#   zl0, zh0: multipliers for the low and high 16-bit products of pairs 0
#             and 1 (defaults 1 and 2)
#   zl1, zh1: the same for pairs 2 and 3 (defaults 1 and 2)
# %ymm0 must hold the constant used in the reduce step.
# Clobbers %ymm12-%ymm15, which hold the four differences.
# NOTE(review): the multiply/reduce sequence looks like a Montgomery
# multiplication with zl = zeta*q^-1 and zh = zeta; confirm against the
# table layout in consts.c.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2
#update & mul
# differences go to scratch, sums overwrite the low registers; the low
# products are interleaved with the additions
vpsubw %ymm\rh0,%ymm\rl0,%ymm12
vpsubw %ymm\rh1,%ymm\rl1,%ymm13
vpsubw %ymm\rh2,%ymm\rl2,%ymm14

vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpmullw %ymm\zl0,%ymm12,%ymm\rh0

vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpmullw %ymm\zl0,%ymm13,%ymm\rh1
vpsubw %ymm\rh3,%ymm\rl3,%ymm15

vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm14,%ymm\rh2
vpmullw %ymm\zl1,%ymm15,%ymm\rh3

# high halves of difference * zh replace the differences
vpmulhw %ymm\zh0,%ymm12,%ymm12
vpmulhw %ymm\zh0,%ymm13,%ymm13

vpmulhw %ymm\zh1,%ymm14,%ymm14
vpmulhw %ymm\zh1,%ymm15,%ymm15

#reduce
# rh <- high(difference * zh) - high(low_product * %ymm0)
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm0,%ymm\rh1,%ymm\rh1
vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3
vpsubw %ymm\rh0,%ymm12,%ymm\rh0
vpsubw %ymm\rh1,%ymm13,%ymm\rh1
vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm

# PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx: exported routine that runs
# six butterfly levels (labels level0 to level5) over one 256-byte block of
# 16-bit words, in place.
#   %rdi: 32-byte aligned data block (loaded and stored with vmovdqa)
#   %rsi: zeta table for this block, read with unaligned loads at byte
#         offsets 0 to 391
# The constants 16xq and 16xv are fetched from global symbols with
# %rip-relative addressing.
# Between levels the shuffle1/2/4/8 macros from shuffle.inc rearrange the
# registers, and one register per level is passed through red16.
# NOTE(review): the shuffle macros are defined outside this file; presumably
# they interleave register halves at growing granularity - confirm in
# shuffle.inc.
# Overwrites %ymm0-%ymm15.
.global PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx
.p2align 5
PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx:
#consts
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0

level0:
#zetas
vmovdqu (%rsi),%ymm15
vmovdqu 64(%rsi),%ymm3
vmovdqu 32(%rsi),%ymm1
vmovdqu 96(%rsi),%ymm2

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

# pairs (4,6) (5,7) use zetas %ymm15/%ymm1; pairs (8,10) (9,11) use %ymm3/%ymm2
butterfly 4,5,8,9,6,7,10,11 15,3,1,2

level1:
#zetas
vmovdqu 128(%rsi),%ymm3
vmovdqu 160(%rsi),%ymm2

butterfly 4,5,6,7,8,9,10,11 3,3,2,2

shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11

level2:
#zetas
vmovdqu 192(%rsi),%ymm10
vmovdqu 224(%rsi),%ymm2

#consts
# %ymm1 is free from here on and holds the constant read by red16
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1

butterfly 3,4,6,8,5,7,9,11 10,10,2,2

# only one register is reduced per level; scratch defaults to %ymm12
red16 3

shuffle2 3,4,10,4
shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11

level3:
#zetas
vmovdqu 256(%rsi),%ymm9
vmovdqu 288(%rsi),%ymm2

butterfly 10,3,6,5,4,8,7,11 9,9,2,2

red16 10

shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11

level4:
#zetas
vmovdqu 320(%rsi),%ymm7
vmovdqu 352(%rsi),%ymm2

butterfly 9,10,6,4,3,5,8,11 7,7,2,2

red16 9

shuffle8 9,10,7,10
shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11

level5:
#zetas
# a single 32-bit value (two 16-bit words) is broadcast to every lane
vpbroadcastd 384(%rsi),%ymm8
vpbroadcastd 388(%rsi),%ymm2

butterfly 7,9,6,3,10,4,5,11 8,8,2,2

red16 7

#store
# registers are written back in the order left by the final shuffles
vmovdqa %ymm7,(%rdi)
vmovdqa %ymm9,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm3,96(%rdi)
vmovdqa %ymm10,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm5,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

/*
 * invntt_level6_avx: last inverse-NTT butterfly level over all 512 bytes at
 * rdi, pairing each vector in the first 256 bytes with the vector 256 bytes
 * above it.
 *
 * Declared in ntt.h as (int16_t *r, const uint16_t *zetas); the code reads the
 * coefficient pointer from rdi (32-byte aligned, vmovdqa) and eight bytes of
 * zeta data from rsi.
 *
 * Work is done in two passes of four vector pairs: byte offsets 0..127 with
 * 256..383, then 128..255 with 384..511. In each pass the upper results are
 * stored straight from the butterfly, while the lower results additionally go
 * through fqmulprecomp with the 16xflo / 16xfhi constants before being stored.
 * NOTE(review): fqmulprecomp is defined outside this view; its fourth argument
 * (ymm8..ymm11, already stored at that point) looks like a scratch register --
 * confirm against the macro definition.
 *
 * Writes ymm0..ymm2 and ymm4..ymm15.
 */
.global PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx
PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx:
#consts
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0

#zetas
/* ymm1 / ymm2 are the default zl / zh registers of the butterfly macro. */
vpbroadcastd (%rsi),%ymm1
vpbroadcastd 4(%rsi),%ymm2

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 256(%rdi),%ymm8
vmovdqa 288(%rdi),%ymm9
vmovdqa 320(%rdi),%ymm10
vmovdqa 352(%rdi),%ymm11

butterfly 4,5,6,7,8,9,10,11

#consts
/* Loaded after the butterfly because the macro uses ymm12..ymm15 as
   temporaries. */
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xflo(%rip),%ymm12
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xfhi(%rip),%ymm13

#store
vmovdqa %ymm8,256(%rdi)
vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)

fqmulprecomp 12,13,4 8
fqmulprecomp 12,13,5 9
fqmulprecomp 12,13,6 10
fqmulprecomp 12,13,7 11

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm7,96(%rdi)

/* Second pass: same sequence on the remaining vectors. */
#load
vmovdqa 128(%rdi),%ymm4
vmovdqa 160(%rdi),%ymm5
vmovdqa 192(%rdi),%ymm6
vmovdqa 224(%rdi),%ymm7
vmovdqa 384(%rdi),%ymm8
vmovdqa 416(%rdi),%ymm9
vmovdqa 448(%rdi),%ymm10
vmovdqa 480(%rdi),%ymm11

butterfly 4,5,6,7,8,9,10,11

#consts
/* Reloaded: the butterfly above overwrote ymm12 and ymm13. */
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xflo(%rip),%ymm12
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xfhi(%rip),%ymm13

#store
vmovdqa %ymm8,384(%rdi)
vmovdqa %ymm9,416(%rdi)
vmovdqa %ymm10,448(%rdi)
vmovdqa %ymm11,480(%rdi)

fqmulprecomp 12,13,4 8
fqmulprecomp 12,13,5 9
fqmulprecomp 12,13,6 10
fqmulprecomp 12,13,7 11

#store
vmovdqa %ymm4,128(%rdi)
vmovdqa %ymm5,160(%rdi)
vmovdqa %ymm6,192(%rdi)
vmovdqa %ymm7,224(%rdi)

ret

+ 68
- 44
crypto_kem/kyber1024-90s/avx2/kem.c View File

@@ -1,103 +1,127 @@
#include "api.h"
#include "align.h"
#include "indcpa.h"
#include "kem.h"
#include "params.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
#include <stddef.h>
#include <stdint.h>


#include <stdlib.h>
/*************************************************
* Name: crypto_kem_keypair
* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
size_t i;
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
size_t i = 0;
PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i];
}
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */
/* Value z for pseudo-random output on reject */
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES);
return 0;
}

/*************************************************
* Name: crypto_kem_enc
* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
uint8_t buf[2 * KYBER_SYMBYTES];
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;

randombytes(buf, KYBER_SYMBYTES);
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */
randombytes(buf.arr, KYBER_SYMBYTES);
/* Don't release system RNG output */
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES);

hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */
hash_g(kr, buf, 2 * KYBER_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);

PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */
/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES);

hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */
/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
return 0;
}

/*************************************************
* Name: crypto_kem_dec
* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
*
* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
size_t i;
uint8_t fail;
union {
uint8_t x[KYBER_CIPHERTEXTBYTES];
__m256i __dummy;
} _cmp;
uint8_t *cmp = _cmp.x;
uint8_t buf[2 * KYBER_SYMBYTES];
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
size_t i = 0;
int fail = 0;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;

PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf, ct, sk);
PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf.arr, ct, sk);

for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */
/* Multitarget countermeasure for coins + contributory KEM */
for (i = 0; i < KYBER_SYMBYTES; i++) {
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
}
hash_g(kr, buf, 2 * KYBER_SYMBYTES);
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);

PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */
/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES);

fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES);

hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */
/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);

PQCLEAN_KYBER102490S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */
/* Overwrite pre-k with z on re-encryption failure */
PQCLEAN_KYBER102490S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail);

kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
return 0;
}

+ 19
- 0
crypto_kem/kyber1024-90s/avx2/kem.h View File

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_KEM_H
#define PQCLEAN_KYBER102490S_AVX2_KEM_H

#include "params.h"

/*
 * Key generation for the CCA-secure KEM.
 *   pk: output public key  (CRYPTO_PUBLICKEYBYTES bytes, caller-allocated)
 *   sk: output private key (CRYPTO_SECRETKEYBYTES bytes, caller-allocated)
 * Returns 0.
 */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(
    unsigned char *pk,
    unsigned char *sk);

/*
 * Encapsulation: produces a ciphertext and the shared secret for a public key.
 *   ct: output ciphertext    (CRYPTO_CIPHERTEXTBYTES bytes, caller-allocated)
 *   ss: output shared secret (CRYPTO_BYTES bytes, caller-allocated)
 *   pk: input public key     (CRYPTO_PUBLICKEYBYTES bytes)
 * Returns 0.
 */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(
    unsigned char *ct,
    unsigned char *ss,
    const unsigned char *pk);

/*
 * Decapsulation: recovers the shared secret from a ciphertext and private key.
 *   ss: output shared secret (CRYPTO_BYTES bytes, caller-allocated)
 *   ct: input ciphertext     (CRYPTO_CIPHERTEXTBYTES bytes)
 *   sk: input private key    (CRYPTO_SECRETKEYBYTES bytes)
 * Returns 0; on failure ss holds a pseudo-random value.
 */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(
    unsigned char *ss,
    const unsigned char *ct,
    const unsigned char *sk);

#endif /* PQCLEAN_KYBER102490S_AVX2_KEM_H */

+ 220
- 0
crypto_kem/kyber1024-90s/avx2/ntt.S View File

@@ -0,0 +1,220 @@
#include "cdecl.inc"
.include "shuffle.inc"
.include "fq.inc"

/*
 * butterfly: one forward-NTT butterfly step on four register pairs.
 *
 * Every argument is a ymm register number. For each pair (rl_i, rh_i):
 *   t    = mulhi(zh, rh_i) - mulhi(ymm0, mullo(zl, rh_i))
 *   rh_i = rl_i - t
 *   rl_i = rl_i + t
 * where mullo / mulhi are the low / high 16 bits of the signed 16x16-bit
 * product. Pairs 0 and 1 use zl0/zh0, pairs 2 and 3 use zl1/zh1.
 *
 * ymm0 is read as the reduction constant (the exported entry point of this
 * file loads it from the _16XQ slot of the constant table).
 * Clobbers ymm12..ymm15. zl1 may be 15 (the default): ymm15 is first written
 * by the last vpmullw, which is also the last read of zl1.
 *
 * No invocation of this macro appears in this file; the routines below use
 * butterfly2 instead.
 */
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1
#mul
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmullw %ymm\zl1,%ymm\rh3,%ymm15
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3

#reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13
vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
/* ymm12..ymm15 now hold t for each pair. */
vpsubw %ymm12,%ymm\rh0,%ymm12
vpsubw %ymm13,%ymm\rh1,%ymm13
vpsubw %ymm14,%ymm\rh2,%ymm14
vpsubw %ymm15,%ymm\rh3,%ymm15

#update
vpsubw %ymm12,%ymm\rl0,%ymm\rh0
vpaddw %ymm12,%ymm\rl0,%ymm\rl0
vpsubw %ymm13,%ymm\rl1,%ymm\rh1
vpaddw %ymm13,%ymm\rl1,%ymm\rl1
vpsubw %ymm14,%ymm\rl2,%ymm\rh2
vpaddw %ymm14,%ymm\rl2,%ymm\rl2
vpsubw %ymm15,%ymm\rl3,%ymm\rh3
vpaddw %ymm15,%ymm\rl3,%ymm\rl3
.endm

/*
 * butterfly2: forward-NTT butterfly step on four register pairs, producing
 * the same rl_i / rh_i values as
 *   t    = mulhi(zh, rh_i) - mulhi(ymm0, mullo(zl, rh_i))
 *   rh_i = rl_i - t
 *   rl_i = rl_i + t
 * but applying the two halves of t separately: first the mulhi(zh, rh_i) term
 * is added/subtracted, then the reduction term is applied with the opposite
 * sign.
 *
 * Every argument is a ymm register number. x and y (defaults 3 and 2) are two
 * extra scratch registers; ymm12..ymm15 are scratch as well. ymm0 is read as
 * the reduction constant. Pairs 0 and 1 use zl0/zh0, pairs 2 and 3 use zl1/zh1.
 *
 * Register reuse that the instruction order relies on:
 *   - rh0 and rh2 are overwritten with the high products of rh1 and rh3 once
 *     their own products have been taken;
 *   - zl0 may be 14 and zl1 may be 15, y may equal zh0 (level 6 does all
 *     three), because each of those registers is overwritten only after its
 *     last read.
 */
# We break the dependency chains with the cost of slightly more additions.
# But they can be run in parallel to the multiplications on execution port 5
# (multiplications only go to ports 0 and 1)
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1
#mul
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0
vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y
vpmullw %ymm\zl1,%ymm\rh3,%ymm15
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2

#reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13
vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15

/* Apply the high-product term. Pair 1 goes before pair 0 (and pair 3 before
   pair 2) because rh0 / rh2 still hold the high products of pairs 1 / 3 and
   are overwritten by the pair 0 / pair 2 subtraction. */
vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1
vpsubw %ymm\x,%ymm\rl0,%ymm\rh0
vpaddw %ymm\x,%ymm\rl0,%ymm\rl0
vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3
vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3
vpsubw %ymm\y,%ymm\rl2,%ymm\rh2
vpaddw %ymm\y,%ymm\rl2,%ymm\rl2

#update
/* Apply the reduction term with the opposite sign. */
vpaddw %ymm12,%ymm\rh0,%ymm\rh0
vpsubw %ymm12,%ymm\rl0,%ymm\rl0
vpaddw %ymm13,%ymm\rh1,%ymm\rh1
vpsubw %ymm13,%ymm\rl1,%ymm\rl1
vpaddw %ymm14,%ymm\rh2,%ymm\rh2
vpsubw %ymm14,%ymm\rl2,%ymm\rl2
vpaddw %ymm15,%ymm\rh3,%ymm\rh3
vpsubw %ymm15,%ymm\rl3,%ymm\rl3
.endm

.text
/*
 * ntt_level0_avx (file-local): first forward-NTT level on four vector pairs,
 * in place. Each vector at byte offsets 0..127 from rdi is paired with the
 * vector 256 bytes above it.
 *
 * In:  rdi = coefficient pointer (32-byte aligned, accessed with vmovdqa)
 *      rsi = pointer to eight bytes of zeta data, broadcast to ymm15 / ymm1
 *      ymm0 = reduction constant, loaded by the exported entry point
 * Writes ymm1..ymm15 (ymm2, ymm3 and ymm12..ymm15 as butterfly2 scratch).
 */
ntt_level0_avx:
level0:
#zetas
/* ymm15 / ymm1 are the default zl / zh registers of butterfly2. */
vpbroadcastd (%rsi),%ymm15
vpbroadcastd 4(%rsi),%ymm1

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 256(%rdi),%ymm8
vmovdqa 288(%rdi),%ymm9
vmovdqa 320(%rdi),%ymm10
vmovdqa 352(%rdi),%ymm11

butterfly2 4,5,6,7,8,9,10,11

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm7,96(%rdi)
vmovdqa %ymm8,256(%rdi)
vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)

ret

/*
 * ntt_levels1t6_avx (file-local): forward-NTT levels 1 to 6 applied in place
 * to the eight 32-byte vectors at rdi .. rdi+255 (128 coefficients of 16 bits).
 *
 * In:  rdi = coefficient pointer (32-byte aligned, accessed with vmovdqa)
 *      rsi = zeta data for this half; bytes 0..391 are read
 *      rdx = base of the constant table (only the _16XV slot is read here)
 *      ymm0 = reduction constant, loaded by the exported entry point
 * Writes ymm1..ymm15.
 *
 * The coefficients stay in registers across all six levels; between levels
 * the shuffle8/4/2/1 macros regroup them. NOTE(review): those macros and
 * red16 are defined outside this view; from the registers consumed afterwards
 * the third and fourth shuffle arguments appear to be the outputs, and the
 * second red16 argument appears to be a scratch register -- confirm against
 * the macro definitions.
 */
ntt_levels1t6_avx:
level1:
#zetas
vpbroadcastd (%rsi),%ymm15
vpbroadcastd 4(%rsi),%ymm1

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

/* The ninth argument names scratch register x; from here on it is always a
   register that holds no live coefficients at that point. */
butterfly2 4,5,6,7,8,9,10,11,3

level2:
#zetas
vmovdqu 8(%rsi),%ymm15
vmovdqu 40(%rsi),%ymm1

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly2 3,8,4,9,5,10,6,11,7

level3:
#zetas
vmovdqu 72(%rsi),%ymm15
vmovdqu 104(%rsi),%ymm1

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly2 7,5,3,10,8,6,4,11,9

level4:
#zetas
vmovdqu 136(%rsi),%ymm15
vmovdqu 168(%rsi),%ymm1

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

butterfly2 9,8,7,6,5,4,3,11,10

level5:
#zetas
vmovdqu 200(%rsi),%ymm15
vmovdqu 232(%rsi),%ymm1

shuffle1 9,5,10,5
shuffle1 8,4,9,4
shuffle1 7,3,8,3
shuffle1 6,11,7,11

butterfly2 10,5,9,4,8,3,7,11,6

level6:
#zetas
/* Two distinct zeta vector pairs: ymm14/ymm1 for butterfly pairs 0-1 and
   ymm15/ymm2 for pairs 2-3. */
vmovdqu 264(%rsi),%ymm14
vmovdqu 328(%rsi),%ymm15
vmovdqu 296(%rsi),%ymm1
vmovdqu 360(%rsi),%ymm2

/* Explicit x=6, y=1, zl0=14, zl1=15, zh0=1, zh1=2. */
butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2

/* ymm1 is free after the butterfly and receives the constant for red16. */
vmovdqa _16XV*2(%rdx),%ymm1
red16 10,12
red16 5,13
red16 9,14
red16 4,15
red16 8,2
red16 3,6
red16 7,12
red16 11,13

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm9,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm3,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

/*
 * ntt_avx: exported forward-NTT entry point, working in place on the 512
 * bytes at rdi.
 *
 * Declared in ntt.h as (int16_t *r, const qdata_t *qdata); the code reads the
 * coefficient pointer from rdi and the constant-table pointer from rsi.
 *
 * Sequence: level 0 on both 128-byte-offset groups with the same zeta data,
 * then levels 1-6 on the lower 256 bytes and on the upper 256 bytes, with rsi
 * advanced by 392 bytes between the two (the amount one levels1t6 call reads).
 * rdi and rsi are left modified on return.
 *
 * NOTE(review): cdecl() comes from cdecl.inc and _16XQ / _ZETAS_EXP are
 * defined outside this view; the *2 factors suggest they are indices into a
 * table of 16-bit entries -- confirm against consts.h.
 */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
/* Keep the table base in rdx: levels1t6 reads the _16XV slot through it. */
mov %rsi,%rdx
add $_ZETAS_EXP*2,%rsi
call ntt_level0_avx
add $128,%rdi
call ntt_level0_avx
sub $128,%rdi
/* Skip the eight bytes of zeta data consumed by level 0. */
add $8,%rsi
call ntt_levels1t6_avx
add $256,%rdi
add $392,%rsi
call ntt_levels1t6_avx
ret

+ 20
- 12
crypto_kem/kyber1024-90s/avx2/ntt.h View File

@@ -2,19 +2,27 @@
#define NTT_H

#include "consts.h"
#include "params.h"
#include <stdint.h>

void PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(int16_t *r);
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r);
void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta);
void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta);

void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a);
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a);

void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);


void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);


void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);


void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

#endif

+ 10
- 10
crypto_kem/kyber1024-90s/avx2/params.h View File

@@ -1,8 +1,5 @@
#ifndef PARAMS_H
#define PARAMS_H


/* Don't change parameters below this line */
#ifndef PQCLEAN_KYBER102490S_AVX2_PARAMS_H
#define PQCLEAN_KYBER102490S_AVX2_PARAMS_H

#define KYBER_N 256
#define KYBER_Q 3329
@@ -12,9 +9,8 @@
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

#define KYBER_POLYBYTES 384
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_POLYBYTES 384
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_POLYCOMPRESSEDBYTES 160
@@ -23,10 +19,14 @@
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \
+ KYBER_POLYCOMPRESSEDBYTES)

#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */
/* 32 bytes of additional space to save H(pk) */
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \
+ KYBER_INDCPA_PUBLICKEYBYTES \
+ 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES

#endif

+ 208
- 219
crypto_kem/kyber1024-90s/avx2/poly.c View File

@@ -1,113 +1,210 @@
#include "align.h"
#include "cbd.h"
#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "symmetric.h"

#include <immintrin.h>
#include <stdint.h>

/*************************************************
* Name: poly_compress
* Name: PQCLEAN_KYBER102490S_AVX2_poly_compress
*
* Description: Compression and subsequent serialization of a polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const poly *a: pointer to input polynomial
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t *r, poly *a) {
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) {
unsigned int i = 0, j = 0;
uint8_t t[8];
size_t i, j, k = 0;

PQCLEAN_KYBER102490S_AVX2_poly_csubq(a);

for (i = 0; i < KYBER_N; i += 8) {
for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = (uint8_t)(((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31);
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}

r[k] = (uint8_t)( t[0] | (t[1] << 5));
r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4));
r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3));
k += 5;
r[0] = (t[0] >> 0) | (t[1] << 5);
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
r[2] = (t[3] >> 1) | (t[4] << 4);
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
r[4] = (t[6] >> 2) | (t[7] << 3);
r += 5;
}
}

/*************************************************
* Name: poly_decompress
* Name: PQCLEAN_KYBER102490S_AVX2_poly_decompress
*
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of poly_compress
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t *a) {
size_t i;
for (i = 0; i < KYBER_N; i += 8) {
r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5);
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r,
        const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) {
    uint8_t fields[8];
    unsigned int group;
    unsigned int k;

    /* Each group of 5 input bytes carries eight 5-bit values, lowest bits first. */
    for (group = 0; group < KYBER_N / 8; group++) {
        const uint8_t *in = a + 5 * group;

        /* Only the low five bits of each entry are meaningful; the surplus
         * upper bits are masked off when the value is used below. */
        fields[0] = in[0];
        fields[1] = (uint8_t)((in[0] >> 5) | (in[1] << 3));
        fields[2] = (uint8_t)(in[1] >> 2);
        fields[3] = (uint8_t)((in[1] >> 7) | (in[2] << 1));
        fields[4] = (uint8_t)((in[2] >> 4) | (in[3] << 4));
        fields[5] = (uint8_t)(in[3] >> 1);
        fields[6] = (uint8_t)((in[3] >> 6) | (in[4] << 2));
        fields[7] = (uint8_t)(in[4] >> 3);

        for (k = 0; k < 8; k++) {
            /* Scale the 5-bit value by KYBER_Q / 32 with rounding (+16 before the shift). */
            const uint32_t scaled = (uint32_t)(fields[k] & 31) * KYBER_Q + 16;

            r->coeffs[8 * group + k] = (int16_t)(scaled >> 5);
        }
    }
}

/*************************************************
* Name: poly_tobytes
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tobytes
*
* Description: Serialization of a polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const poly *a: pointer to input polynomial
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t *r, poly *a) {
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs);
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128);
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
    /* Serialization is delegated to the external routine, which also receives
     * the shared constant table. */
    int16_t *coeffs = a->coeffs;

    PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

/*************************************************
* Name: poly_frombytes
* Name: PQCLEAN_KYBER102490S_AVX2_poly_frombytes
*
* Description: De-serialization of a polynomial;
* inverse of poly_tobytes
* inverse of PQCLEAN_KYBER102490S_AVX2_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) {
    /* Deserialization is delegated to the external routine, which also
     * receives the shared constant table. */
    int16_t *coeffs = r->coeffs;

    PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(coeffs, a, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

/*************************************************
* Name:        PQCLEAN_KYBER102490S_AVX2_poly_frommsg
*
* Description: Convert 32-byte message to polynomial. Every message bit
*              becomes one coefficient that is either 0 or (KYBER_Q+1)/2.
*
* Arguments:   - poly *r:            pointer to output polynomial
*                                    (written with aligned 32-byte stores)
*              - const uint8_t *msg: pointer to input message
*                                    (KYBER_INDCPA_MSGBYTES bytes; no
*                                    alignment requirement)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r,
        const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
    __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
    /* Per-dword left shifts 3,2,1,0 (element 0 first) applied to four copies of one message dword. */
    const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3));
    /* Gathers the low 16-bit word of every dword into the lower 8 bytes of each lane, the high words into the upper 8. */
    const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
    const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2);

    /*
     * FROMMSG64(i) expands message dword i of each 128-bit half of f:
     * bytes 4*i..4*i+3 go to coefficients 32*i..32*i+31 and bytes
     * 16+4*i..16+4*i+3 go to coefficients 128+32*i..128+32*i+31.
     * A left shift followed by an arithmetic right shift by 15 smears one
     * chosen bit across its whole 16-bit lane; the AND with hqs turns that
     * all-ones / all-zeros mask into (KYBER_Q+1)/2 or 0.
     */
#define FROMMSG64(i)                                                    \
    g3 = _mm256_shuffle_epi32(f,0x55*(i));                              \
    g3 = _mm256_sllv_epi32(g3,shift);                                   \
    g3 = _mm256_shuffle_epi8(g3,idx);                                   \
    g0 = _mm256_slli_epi16(g3,12);                                      \
    g1 = _mm256_slli_epi16(g3,8);                                       \
    g2 = _mm256_slli_epi16(g3,4);                                       \
    g0 = _mm256_srai_epi16(g0,15);                                      \
    g1 = _mm256_srai_epi16(g1,15);                                      \
    g2 = _mm256_srai_epi16(g2,15);                                      \
    g3 = _mm256_srai_epi16(g3,15);                                      \
    g0 = _mm256_and_si256(g0,hqs);  /* 19 18 17 16  3  2  1  0 */       \
    g1 = _mm256_and_si256(g1,hqs);  /* 23 22 21 20  7  6  5  4 */       \
    g2 = _mm256_and_si256(g2,hqs);  /* 27 26 25 24 11 10  9  8 */       \
    g3 = _mm256_and_si256(g3,hqs);  /* 31 30 29 28 15 14 13 12 */       \
    h0 = _mm256_unpacklo_epi64(g0,g1);                                  \
    h2 = _mm256_unpackhi_epi64(g0,g1);                                  \
    h1 = _mm256_unpacklo_epi64(g2,g3);                                  \
    h3 = _mm256_unpackhi_epi64(g2,g3);                                  \
    g0 = _mm256_permute2x128_si256(h0,h1,0x20);                         \
    g2 = _mm256_permute2x128_si256(h0,h1,0x31);                         \
    g1 = _mm256_permute2x128_si256(h2,h3,0x20);                         \
    g3 = _mm256_permute2x128_si256(h2,h3,0x31);                         \
    _mm256_store_si256((__m256i *)&r->coeffs[  0+32*(i)+ 0],g0);        \
    _mm256_store_si256((__m256i *)&r->coeffs[  0+32*(i)+16],g1);        \
    _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2);        \
    _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3)

    /* msg is a plain byte pointer with no alignment guarantee in its type,
     * so it is read with an unaligned load; an aligned load would fault on
     * a message buffer that is not 32-byte aligned. */
    f = _mm256_loadu_si256((const __m256i *)msg);
    FROMMSG64(0);
    FROMMSG64(1);
    FROMMSG64(2);
    FROMMSG64(3);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t *a) {
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a);
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192);
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) {
    const __m256i half_q = _mm256_set1_epi16((KYBER_Q - 1) / 2);
    const __m256i quarter_q = _mm256_set1_epi16((KYBER_Q - 5) / 4);
    unsigned int blk;

    /* 32 coefficients (two vectors) yield 32 message bits per iteration. */
    for (blk = 0; blk < KYBER_N / 32; blk++) {
        __m256i lo = _mm256_load_si256((__m256i *)&a->coeffs[32 * blk]);
        __m256i hi = _mm256_load_si256((__m256i *)&a->coeffs[32 * blk + 16]);
        __m256i packed;
        uint32_t bits;

        /* Signed distance from (KYBER_Q-1)/2 ... */
        lo = _mm256_sub_epi16(half_q, lo);
        hi = _mm256_sub_epi16(half_q, hi);

        /* ... folded to a non-negative value by XOR with its own sign mask ... */
        lo = _mm256_xor_si256(lo, _mm256_srai_epi16(lo, 15));
        hi = _mm256_xor_si256(hi, _mm256_srai_epi16(hi, 15));

        /* ... and compared against (KYBER_Q-5)/4: the sign bit ends up clear
         * when the coefficient is close to KYBER_Q/2. */
        lo = _mm256_sub_epi16(quarter_q, lo);
        hi = _mm256_sub_epi16(quarter_q, hi);

        /* One sign bit per coefficient, inverted so that "close" gives 1. */
        packed = _mm256_packs_epi16(lo, hi);
        bits = ~(uint32_t)_mm256_movemask_epi8(packed);

        /* The pack instruction works per 128-bit lane, which leaves the mask
         * bytes in the order 0, 2, 1, 3; the middle two are swapped back here. */
        msg[4 * blk + 0] = (uint8_t)bits;
        msg[4 * blk + 1] = (uint8_t)(bits >> 16);
        msg[4 * blk + 2] = (uint8_t)(bits >> 8);
        msg[4 * blk + 3] = (uint8_t)(bits >> 24);
    }
}

/*************************************************
* Name: poly_getnoise
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) {
uint8_t buf[KYBER_ETA * KYBER_N / 4];

prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce);
PQCLEAN_KYBER102490S_AVX2_cbd(r, buf);
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
    /* Scratch buffer for the PRF output (KYBER_ETA * KYBER_N / 4 bytes). */
    ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) prf_out;

    /* Expand (seed, nonce) into pseudo-random bytes, then sample the polynomial from them. */
    prf(prf_out.arr, sizeof(prf_out.arr), seed, nonce);
    PQCLEAN_KYBER102490S_AVX2_cbd(r, prf_out.arr);
}


/*************************************************
* Name: poly_ntt
* Name: PQCLEAN_KYBER102490S_AVX2_poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
@@ -116,73 +213,78 @@ void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) {
PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_exp);
PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER102490S_AVX2_zetas_exp);
PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_exp + 4);
PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER102490S_AVX2_zetas_exp + 200);
PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

/*************************************************
* Name: poly_invntt
* Name: PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place;
* inputs assumed to be in bitreversed order, output in normal order
*
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_invntt(poly *r) {
PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp);
PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp + 196);
PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp + 392);
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r) {
    /* In-place transform; the external routine receives the coefficient
     * array together with the shared constant table. */
    int16_t *coeffs = r->coeffs;

    PQCLEAN_KYBER102490S_AVX2_invntt_avx(coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

// FIXME
void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r) {
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs);
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs + 128);
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

//XXX Add comment
void PQCLEAN_KYBER102490S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) {
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs,
a->coeffs,
b->coeffs,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 152);
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 64,
a->coeffs + 64,
b->coeffs + 64,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 184);
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 128,
a->coeffs + 128,
b->coeffs + 128,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 348);
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 192,
a->coeffs + 192,
b->coeffs + 192,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 380);
/*************************************************
* Name:        PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery
*
* Description: Product of two polynomials that are both in the NTT domain.
*              The work is done by PQCLEAN_KYBER102490S_AVX2_basemul_avx.
*
* Arguments:   - poly *r:       pointer to the output polynomial
*              - const poly *a: pointer to the first factor
*              - const poly *b: pointer to the second factor
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) {
    int16_t *out = r->coeffs;
    const int16_t *lhs = a->coeffs;
    const int16_t *rhs = b->coeffs;

    PQCLEAN_KYBER102490S_AVX2_basemul_avx(out, lhs, rhs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

// FIXME
void PQCLEAN_KYBER102490S_AVX2_poly_frommont(poly *r) {
PQCLEAN_KYBER102490S_AVX2_frommont_avx(r->coeffs);
PQCLEAN_KYBER102490S_AVX2_frommont_avx(r->coeffs + 128);
/*************************************************
* Name:        PQCLEAN_KYBER102490S_AVX2_poly_tomont
*
* Description: Converts every coefficient of a polynomial, in place, from
*              the normal domain to the Montgomery domain. The work is done
*              by PQCLEAN_KYBER102490S_AVX2_tomont_avx.
*
* Arguments:   - poly *r: pointer to the polynomial to convert (in/out)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) {
    int16_t *coeffs = r->coeffs;

    PQCLEAN_KYBER102490S_AVX2_tomont_avx(coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_reduce
*
* Description: Applies Barrett reduction to all coefficients of a polynomial
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r) {
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs);
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs + 128);
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of a polynomial. For details of conditional subtraction
* of q see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) {
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs);
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs + 128);
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

/*************************************************
* Name: poly_add
* Name: PQCLEAN_KYBER102490S_AVX2_poly_add
*
* Description: Add two polynomials
*
@@ -191,18 +293,19 @@ void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
__m256i vec0, vec1;

for (size_t i = 0; i < KYBER_N; i += 16) {
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
vec0 = _mm256_add_epi16(vec0, vec1);
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0);
unsigned int i = 0;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
f0 = _mm256_add_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
}
}

/*************************************************
* Name: poly_sub
* Name: PQCLEAN_KYBER102490S_AVX2_poly_sub
*
* Description: Subtract two polynomials
*
@@ -211,127 +314,13 @@ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) {
__m256i vec0, vec1;

for (size_t i = 0; i < KYBER_N; i += 16) {
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
vec0 = _mm256_sub_epi16(vec0, vec1);
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0);
}
}

/*************************************************
* Name: poly_frommsg
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) {
__m128i tmp;
__m256i a[4], d0, d1, d2, d3;
const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
const __m256i zeros = _mm256_setzero_si256();
const __m256i ones = _mm256_set1_epi32(1);
const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2);

tmp = _mm_loadu_si128((__m128i *)msg);
for (size_t i = 0; i < 4; i++) {
a[i] = _mm256_broadcastd_epi32(tmp);
tmp = _mm_srli_si128(tmp, 4);
}

for (size_t i = 0; i < 4; i++) {
d0 = _mm256_srlv_epi32(a[i], shift);
d1 = _mm256_srli_epi32(d0, 8);
d2 = _mm256_srli_epi32(d0, 16);
d3 = _mm256_srli_epi32(d0, 24);

d0 = _mm256_and_si256(d0, ones);
d1 = _mm256_and_si256(d1, ones);
d2 = _mm256_and_si256(d2, ones);
d3 = _mm256_and_si256(d3, ones);

d0 = _mm256_sub_epi32(zeros, d0);
d1 = _mm256_sub_epi32(zeros, d1);
d2 = _mm256_sub_epi32(zeros, d2);
d3 = _mm256_sub_epi32(zeros, d3);

d0 = _mm256_and_si256(hqs, d0);
d1 = _mm256_and_si256(hqs, d1);
d2 = _mm256_and_si256(hqs, d2);
d3 = _mm256_and_si256(hqs, d3);

d0 = _mm256_packus_epi32(d0, d1);
d2 = _mm256_packus_epi32(d2, d3);
d0 = _mm256_permute4x64_epi64(d0, 0xD8);
d2 = _mm256_permute4x64_epi64(d2, 0xD8);
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0);
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2);
}

tmp = _mm_loadu_si128((__m128i *)&msg[16]);
for (size_t i = 0; i < 4; i++) {
a[i] = _mm256_broadcastd_epi32(tmp);
tmp = _mm_srli_si128(tmp, 4);
}

for (size_t i = 0; i < 4; i++) {
d0 = _mm256_srlv_epi32(a[i], shift);
d1 = _mm256_srli_epi32(d0, 8);
d2 = _mm256_srli_epi32(d0, 16);
d3 = _mm256_srli_epi32(d0, 24);

d0 = _mm256_and_si256(d0, ones);
d1 = _mm256_and_si256(d1, ones);
d2 = _mm256_and_si256(d2, ones);
d3 = _mm256_and_si256(d3, ones);

d0 = _mm256_sub_epi32(zeros, d0);
d1 = _mm256_sub_epi32(zeros, d1);
d2 = _mm256_sub_epi32(zeros, d2);
d3 = _mm256_sub_epi32(zeros, d3);

d0 = _mm256_and_si256(hqs, d0);
d1 = _mm256_and_si256(hqs, d1);
d2 = _mm256_and_si256(hqs, d2);
d3 = _mm256_and_si256(hqs, d3);

d0 = _mm256_packus_epi32(d0, d1);
d2 = _mm256_packus_epi32(d2, d3);
d0 = _mm256_permute4x64_epi64(d0, 0xD8);
d2 = _mm256_permute4x64_epi64(d2, 0xD8);
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0);
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2);
}
}

/*************************************************
* Name: poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) {
uint32_t small;
__m256i vec, tmp;
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);

for (size_t i = 0; i < KYBER_N / 16; i++) {
vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]);
vec = _mm256_sub_epi16(hqs, vec);
tmp = _mm256_srai_epi16(vec, 15);
vec = _mm256_xor_si256(vec, tmp);
vec = _mm256_sub_epi16(hhqs, vec);
small = (uint32_t)_mm256_movemask_epi8(vec);
small = _pext_u32(small, 0xAAAAAAAA);
small = ~small;
msg[2 * i + 0] = (uint8_t)small;
msg[2 * i + 1] = (uint8_t)(small >> 8);
unsigned int i = 0;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
f0 = _mm256_sub_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
}
}

+ 28
- 14
crypto_kem/kyber1024-90s/avx2/poly.h View File

@@ -1,8 +1,7 @@
#ifndef POLY_H
#define POLY_H
#ifndef PQCLEAN_KYBER102490S_AVX2_POLY_H
#define PQCLEAN_KYBER102490S_AVX2_POLY_H

#include "params.h"

#include <immintrin.h>
#include <stdint.h>

@@ -11,32 +10,47 @@
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
*/
typedef union {
__m256i dummy;
int16_t coeffs[KYBER_N];
__m256i _dummy;
} poly;

void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t *r, poly *a);
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t *a);

void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t *r, poly *a);
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t *a);
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);

void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);


void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);

void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);


void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);

void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);

void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]);
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a);

void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce);
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);


void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r);
void PQCLEAN_KYBER102490S_AVX2_poly_invntt(poly *r);

void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r);

void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r);
void PQCLEAN_KYBER102490S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER102490S_AVX2_poly_frommont(poly *r);

void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b);

void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r);


void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r);

void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r);


void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b);

void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b);

#endif

+ 107
- 75
crypto_kem/kyber1024-90s/avx2/polyvec.c View File

@@ -1,167 +1,198 @@
#include "params.h"
#include "consts.h"
#include "ntt.h"
#include "poly.h"
#include "polyvec.h"

#include <stdint.h>

/*************************************************
* Name: polyvec_compress
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_compress
*
* Description: Compress and serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const polyvec *a: pointer to input vector of polynomials
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t *r, polyvec *a) {
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES],
polyvec *restrict a) {
unsigned int i = 0, j = 0, k = 0;

PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(a);

uint16_t t[8];
for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_N / 8; j++) {
for (size_t k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff;
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
{
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
}
}

r[11 * j + 0] = (uint8_t)t[0];
r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3));
r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6));
r[11 * j + 3] = (uint8_t)((t[2] >> 2));
r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1));
r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4));
r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7));
r[11 * j + 7] = (uint8_t)((t[5] >> 1));
r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2));
r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5));
r[11 * j + 10] = (uint8_t)((t[7] >> 3));
r[ 0] = (t[0] >> 0);
r[ 1] = (t[0] >> 8) | (t[1] << 3);
r[ 2] = (t[1] >> 5) | (t[2] << 6);
r[ 3] = (t[2] >> 2);
r[ 4] = (t[2] >> 10) | (t[3] << 1);
r[ 5] = (t[3] >> 7) | (t[4] << 4);
r[ 6] = (t[4] >> 4) | (t[5] << 7);
r[ 7] = (t[5] >> 1);
r[ 8] = (t[5] >> 9) | (t[6] << 2);
r[ 9] = (t[6] >> 6) | (t[7] << 5);
r[10] = (t[7] >> 3);
r += 11;
}
r += 352;
}
}

/*************************************************
* Name: polyvec_decompress
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_decompress
*
* Description: De-serialize and decompress vector of polynomials;
* approximate inverse of polyvec_compress
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_compress
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - uint8_t *a: pointer to input byte array
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_N / 8; j++) {
r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11);
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *restrict r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
unsigned int i = 0, j = 0, k = 0;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
a += 11;

for (k = 0; k < 8; k++) {
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
}
}
a += 352;
}
}

/*************************************************
* Name: polyvec_tobytes
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes
*
* Description: Serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const polyvec *a: pointer to input vector of polynomials
* (needs space for KYBER_POLYVECBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) {
for (size_t i = 0; i < KYBER_K; i++) {
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
}
}

/*************************************************
* Name: polyvec_frombytes
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes
*
* Description: De-serialize vector of polynomials;
* inverse of polyvec_tobytes
* inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes
*
* Arguments: - uint8_t *r: pointer to output byte array
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_K; i++) {
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES);
}
}

/*************************************************
* Name: polyvec_ntt
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_ntt
*
* Description: Apply forward NTT to all elements of a vector of polynomials
*
* Arguments: - polyvec *r: pointer to in/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_ntt(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_invntt
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont
*
* Description: Apply inverse NTT to all elements of a vector of polynomials
* and multiply by Montgomery factor 2^16
*
* Arguments: - polyvec *r: pointer to in/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_invntt(&r->vec[i]);
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_pointwise_acc
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery
*
* Description: Pointwise multiply elements of a and b and accumulate into r
* Description: Pointwise multiply elements of a and b, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) {
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs,
a->vec->coeffs,
b->vec->coeffs,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 152);
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 64,
a->vec->coeffs + 64,
b->vec->coeffs + 64,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 184);
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 128,
a->vec->coeffs + 128,
b->vec->coeffs + 128,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 348);
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 192,
a->vec->coeffs + 192,
b->vec->coeffs + 192,
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 380);
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
/* a->vec->coeffs and b->vec->coeffs point at the coefficients of the first
 * polynomial of each vector only; no loop over KYBER_K happens here.
 * NOTE(review): presumably basemul_acc_avx walks the remaining polynomials
 * of the vectors itself - confirm against the callee's implementation. */
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_reduce(&r->vec[i]);
}
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of each element of a vector of polynomials
* for details of conditional subtraction of q see comments in
* reduce.c
*
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_csubq(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_add
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_add
*
* Description: Add vectors of polynomials
*
@@ -170,7 +201,8 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) {
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
}
}

+ 21
- 9
crypto_kem/kyber1024-90s/avx2/polyvec.h View File

@@ -1,29 +1,41 @@
#ifndef POLYVEC_H
#define POLYVEC_H
#ifndef PQCLEAN_KYBER102490S_AVX2_POLYVEC_H
#define PQCLEAN_KYBER102490S_AVX2_POLYVEC_H

#include "params.h"
#include "poly.h"

#include <stdint.h>

typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t *r, polyvec *a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a);

void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);

void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);


void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);

void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);


void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(polyvec *r);

void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b);
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r);


void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);


void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r);

void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r);


void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);

#endif

+ 9
- 3
crypto_kem/kyber1024-90s/avx2/reduce.h View File

@@ -3,8 +3,14 @@

#include <stdint.h>

int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r);
int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r);
int16_t PQCLEAN_KYBER102490S_AVX2_frommont_avx(int16_t *r);
#include "consts.h"
#include "params.h"


int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

int16_t PQCLEAN_KYBER102490S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);

#endif

+ 325
- 351
crypto_kem/kyber1024-90s/avx2/rejsample.c View File

@@ -1,386 +1,360 @@
#include "align.h"
#include "consts.h"
#include "params.h"
#include "rejsample.h"

#include <immintrin.h>
#include <stdint.h>

static const uint8_t idx[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 8, 0, 0, 0, 0, 0, 0, 0},
{ 0, 8, 0, 0, 0, 0, 0, 0},
{ 2, 8, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 0, 0, 0, 0, 0},
{ 4, 8, 0, 0, 0, 0, 0, 0},
{ 0, 4, 8, 0, 0, 0, 0, 0},
{ 2, 4, 8, 0, 0, 0, 0, 0},
{ 0, 2, 4, 8, 0, 0, 0, 0},
{ 6, 8, 0, 0, 0, 0, 0, 0},
{ 0, 6, 8, 0, 0, 0, 0, 0},
{ 2, 6, 8, 0, 0, 0, 0, 0},
{ 0, 2, 6, 8, 0, 0, 0, 0},
{ 4, 6, 8, 0, 0, 0, 0, 0},
{ 0, 4, 6, 8, 0, 0, 0, 0},
{ 2, 4, 6, 8, 0, 0, 0, 0},
{ 0, 2, 4, 6, 8, 0, 0, 0},
{10, 0, 0, 0, 0, 0, 0, 0},
{ 0, 10, 0, 0, 0, 0, 0, 0},
{ 2, 10, 0, 0, 0, 0, 0, 0},
{ 0, 2, 10, 0, 0, 0, 0, 0},
{ 4, 10, 0, 0, 0, 0, 0, 0},
{ 0, 4, 10, 0, 0, 0, 0, 0},
{ 2, 4, 10, 0, 0, 0, 0, 0},
{ 0, 2, 4, 10, 0, 0, 0, 0},
{ 6, 10, 0, 0, 0, 0, 0, 0},
{ 0, 6, 10, 0, 0, 0, 0, 0},
{ 2, 6, 10, 0, 0, 0, 0, 0},
{ 0, 2, 6, 10, 0, 0, 0, 0},
{ 4, 6, 10, 0, 0, 0, 0, 0},
{ 0, 4, 6, 10, 0, 0, 0, 0},
{ 2, 4, 6, 10, 0, 0, 0, 0},
{ 0, 2, 4, 6, 10, 0, 0, 0},
{ 8, 10, 0, 0, 0, 0, 0, 0},
{ 0, 8, 10, 0, 0, 0, 0, 0},
{ 2, 8, 10, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0},
{ 4, 8, 10, 0, 0, 0, 0, 0},
{ 0, 4, 8, 10, 0, 0, 0, 0},
{ 2, 4, 8, 10, 0, 0, 0, 0},
{ 0, 2, 4, 8, 10, 0, 0, 0},
{ 6, 8, 10, 0, 0, 0, 0, 0},
{ 0, 6, 8, 10, 0, 0, 0, 0},
{ 2, 6, 8, 10, 0, 0, 0, 0},
{ 0, 2, 6, 8, 10, 0, 0, 0},
{ 4, 6, 8, 10, 0, 0, 0, 0},
{ 0, 4, 6, 8, 10, 0, 0, 0},
{ 2, 4, 6, 8, 10, 0, 0, 0},
{ 0, 2, 4, 6, 8, 10, 0, 0},
{12, 0, 0, 0, 0, 0, 0, 0},
{ 0, 12, 0, 0, 0, 0, 0, 0},
{ 2, 12, 0, 0, 0, 0, 0, 0},
{ 0, 2, 12, 0, 0, 0, 0, 0},
{ 4, 12, 0, 0, 0, 0, 0, 0},
{ 0, 4, 12, 0, 0, 0, 0, 0},
{ 2, 4, 12, 0, 0, 0, 0, 0},
{ 0, 2, 4, 12, 0, 0, 0, 0},
{ 6, 12, 0, 0, 0, 0, 0, 0},
{ 0, 6, 12, 0, 0, 0, 0, 0},
{ 2, 6, 12, 0, 0, 0, 0, 0},
{ 0, 2, 6, 12, 0, 0, 0, 0},
{ 4, 6, 12, 0, 0, 0, 0, 0},
{ 0, 4, 6, 12, 0, 0, 0, 0},
{ 2, 4, 6, 12, 0, 0, 0, 0},
{ 0, 2, 4, 6, 12, 0, 0, 0},
{ 8, 12, 0, 0, 0, 0, 0, 0},
{ 0, 8, 12, 0, 0, 0, 0, 0},
{ 2, 8, 12, 0, 0, 0, 0, 0},
{ 0, 2, 8, 12, 0, 0, 0, 0},
{ 4, 8, 12, 0, 0, 0, 0, 0},
{ 0, 4, 8, 12, 0, 0, 0, 0},
{ 2, 4, 8, 12, 0, 0, 0, 0},
{ 0, 2, 4, 8, 12, 0, 0, 0},
{ 6, 8, 12, 0, 0, 0, 0, 0},
{ 0, 6, 8, 12, 0, 0, 0, 0},
{ 2, 6, 8, 12, 0, 0, 0, 0},
{ 0, 2, 6, 8, 12, 0, 0, 0},
{ 4, 6, 8, 12, 0, 0, 0, 0},
{ 0, 4, 6, 8, 12, 0, 0, 0},
{ 2, 4, 6, 8, 12, 0, 0, 0},
{ 0, 2, 4, 6, 8, 12, 0, 0},
{10, 12, 0, 0, 0, 0, 0, 0},
{ 0, 10, 12, 0, 0, 0, 0, 0},
{ 2, 10, 12, 0, 0, 0, 0, 0},
{ 0, 2, 10, 12, 0, 0, 0, 0},
{ 4, 10, 12, 0, 0, 0, 0, 0},
{ 0, 4, 10, 12, 0, 0, 0, 0},
{ 2, 4, 10, 12, 0, 0, 0, 0},
{ 0, 2, 4, 10, 12, 0, 0, 0},
{ 6, 10, 12, 0, 0, 0, 0, 0},
{ 0, 6, 10, 12, 0, 0, 0, 0},
{ 2, 6, 10, 12, 0, 0, 0, 0},
{ 0, 2, 6, 10, 12, 0, 0, 0},
{ 4, 6, 10, 12, 0, 0, 0, 0},
{ 0, 4, 6, 10, 12, 0, 0, 0},
{ 2, 4, 6, 10, 12, 0, 0, 0},
{ 0, 2, 4, 6, 10, 12, 0, 0},
{ 8, 10, 12, 0, 0, 0, 0, 0},
{ 0, 8, 10, 12, 0, 0, 0, 0},
{ 2, 8, 10, 12, 0, 0, 0, 0},
{ 0, 2, 8, 10, 12, 0, 0, 0},
{ 4, 8, 10, 12, 0, 0, 0, 0},
{ 0, 4, 8, 10, 12, 0, 0, 0},
{ 2, 4, 8, 10, 12, 0, 0, 0},
{ 0, 2, 4, 8, 10, 12, 0, 0},
{ 6, 8, 10, 12, 0, 0, 0, 0},
{ 0, 6, 8, 10, 12, 0, 0, 0},
{ 2, 6, 8, 10, 12, 0, 0, 0},
{ 0, 2, 6, 8, 10, 12, 0, 0},
{ 4, 6, 8, 10, 12, 0, 0, 0},
{ 0, 4, 6, 8, 10, 12, 0, 0},
{ 2, 4, 6, 8, 10, 12, 0, 0},
{ 0, 2, 4, 6, 8, 10, 12, 0},
{14, 0, 0, 0, 0, 0, 0, 0},
{ 0, 14, 0, 0, 0, 0, 0, 0},
{ 2, 14, 0, 0, 0, 0, 0, 0},
{ 0, 2, 14, 0, 0, 0, 0, 0},
{ 4, 14, 0, 0, 0, 0, 0, 0},
{ 0, 4, 14, 0, 0, 0, 0, 0},
{ 2, 4, 14, 0, 0, 0, 0, 0},
{ 0, 2, 4, 14, 0, 0, 0, 0},
{ 6, 14, 0, 0, 0, 0, 0, 0},
{ 0, 6, 14, 0, 0, 0, 0, 0},
{ 2, 6, 14, 0, 0, 0, 0, 0},
{ 0, 2, 6, 14, 0, 0, 0, 0},
{ 4, 6, 14, 0, 0, 0, 0, 0},
{ 0, 4, 6, 14, 0, 0, 0, 0},
{ 2, 4, 6, 14, 0, 0, 0, 0},
{ 0, 2, 4, 6, 14, 0, 0, 0},
{ 8, 14, 0, 0, 0, 0, 0, 0},
{ 0, 8, 14, 0, 0, 0, 0, 0},
{ 2, 8, 14, 0, 0, 0, 0, 0},
{ 0, 2, 8, 14, 0, 0, 0, 0},
{ 4, 8, 14, 0, 0, 0, 0, 0},
{ 0, 4, 8, 14, 0, 0, 0, 0},
{ 2, 4, 8, 14, 0, 0, 0, 0},
{ 0, 2, 4, 8, 14, 0, 0, 0},
{ 6, 8, 14, 0, 0, 0, 0, 0},
{ 0, 6, 8, 14, 0, 0, 0, 0},
{ 2, 6, 8, 14, 0, 0, 0, 0},
{ 0, 2, 6, 8, 14, 0, 0, 0},
{ 4, 6, 8, 14, 0, 0, 0, 0},
{ 0, 4, 6, 8, 14, 0, 0, 0},
{ 2, 4, 6, 8, 14, 0, 0, 0},
{ 0, 2, 4, 6, 8, 14, 0, 0},
{10, 14, 0, 0, 0, 0, 0, 0},
{ 0, 10, 14, 0, 0, 0, 0, 0},
{ 2, 10, 14, 0, 0, 0, 0, 0},
{ 0, 2, 10, 14, 0, 0, 0, 0},
{ 4, 10, 14, 0, 0, 0, 0, 0},
{ 0, 4, 10, 14, 0, 0, 0, 0},
{ 2, 4, 10, 14, 0, 0, 0, 0},
{ 0, 2, 4, 10, 14, 0, 0, 0},
{ 6, 10, 14, 0, 0, 0, 0, 0},
{ 0, 6, 10, 14, 0, 0, 0, 0},
{ 2, 6, 10, 14, 0, 0, 0, 0},
{ 0, 2, 6, 10, 14, 0, 0, 0},
{ 4, 6, 10, 14, 0, 0, 0, 0},
{ 0, 4, 6, 10, 14, 0, 0, 0},
{ 2, 4, 6, 10, 14, 0, 0, 0},
{ 0, 2, 4, 6, 10, 14, 0, 0},
{ 8, 10, 14, 0, 0, 0, 0, 0},
{ 0, 8, 10, 14, 0, 0, 0, 0},
{ 2, 8, 10, 14, 0, 0, 0, 0},
{ 0, 2, 8, 10, 14, 0, 0, 0},
{ 4, 8, 10, 14, 0, 0, 0, 0},
{ 0, 4, 8, 10, 14, 0, 0, 0},
{ 2, 4, 8, 10, 14, 0, 0, 0},
{ 0, 2, 4, 8, 10, 14, 0, 0},
{ 6, 8, 10, 14, 0, 0, 0, 0},
{ 0, 6, 8, 10, 14, 0, 0, 0},
{ 2, 6, 8, 10, 14, 0, 0, 0},
{ 0, 2, 6, 8, 10, 14, 0, 0},
{ 4, 6, 8, 10, 14, 0, 0, 0},
{ 0, 4, 6, 8, 10, 14, 0, 0},
{ 2, 4, 6, 8, 10, 14, 0, 0},
{ 0, 2, 4, 6, 8, 10, 14, 0},
{12, 14, 0, 0, 0, 0, 0, 0},
{ 0, 12, 14, 0, 0, 0, 0, 0},
{ 2, 12, 14, 0, 0, 0, 0, 0},
{ 0, 2, 12, 14, 0, 0, 0, 0},
{ 4, 12, 14, 0, 0, 0, 0, 0},
{ 0, 4, 12, 14, 0, 0, 0, 0},
{ 2, 4, 12, 14, 0, 0, 0, 0},
{ 0, 2, 4, 12, 14, 0, 0, 0},
{ 6, 12, 14, 0, 0, 0, 0, 0},
{ 0, 6, 12, 14, 0, 0, 0, 0},
{ 2, 6, 12, 14, 0, 0, 0, 0},
{ 0, 2, 6, 12, 14, 0, 0, 0},
{ 4, 6, 12, 14, 0, 0, 0, 0},
{ 0, 4, 6, 12, 14, 0, 0, 0},
{ 2, 4, 6, 12, 14, 0, 0, 0},
{ 0, 2, 4, 6, 12, 14, 0, 0},
{ 8, 12, 14, 0, 0, 0, 0, 0},
{ 0, 8, 12, 14, 0, 0, 0, 0},
{ 2, 8, 12, 14, 0, 0, 0, 0},
{ 0, 2, 8, 12, 14, 0, 0, 0},
{ 4, 8, 12, 14, 0, 0, 0, 0},
{ 0, 4, 8, 12, 14, 0, 0, 0},
{ 2, 4, 8, 12, 14, 0, 0, 0},
{ 0, 2, 4, 8, 12, 14, 0, 0},
{ 6, 8, 12, 14, 0, 0, 0, 0},
{ 0, 6, 8, 12, 14, 0, 0, 0},
{ 2, 6, 8, 12, 14, 0, 0, 0},
{ 0, 2, 6, 8, 12, 14, 0, 0},
{ 4, 6, 8, 12, 14, 0, 0, 0},
{ 0, 4, 6, 8, 12, 14, 0, 0},
{ 2, 4, 6, 8, 12, 14, 0, 0},
{ 0, 2, 4, 6, 8, 12, 14, 0},
{10, 12, 14, 0, 0, 0, 0, 0},
{ 0, 10, 12, 14, 0, 0, 0, 0},
{ 2, 10, 12, 14, 0, 0, 0, 0},
{ 0, 2, 10, 12, 14, 0, 0, 0},
{ 4, 10, 12, 14, 0, 0, 0, 0},
{ 0, 4, 10, 12, 14, 0, 0, 0},
{ 2, 4, 10, 12, 14, 0, 0, 0},
{ 0, 2, 4, 10, 12, 14, 0, 0},
{ 6, 10, 12, 14, 0, 0, 0, 0},
{ 0, 6, 10, 12, 14, 0, 0, 0},
{ 2, 6, 10, 12, 14, 0, 0, 0},
{ 0, 2, 6, 10, 12, 14, 0, 0},
{ 4, 6, 10, 12, 14, 0, 0, 0},
{ 0, 4, 6, 10, 12, 14, 0, 0},
{ 2, 4, 6, 10, 12, 14, 0, 0},
{ 0, 2, 4, 6, 10, 12, 14, 0},
{ 8, 10, 12, 14, 0, 0, 0, 0},
{ 0, 8, 10, 12, 14, 0, 0, 0},
{ 2, 8, 10, 12, 14, 0, 0, 0},
{ 0, 2, 8, 10, 12, 14, 0, 0},
{ 4, 8, 10, 12, 14, 0, 0, 0},
{ 0, 4, 8, 10, 12, 14, 0, 0},
{ 2, 4, 8, 10, 12, 14, 0, 0},
{ 0, 2, 4, 8, 10, 12, 14, 0},
{ 6, 8, 10, 12, 14, 0, 0, 0},
{ 0, 6, 8, 10, 12, 14, 0, 0},
{ 2, 6, 8, 10, 12, 14, 0, 0},
{ 0, 2, 6, 8, 10, 12, 14, 0},
{ 4, 6, 8, 10, 12, 14, 0, 0},
{ 0, 4, 6, 8, 10, 12, 14, 0},
{ 2, 4, 6, 8, 10, 12, 14, 0},
{ 0, 2, 4, 6, 8, 10, 12, 14}
/*
 * idx: 256-row lookup table, one 8-entry row per 8-bit mask value m.
 * Row m lists, in ascending order, the value 2*i for every bit i that is set
 * in m; the unused trailing slots of the row are filled with -1 (which is
 * stored as 0xFF once converted to uint8_t).
 * The initializer fills the .arr member of the ALIGN32_ARRAY_2D object.
 * NOTE(review): the rejection sampler that follows appears to load one row
 * with _mm_loadl_epi64 and expand it into a byte-shuffle control that
 * compacts the accepted 16-bit lanes -- confirm against the caller.
 */
static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = {
{-1, -1, -1, -1, -1, -1, -1, -1},
{ 0, -1, -1, -1, -1, -1, -1, -1},
{ 2, -1, -1, -1, -1, -1, -1, -1},
{ 0, 2, -1, -1, -1, -1, -1, -1},
{ 4, -1, -1, -1, -1, -1, -1, -1},
{ 0, 4, -1, -1, -1, -1, -1, -1},
{ 2, 4, -1, -1, -1, -1, -1, -1},
{ 0, 2, 4, -1, -1, -1, -1, -1},
{ 6, -1, -1, -1, -1, -1, -1, -1},
{ 0, 6, -1, -1, -1, -1, -1, -1},
{ 2, 6, -1, -1, -1, -1, -1, -1},
{ 0, 2, 6, -1, -1, -1, -1, -1},
{ 4, 6, -1, -1, -1, -1, -1, -1},
{ 0, 4, 6, -1, -1, -1, -1, -1},
{ 2, 4, 6, -1, -1, -1, -1, -1},
{ 0, 2, 4, 6, -1, -1, -1, -1},
{ 8, -1, -1, -1, -1, -1, -1, -1},
{ 0, 8, -1, -1, -1, -1, -1, -1},
{ 2, 8, -1, -1, -1, -1, -1, -1},
{ 0, 2, 8, -1, -1, -1, -1, -1},
{ 4, 8, -1, -1, -1, -1, -1, -1},
{ 0, 4, 8, -1, -1, -1, -1, -1},
{ 2, 4, 8, -1, -1, -1, -1, -1},
{ 0, 2, 4, 8, -1, -1, -1, -1},
{ 6, 8, -1, -1, -1, -1, -1, -1},
{ 0, 6, 8, -1, -1, -1, -1, -1},
{ 2, 6, 8, -1, -1, -1, -1, -1},
{ 0, 2, 6, 8, -1, -1, -1, -1},
{ 4, 6, 8, -1, -1, -1, -1, -1},
{ 0, 4, 6, 8, -1, -1, -1, -1},
{ 2, 4, 6, 8, -1, -1, -1, -1},
{ 0, 2, 4, 6, 8, -1, -1, -1},
{10, -1, -1, -1, -1, -1, -1, -1},
{ 0, 10, -1, -1, -1, -1, -1, -1},
{ 2, 10, -1, -1, -1, -1, -1, -1},
{ 0, 2, 10, -1, -1, -1, -1, -1},
{ 4, 10, -1, -1, -1, -1, -1, -1},
{ 0, 4, 10, -1, -1, -1, -1, -1},
{ 2, 4, 10, -1, -1, -1, -1, -1},
{ 0, 2, 4, 10, -1, -1, -1, -1},
{ 6, 10, -1, -1, -1, -1, -1, -1},
{ 0, 6, 10, -1, -1, -1, -1, -1},
{ 2, 6, 10, -1, -1, -1, -1, -1},
{ 0, 2, 6, 10, -1, -1, -1, -1},
{ 4, 6, 10, -1, -1, -1, -1, -1},
{ 0, 4, 6, 10, -1, -1, -1, -1},
{ 2, 4, 6, 10, -1, -1, -1, -1},
{ 0, 2, 4, 6, 10, -1, -1, -1},
{ 8, 10, -1, -1, -1, -1, -1, -1},
{ 0, 8, 10, -1, -1, -1, -1, -1},
{ 2, 8, 10, -1, -1, -1, -1, -1},
{ 0, 2, 8, 10, -1, -1, -1, -1},
{ 4, 8, 10, -1, -1, -1, -1, -1},
{ 0, 4, 8, 10, -1, -1, -1, -1},
{ 2, 4, 8, 10, -1, -1, -1, -1},
{ 0, 2, 4, 8, 10, -1, -1, -1},
{ 6, 8, 10, -1, -1, -1, -1, -1},
{ 0, 6, 8, 10, -1, -1, -1, -1},
{ 2, 6, 8, 10, -1, -1, -1, -1},
{ 0, 2, 6, 8, 10, -1, -1, -1},
{ 4, 6, 8, 10, -1, -1, -1, -1},
{ 0, 4, 6, 8, 10, -1, -1, -1},
{ 2, 4, 6, 8, 10, -1, -1, -1},
{ 0, 2, 4, 6, 8, 10, -1, -1},
{12, -1, -1, -1, -1, -1, -1, -1},
{ 0, 12, -1, -1, -1, -1, -1, -1},
{ 2, 12, -1, -1, -1, -1, -1, -1},
{ 0, 2, 12, -1, -1, -1, -1, -1},
{ 4, 12, -1, -1, -1, -1, -1, -1},
{ 0, 4, 12, -1, -1, -1, -1, -1},
{ 2, 4, 12, -1, -1, -1, -1, -1},
{ 0, 2, 4, 12, -1, -1, -1, -1},
{ 6, 12, -1, -1, -1, -1, -1, -1},
{ 0, 6, 12, -1, -1, -1, -1, -1},
{ 2, 6, 12, -1, -1, -1, -1, -1},
{ 0, 2, 6, 12, -1, -1, -1, -1},
{ 4, 6, 12, -1, -1, -1, -1, -1},
{ 0, 4, 6, 12, -1, -1, -1, -1},
{ 2, 4, 6, 12, -1, -1, -1, -1},
{ 0, 2, 4, 6, 12, -1, -1, -1},
{ 8, 12, -1, -1, -1, -1, -1, -1},
{ 0, 8, 12, -1, -1, -1, -1, -1},
{ 2, 8, 12, -1, -1, -1, -1, -1},
{ 0, 2, 8, 12, -1, -1, -1, -1},
{ 4, 8, 12, -1, -1, -1, -1, -1},
{ 0, 4, 8, 12, -1, -1, -1, -1},
{ 2, 4, 8, 12, -1, -1, -1, -1},
{ 0, 2, 4, 8, 12, -1, -1, -1},
{ 6, 8, 12, -1, -1, -1, -1, -1},
{ 0, 6, 8, 12, -1, -1, -1, -1},
{ 2, 6, 8, 12, -1, -1, -1, -1},
{ 0, 2, 6, 8, 12, -1, -1, -1},
{ 4, 6, 8, 12, -1, -1, -1, -1},
{ 0, 4, 6, 8, 12, -1, -1, -1},
{ 2, 4, 6, 8, 12, -1, -1, -1},
{ 0, 2, 4, 6, 8, 12, -1, -1},
{10, 12, -1, -1, -1, -1, -1, -1},
{ 0, 10, 12, -1, -1, -1, -1, -1},
{ 2, 10, 12, -1, -1, -1, -1, -1},
{ 0, 2, 10, 12, -1, -1, -1, -1},
{ 4, 10, 12, -1, -1, -1, -1, -1},
{ 0, 4, 10, 12, -1, -1, -1, -1},
{ 2, 4, 10, 12, -1, -1, -1, -1},
{ 0, 2, 4, 10, 12, -1, -1, -1},
{ 6, 10, 12, -1, -1, -1, -1, -1},
{ 0, 6, 10, 12, -1, -1, -1, -1},
{ 2, 6, 10, 12, -1, -1, -1, -1},
{ 0, 2, 6, 10, 12, -1, -1, -1},
{ 4, 6, 10, 12, -1, -1, -1, -1},
{ 0, 4, 6, 10, 12, -1, -1, -1},
{ 2, 4, 6, 10, 12, -1, -1, -1},
{ 0, 2, 4, 6, 10, 12, -1, -1},
{ 8, 10, 12, -1, -1, -1, -1, -1},
{ 0, 8, 10, 12, -1, -1, -1, -1},
{ 2, 8, 10, 12, -1, -1, -1, -1},
{ 0, 2, 8, 10, 12, -1, -1, -1},
{ 4, 8, 10, 12, -1, -1, -1, -1},
{ 0, 4, 8, 10, 12, -1, -1, -1},
{ 2, 4, 8, 10, 12, -1, -1, -1},
{ 0, 2, 4, 8, 10, 12, -1, -1},
{ 6, 8, 10, 12, -1, -1, -1, -1},
{ 0, 6, 8, 10, 12, -1, -1, -1},
{ 2, 6, 8, 10, 12, -1, -1, -1},
{ 0, 2, 6, 8, 10, 12, -1, -1},
{ 4, 6, 8, 10, 12, -1, -1, -1},
{ 0, 4, 6, 8, 10, 12, -1, -1},
{ 2, 4, 6, 8, 10, 12, -1, -1},
{ 0, 2, 4, 6, 8, 10, 12, -1},
{14, -1, -1, -1, -1, -1, -1, -1},
{ 0, 14, -1, -1, -1, -1, -1, -1},
{ 2, 14, -1, -1, -1, -1, -1, -1},
{ 0, 2, 14, -1, -1, -1, -1, -1},
{ 4, 14, -1, -1, -1, -1, -1, -1},
{ 0, 4, 14, -1, -1, -1, -1, -1},
{ 2, 4, 14, -1, -1, -1, -1, -1},
{ 0, 2, 4, 14, -1, -1, -1, -1},
{ 6, 14, -1, -1, -1, -1, -1, -1},
{ 0, 6, 14, -1, -1, -1, -1, -1},
{ 2, 6, 14, -1, -1, -1, -1, -1},
{ 0, 2, 6, 14, -1, -1, -1, -1},
{ 4, 6, 14, -1, -1, -1, -1, -1},
{ 0, 4, 6, 14, -1, -1, -1, -1},
{ 2, 4, 6, 14, -1, -1, -1, -1},
{ 0, 2, 4, 6, 14, -1, -1, -1},
{ 8, 14, -1, -1, -1, -1, -1, -1},
{ 0, 8, 14, -1, -1, -1, -1, -1},
{ 2, 8, 14, -1, -1, -1, -1, -1},
{ 0, 2, 8, 14, -1, -1, -1, -1},
{ 4, 8, 14, -1, -1, -1, -1, -1},
{ 0, 4, 8, 14, -1, -1, -1, -1},
{ 2, 4, 8, 14, -1, -1, -1, -1},
{ 0, 2, 4, 8, 14, -1, -1, -1},
{ 6, 8, 14, -1, -1, -1, -1, -1},
{ 0, 6, 8, 14, -1, -1, -1, -1},
{ 2, 6, 8, 14, -1, -1, -1, -1},
{ 0, 2, 6, 8, 14, -1, -1, -1},
{ 4, 6, 8, 14, -1, -1, -1, -1},
{ 0, 4, 6, 8, 14, -1, -1, -1},
{ 2, 4, 6, 8, 14, -1, -1, -1},
{ 0, 2, 4, 6, 8, 14, -1, -1},
{10, 14, -1, -1, -1, -1, -1, -1},
{ 0, 10, 14, -1, -1, -1, -1, -1},
{ 2, 10, 14, -1, -1, -1, -1, -1},
{ 0, 2, 10, 14, -1, -1, -1, -1},
{ 4, 10, 14, -1, -1, -1, -1, -1},
{ 0, 4, 10, 14, -1, -1, -1, -1},
{ 2, 4, 10, 14, -1, -1, -1, -1},
{ 0, 2, 4, 10, 14, -1, -1, -1},
{ 6, 10, 14, -1, -1, -1, -1, -1},
{ 0, 6, 10, 14, -1, -1, -1, -1},
{ 2, 6, 10, 14, -1, -1, -1, -1},
{ 0, 2, 6, 10, 14, -1, -1, -1},
{ 4, 6, 10, 14, -1, -1, -1, -1},
{ 0, 4, 6, 10, 14, -1, -1, -1},
{ 2, 4, 6, 10, 14, -1, -1, -1},
{ 0, 2, 4, 6, 10, 14, -1, -1},
{ 8, 10, 14, -1, -1, -1, -1, -1},
{ 0, 8, 10, 14, -1, -1, -1, -1},
{ 2, 8, 10, 14, -1, -1, -1, -1},
{ 0, 2, 8, 10, 14, -1, -1, -1},
{ 4, 8, 10, 14, -1, -1, -1, -1},
{ 0, 4, 8, 10, 14, -1, -1, -1},
{ 2, 4, 8, 10, 14, -1, -1, -1},
{ 0, 2, 4, 8, 10, 14, -1, -1},
{ 6, 8, 10, 14, -1, -1, -1, -1},
{ 0, 6, 8, 10, 14, -1, -1, -1},
{ 2, 6, 8, 10, 14, -1, -1, -1},
{ 0, 2, 6, 8, 10, 14, -1, -1},
{ 4, 6, 8, 10, 14, -1, -1, -1},
{ 0, 4, 6, 8, 10, 14, -1, -1},
{ 2, 4, 6, 8, 10, 14, -1, -1},
{ 0, 2, 4, 6, 8, 10, 14, -1},
{12, 14, -1, -1, -1, -1, -1, -1},
{ 0, 12, 14, -1, -1, -1, -1, -1},
{ 2, 12, 14, -1, -1, -1, -1, -1},
{ 0, 2, 12, 14, -1, -1, -1, -1},
{ 4, 12, 14, -1, -1, -1, -1, -1},
{ 0, 4, 12, 14, -1, -1, -1, -1},
{ 2, 4, 12, 14, -1, -1, -1, -1},
{ 0, 2, 4, 12, 14, -1, -1, -1},
{ 6, 12, 14, -1, -1, -1, -1, -1},
{ 0, 6, 12, 14, -1, -1, -1, -1},
{ 2, 6, 12, 14, -1, -1, -1, -1},
{ 0, 2, 6, 12, 14, -1, -1, -1},
{ 4, 6, 12, 14, -1, -1, -1, -1},
{ 0, 4, 6, 12, 14, -1, -1, -1},
{ 2, 4, 6, 12, 14, -1, -1, -1},
{ 0, 2, 4, 6, 12, 14, -1, -1},
{ 8, 12, 14, -1, -1, -1, -1, -1},
{ 0, 8, 12, 14, -1, -1, -1, -1},
{ 2, 8, 12, 14, -1, -1, -1, -1},
{ 0, 2, 8, 12, 14, -1, -1, -1},
{ 4, 8, 12, 14, -1, -1, -1, -1},
{ 0, 4, 8, 12, 14, -1, -1, -1},
{ 2, 4, 8, 12, 14, -1, -1, -1},
{ 0, 2, 4, 8, 12, 14, -1, -1},
{ 6, 8, 12, 14, -1, -1, -1, -1},
{ 0, 6, 8, 12, 14, -1, -1, -1},
{ 2, 6, 8, 12, 14, -1, -1, -1},
{ 0, 2, 6, 8, 12, 14, -1, -1},
{ 4, 6, 8, 12, 14, -1, -1, -1},
{ 0, 4, 6, 8, 12, 14, -1, -1},
{ 2, 4, 6, 8, 12, 14, -1, -1},
{ 0, 2, 4, 6, 8, 12, 14, -1},
{10, 12, 14, -1, -1, -1, -1, -1},
{ 0, 10, 12, 14, -1, -1, -1, -1},
{ 2, 10, 12, 14, -1, -1, -1, -1},
{ 0, 2, 10, 12, 14, -1, -1, -1},
{ 4, 10, 12, 14, -1, -1, -1, -1},
{ 0, 4, 10, 12, 14, -1, -1, -1},
{ 2, 4, 10, 12, 14, -1, -1, -1},
{ 0, 2, 4, 10, 12, 14, -1, -1},
{ 6, 10, 12, 14, -1, -1, -1, -1},
{ 0, 6, 10, 12, 14, -1, -1, -1},
{ 2, 6, 10, 12, 14, -1, -1, -1},
{ 0, 2, 6, 10, 12, 14, -1, -1},
{ 4, 6, 10, 12, 14, -1, -1, -1},
{ 0, 4, 6, 10, 12, 14, -1, -1},
{ 2, 4, 6, 10, 12, 14, -1, -1},
{ 0, 2, 4, 6, 10, 12, 14, -1},
{ 8, 10, 12, 14, -1, -1, -1, -1},
{ 0, 8, 10, 12, 14, -1, -1, -1},
{ 2, 8, 10, 12, 14, -1, -1, -1},
{ 0, 2, 8, 10, 12, 14, -1, -1},
{ 4, 8, 10, 12, 14, -1, -1, -1},
{ 0, 4, 8, 10, 12, 14, -1, -1},
{ 2, 4, 8, 10, 12, 14, -1, -1},
{ 0, 2, 4, 8, 10, 12, 14, -1},
{ 6, 8, 10, 12, 14, -1, -1, -1},
{ 0, 6, 8, 10, 12, 14, -1, -1},
{ 2, 6, 8, 10, 12, 14, -1, -1},
{ 0, 2, 6, 8, 10, 12, 14, -1},
{ 4, 6, 8, 10, 12, 14, -1, -1},
{ 0, 4, 6, 8, 10, 12, 14, -1},
{ 2, 4, 6, 8, 10, 12, 14, -1},
{ 0, 2, 4, 6, 8, 10, 12, 14}
}
};

#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)

size_t PQCLEAN_KYBER102490S_AVX2_rej_uniform(int16_t *r,
size_t len,
const uint8_t *buf,
size_t buflen) {
size_t ctr, pos;
uint16_t val;
uint32_t good0, good1, good2;
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison
#define REJ_UNIFORM_BUFLEN 576
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r,
const uint8_t *restrict buf) {
unsigned int ctr = 0, pos = 0;
uint16_t val = 0;
uint32_t good = 0;
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1));
const __m256i ones = _mm256_set1_epi8(1);
const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_16xq.as_vec);
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_16xv.as_vec);
__m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2;
__m128i d, tmp, pilo, pihi;
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XQ]);
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XV]);
__m256i f0, f1, g0, g1, g2, g3;
__m128i f, t, pilo, pihi;

ctr = pos = 0;
while (ctr + 48 <= len && pos + 96 <= buflen) {
d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]);
d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]);
d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]);
ctr = 0;
for (pos = 0; pos < 2 * KYBER_N; pos += 64) {
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]);
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]);

tmp0 = _mm256_cmpge_epu16(bound, d0);
tmp1 = _mm256_cmpge_epu16(bound, d1);
tmp2 = _mm256_cmpge_epu16(bound, d2);
good0 = (uint32_t)_mm256_movemask_epi8(tmp0);
good1 = (uint32_t)_mm256_movemask_epi8(tmp1);
good2 = (uint32_t)_mm256_movemask_epi8(tmp2);
good0 = _pext_u32(good0, 0x55555555);
good1 = _pext_u32(good1, 0x55555555);
good2 = _pext_u32(good2, 0x55555555);
g0 = _mm256_cmpge_epu16(bound, f0);
g1 = _mm256_cmpge_epu16(bound, f1);

pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]);
pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]);
pi0 = _mm256_castsi128_si256(pilo);
pi0 = _mm256_inserti128_si256(pi0, pihi, 1);
g0 = _mm256_packs_epi16(g0, g1);
good = _mm256_movemask_epi8(g0);

pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]);
pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]);
pi1 = _mm256_castsi128_si256(pilo);
pi1 = _mm256_inserti128_si256(pi1, pihi, 1);
g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF]));
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF]));
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1);
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1);

pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]);
pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]);
pi2 = _mm256_castsi128_si256(pilo);
pi2 = _mm256_inserti128_si256(pi2, pihi, 1);
//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good));
//g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8);

tmp0 = _mm256_add_epi8(pi0, ones);
tmp1 = _mm256_add_epi8(pi1, ones);
tmp2 = _mm256_add_epi8(pi2, ones);
pi0 = _mm256_unpacklo_epi8(pi0, tmp0);
pi1 = _mm256_unpacklo_epi8(pi1, tmp1);
pi2 = _mm256_unpacklo_epi8(pi2, tmp2);
/* Barrett reduction of (still unsigned) values */
g2 = _mm256_mulhi_epu16(f0, v);
g3 = _mm256_mulhi_epu16(f1, v);
g2 = _mm256_srli_epi16(g2, 10);
g3 = _mm256_srli_epi16(g3, 10);
g2 = _mm256_mullo_epi16(g2, kyberq);
g3 = _mm256_mullo_epi16(g3, kyberq);
f0 = _mm256_sub_epi16(f0, g2);
f1 = _mm256_sub_epi16(f1, g3);

d0 = _mm256_shuffle_epi8(d0, pi0);
d1 = _mm256_shuffle_epi8(d1, pi1);
d2 = _mm256_shuffle_epi8(d2, pi2);
g2 = _mm256_add_epi8(g0, ones);
g3 = _mm256_add_epi8(g1, ones);
g0 = _mm256_unpacklo_epi8(g0, g2);
g1 = _mm256_unpacklo_epi8(g1, g3);

/* Barrett reduction of (still unsigned) d values */
tmp0 = _mm256_mulhi_epu16(d0, v);
tmp1 = _mm256_mulhi_epu16(d1, v);
tmp2 = _mm256_mulhi_epu16(d2, v);
tmp0 = _mm256_srli_epi16(tmp0, 10);
tmp1 = _mm256_srli_epi16(tmp1, 10);
tmp2 = _mm256_srli_epi16(tmp2, 10);
tmp0 = _mm256_mullo_epi16(tmp0, kyberq);
tmp1 = _mm256_mullo_epi16(tmp1, kyberq);
tmp2 = _mm256_mullo_epi16(tmp2, kyberq);
d0 = _mm256_sub_epi16(d0, tmp0);
d1 = _mm256_sub_epi16(d1, tmp1);
d2 = _mm256_sub_epi16(d2, tmp2);
f0 = _mm256_shuffle_epi8(f0, g0);
f1 = _mm256_shuffle_epi8(f1, g1);

_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0));
ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1));
ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1));
ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1));
ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2));
ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1));
ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF);
pos += 96;
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
}

while (ctr + 8 <= len && pos + 16 <= buflen) {
d = _mm_loadu_si128((__m128i *)&buf[pos]);
tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d);
good0 = (uint32_t)_mm_movemask_epi8(tmp);
good0 = _pext_u32(good0, 0x55555555);
pilo = _mm_loadl_epi64((__m128i *)&idx[good0]);
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) {
f = _mm_load_si128((__m128i *)&buf[pos]);
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f);
good = _mm_movemask_epi8(t);
good = _pext_u32(good, 0x5555);
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]);
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
pilo = _mm_unpacklo_epi8(pilo, pihi);
d = _mm_shuffle_epi8(d, pilo);

/* Barrett reduction */
tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v));
tmp = _mm_srli_epi16(tmp, 10);
tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq));
d = _mm_sub_epi16(d, tmp);
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v));
t = _mm_srli_epi16(t, 10);
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq));
f = _mm_sub_epi16(f, t);

_mm_storeu_si128((__m128i *)&r[ctr], d);
ctr += (unsigned int)_mm_popcnt_u32(good0);
f = _mm_shuffle_epi8(f, pilo);
_mm_storeu_si128((__m128i *)&r[ctr], f);
ctr += _mm_popcnt_u32(good);
pos += 16;
}

while (ctr < len && pos + 2 <= buflen) {
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;

if (val < 19 * KYBER_Q) {
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q;
r[ctr++] = (int16_t)val;
r[ctr++] = val;
}
}



+ 4
- 5
crypto_kem/kyber1024-90s/avx2/rejsample.h View File

@@ -1,12 +1,11 @@
#ifndef REJSAMPLE_H
#define REJSAMPLE_H

#include <stddef.h>
#include "params.h"
#include <stdint.h>

size_t PQCLEAN_KYBER102490S_AVX2_rej_uniform(int16_t *r,
size_t len,
const uint8_t *buf,
size_t buflen);

unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r,
const unsigned char *buf);

#endif

+ 255
- 0
crypto_kem/kyber1024-90s/avx2/shuffle.S View File

@@ -0,0 +1,255 @@
#include "cdecl.inc"
.include "fq.inc"
.include "shuffle.inc"

/*
nttpack_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11

shuffle2 3,4,10,4
shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11

shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11

shuffle8 9,10,7,10
shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11

#store
vmovdqa %ymm7,(%rdi)
vmovdqa %ymm9,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm3,96(%rdi)
vmovdqa %ymm10,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm5,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret
*/

.text
/*
 * nttunpack128_avx (file-local helper; no .global directive)
 *
 * Loads eight 32-byte vectors (256 bytes, aligned loads) from the buffer at
 * %rdi, passes them through one round each of the shuffle8, shuffle4,
 * shuffle2 and shuffle1 macros (shuffle.inc) and stores the eight results
 * back to the same 256 bytes, i.e. the permutation is performed in place.
 *
 * In:       %rdi = buffer address (not modified by the instructions here)
 * Clobbers: %ymm3-%ymm11 directly; the shuffle macros may use additional
 *           scratch registers (the shuffle2 definition writes %ymm12 and
 *           %ymm13).
 */
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

/* Macro arguments are ymm register numbers declared as r0,r1,r2,r3;
   NOTE(review): r0/r1 look like the inputs and r2/r3 the outputs --
   confirm in shuffle.inc. */
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

shuffle1 9,5,10,5
shuffle1 8,4,9,4
shuffle1 7,3,8,3
shuffle1 6,11,7,11

/* The results end up in %ymm10,5,9,4,8,3,7,11 and are written out in that
   order. */
#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm9,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm3,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

/*
 * PQCLEAN_KYBER102490S_AVX2_nttunpack_avx (exported)
 *
 * Runs nttunpack128_avx in place on two consecutive 256-byte halves of the
 * buffer at %rdi (512 bytes in total).
 *
 * In: %rdi = buffer address; it has been advanced by 256 when this returns.
 * NOTE(review): cdecl() comes from cdecl.inc and presumably applies the
 * platform's symbol-name decoration (e.g. a leading underscore on macOS) --
 * confirm there.
 */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_nttunpack_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_nttunpack_avx):
call nttunpack128_avx
/* step to the second 256-byte half */
add $256,%rdi
call nttunpack128_avx
ret

/*
 * ntttobytes128_avx (file-local helper; no .global directive)
 *
 * Reads eight 32-byte vectors (256 bytes, aligned loads) from %rsi, applies
 * the csubq macro (fq.inc) to each of them, packs the 16-bit lanes of every
 * four vectors into three vectors, runs the six packed vectors through the
 * shuffle1/2/4/8 macros (shuffle.inc) and writes them (192 bytes, unaligned
 * stores) to %rdi.
 *
 * In:  %rsi = source, %rdi = destination (neither is modified by the
 *      instructions here).
 *      %ymm0 = constant loaded by the caller (the exported wrapper loads the
 *      _16XQ table entry); presumably consumed by csubq -- confirm in fq.inc.
 * Clobbers: %ymm3-%ymm12 directly, plus whatever csubq and the shuffle
 *      macros use (the csubq invocations pass %ymm13, %ymm14, %ymm15 and
 *      %ymm1 as second argument, presumably as scratch).
 */
ntttobytes128_avx:
#load
vmovdqa (%rsi),%ymm5
vmovdqa 32(%rsi),%ymm6
vmovdqa 64(%rsi),%ymm7
vmovdqa 96(%rsi),%ymm8
vmovdqa 128(%rsi),%ymm9
vmovdqa 160(%rsi),%ymm10
vmovdqa 192(%rsi),%ymm11
vmovdqa 224(%rsi),%ymm12

/* NOTE(review): csubq is presumably a conditional subtraction of q that
   leaves every lane below q -- confirm in fq.inc. */
#csubq
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,1
csubq 9,13
csubq 10,14
csubq 11,15
csubq 12,1

/* Per 16-bit lane, with a,b,c,d = ymm5,ymm6,ymm7,ymm8:
     ymm4 = a | (b << 12)
     ymm5 = (b >> 4) | (c << 8)
     ymm6 = (c >> 8) | (d << 4)
   This is a lossless 4x12-bit -> 3x16-bit packing only if every lane value
   fits in 12 bits -- TODO confirm that csubq guarantees this. */
#bitpack
vpsllw $12,%ymm6,%ymm4
vpor %ymm4,%ymm5,%ymm4

vpsrlw $4,%ymm6,%ymm5
vpsllw $8,%ymm7,%ymm6
vpor %ymm5,%ymm6,%ymm5

vpsrlw $8,%ymm7,%ymm6
vpsllw $4,%ymm8,%ymm7
vpor %ymm6,%ymm7,%ymm6

/* Same packing for ymm9..ymm12, producing ymm7, ymm8, ymm9. */
vpsllw $12,%ymm10,%ymm7
vpor %ymm7,%ymm9,%ymm7

vpsrlw $4,%ymm10,%ymm8
vpsllw $8,%ymm11,%ymm9
vpor %ymm8,%ymm9,%ymm8

vpsrlw $8,%ymm11,%ymm9
vpsllw $4,%ymm12,%ymm10
vpor %ymm9,%ymm10,%ymm9

/* Permute the six packed vectors (ymm4..ymm9) before they are written out. */
shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9

shuffle2 3,4,8,4
shuffle2 6,5,3,5
shuffle2 7,9,6,9

shuffle4 8,3,7,3
shuffle4 6,4,8,4
shuffle4 5,9,6,9

shuffle8 7,8,5,8
shuffle8 6,3,7,3
shuffle8 4,9,6,9

/* Unaligned stores: the destination is written with vmovdqu, so no 32-byte
   alignment is required of %rdi. */
#store
vmovdqu %ymm5,(%rdi)
vmovdqu %ymm7,32(%rdi)
vmovdqu %ymm6,64(%rdi)
vmovdqu %ymm8,96(%rdi)
vmovdqu %ymm3,128(%rdi)
vmovdqu %ymm9,160(%rdi)

ret

/*
 * PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx (exported)
 *
 * Loads one 32-byte constant into %ymm0 and calls ntttobytes128_avx twice,
 * converting 2 x 256 source bytes at %rsi into 2 x 192 output bytes at %rdi.
 *
 * In: %rdi = destination (384 bytes written)
 *     %rsi = source (512 bytes read with aligned loads)
 *     %rdx = base address of the constant table; the vector at byte offset
 *            _16XQ*2 is loaded with an aligned load.
 *            NOTE(review): the *2 suggests _16XQ is an index in 16-bit
 *            elements -- confirm against the definition of _16XQ.
 * On return %rsi has been advanced by 256 and %rdi by 192.
 */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx):
#consts
vmovdqa _16XQ*2(%rdx),%ymm0
call ntttobytes128_avx
/* second half: next 256 input bytes, next 192 output bytes */
add $256,%rsi
add $192,%rdi
call ntttobytes128_avx
ret

/*
 * nttfrombytes128_avx (file-local helper; no .global directive)
 *
 * Reads six 32-byte vectors (192 bytes, unaligned loads) from %rsi, runs them
 * through the shuffle8/4/2/1 macros (shuffle.inc), expands every three
 * vectors into four by shifting, OR-ing and masking 16-bit lanes with %ymm0,
 * and writes the eight resulting vectors (256 bytes, aligned stores) to %rdi.
 *
 * In:  %rsi = source, %rdi = destination (neither is modified by the
 *      instructions here).
 *      %ymm0 = lane mask loaded by the caller (the exported wrapper loads the
 *      _16XMASK table entry; presumably 0x0FFF per lane -- confirm).
 * Clobbers: %ymm1 and %ymm3-%ymm15 (the shuffle2 definition also writes
 *      %ymm12/%ymm13 as scratch).
 */
nttfrombytes128_avx:
#load
vmovdqu (%rsi),%ymm4
vmovdqu 32(%rsi),%ymm5
vmovdqu 64(%rsi),%ymm6
vmovdqu 96(%rsi),%ymm7
vmovdqu 128(%rsi),%ymm8
vmovdqu 160(%rsi),%ymm9

shuffle8 4,7,3,7
shuffle8 5,8,4,8
shuffle8 6,9,5,9

shuffle4 3,8,6,8
shuffle4 7,5,3,5
shuffle4 4,9,7,9

shuffle2 6,5,4,5
shuffle2 8,7,6,7
shuffle2 3,9,8,9

shuffle1 4,7,10,7
shuffle1 5,8,4,8
shuffle1 6,9,5,9

/* After the shuffles the packed words sit in ymm10, ymm7, ymm4 (first
   group) and ymm8, ymm5, ymm9 (second group).
   Per 16-bit lane, with w0,w1,w2 = ymm10,ymm7,ymm4 and m = ymm0:
     ymm10 = w0 & m
     ymm11 = ((w0 >> 12) | (w1 << 4)) & m
     ymm12 = ((w1 >> 8)  | (w2 << 8)) & m
     ymm13 = (w2 >> 4) & m */
#bitunpack
vpsrlw $12,%ymm10,%ymm11
vpsllw $4,%ymm7,%ymm12
vpor %ymm11,%ymm12,%ymm11
vpand %ymm0,%ymm10,%ymm10
vpand %ymm0,%ymm11,%ymm11

vpsrlw $8,%ymm7,%ymm12
vpsllw $8,%ymm4,%ymm13
vpor %ymm12,%ymm13,%ymm12
vpand %ymm0,%ymm12,%ymm12

vpsrlw $4,%ymm4,%ymm13
vpand %ymm0,%ymm13,%ymm13

/* Same expansion for w0,w1,w2 = ymm8,ymm5,ymm9, producing ymm8, ymm14,
   ymm15 and ymm1. */
vpsrlw $12,%ymm8,%ymm14
vpsllw $4,%ymm5,%ymm15
vpor %ymm14,%ymm15,%ymm14
vpand %ymm0,%ymm8,%ymm8
vpand %ymm0,%ymm14,%ymm14

vpsrlw $8,%ymm5,%ymm15
vpsllw $8,%ymm9,%ymm1
vpor %ymm15,%ymm1,%ymm15
vpand %ymm0,%ymm15,%ymm15

vpsrlw $4,%ymm9,%ymm1
vpand %ymm0,%ymm1,%ymm1

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm11,32(%rdi)
vmovdqa %ymm12,64(%rdi)
vmovdqa %ymm13,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm14,160(%rdi)
vmovdqa %ymm15,192(%rdi)
vmovdqa %ymm1,224(%rdi)

ret

/*
 * PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx (exported)
 *
 * Loads one 32-byte mask constant into %ymm0 and calls nttfrombytes128_avx
 * twice, expanding 2 x 192 input bytes at %rsi into 2 x 256 output bytes at
 * %rdi.
 *
 * In: %rdi = destination (512 bytes written with aligned stores)
 *     %rsi = source (384 bytes read with unaligned loads)
 *     %rdx = base address of the constant table; the vector at byte offset
 *            _16XMASK*2 is loaded with an aligned load.
 *            NOTE(review): the *2 suggests _16XMASK is an index in 16-bit
 *            elements -- confirm against the definition of _16XMASK.
 * On return %rdi has been advanced by 256 and %rsi by 192.
 */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx):
#consts
vmovdqa _16XMASK*2(%rdx),%ymm0
call nttfrombytes128_avx
/* second half: next 256 output bytes, next 192 input bytes */
add $256,%rdi
add $192,%rsi
call nttfrombytes128_avx
ret

+ 2
- 0
crypto_kem/kyber1024-90s/avx2/shuffle.inc View File

@@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle2 r0,r1,r2,r3
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2


+ 14
- 10
crypto_kem/kyber1024-90s/avx2/symmetric.h View File

@@ -2,22 +2,26 @@
#define SYMMETRIC_H

#include "params.h"
#include <stddef.h>
#include <stdint.h>


#include "aes256ctr.h"
#include "sha2.h"

#define hash_h(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES))
#define hash_g(OUT, IN, INBYTES) sha512((OUT), (IN), (INBYTES))
#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER102490S_AVX2_aes256ctr_init((STATE), (IN), (Y) + ((uint16_t)(X) << 8))
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks((OUT), (OUTBLOCKS), (STATE))
#define xof_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf((OUT), (OUTBYTES), (KEY), (NONCE))
#define kdf(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES))

#define XOF_BLOCKBYTES 128

typedef aes256ctr_ctx xof_state;

#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES

#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES)
#define xof_absorb(STATE, SEED, X, Y) \
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8))
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) \
PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)


#endif /* SYMMETRIC_H */

+ 20
- 21
crypto_kem/kyber1024-90s/avx2/verify.c View File

@@ -1,23 +1,22 @@
#include "verify.h"

#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>

/*************************************************
* Name: verify
* Name: PQCLEAN_KYBER102490S_AVX2_verify
*
* Description: Compare two arrays for equality in constant time.
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
* Arguments: const unsigned char *a: pointer to first byte array
* const unsigned char *b: pointer to second byte array
* size_t len: length of the byte arrays
*
* Returns 0 if the byte arrays are equal, 1 otherwise
**************************************************/
uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t pos;
uint64_t r;
int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t pos = 0;
uint64_t r = 0;
__m256i avec, bvec, cvec;

cvec = _mm256_setzero_si256();
@@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, siz
avec = _mm256_xor_si256(avec, bvec);
cvec = _mm256_or_si256(cvec, avec);
}
r = !_mm256_testz_si256(cvec, cvec);

cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256());
r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1);

while (pos < len) {
r |= a[pos] ^ b[pos];
pos += 1;
if (pos < len) {
avec = _mm256_loadu_si256((__m256i *)&a[pos]);
bvec = _mm256_loadu_si256((__m256i *)&b[pos]);
cvec = _mm256_cmpeq_epi8(avec, bvec);
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len));
}

r = (-r) >> 63;
return (uint8_t)r;
return r;
}

/*************************************************
* Name: cmov
* Name: PQCLEAN_KYBER102490S_AVX2_cmov
*
* Description: Copy len bytes from x to r if b is 1;
* don't modify r if b is 0. Requires b to be in {0,1};
* assumes two's complement representation of negative integers.
* Runs in constant time.
*
* Arguments: uint8_t *r: pointer to output byte array
* const uint8_t *x: pointer to input byte array
* Arguments: unsigned char *r: pointer to output byte array
* const unsigned char *x: pointer to input byte array
* size_t len: Amount of bytes to be copied
* uint8_t b: Condition bit; has to be in {0,1}
* unsigned char b: Condition bit; has to be in {0,1}
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
size_t pos;
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) {
size_t pos = 0;
__m256i xvec, rvec, bvec;

b = -b;
bvec = _mm256_set1_epi8((char)b);
bvec = _mm256_set1_epi8(b);

for (pos = 0; pos + 32 <= len; pos += 32) {
rvec = _mm256_loadu_si256((__m256i *)&r[pos]);


+ 6
- 3
crypto_kem/kyber1024-90s/avx2/verify.h View File

@@ -1,10 +1,13 @@
#ifndef VERIFY_H
#define VERIFY_H
#ifndef PQCLEAN_KYBER102490S_AVX2_VERIFY_H
#define PQCLEAN_KYBER102490S_AVX2_VERIFY_H

#include "params.h"
#include <stddef.h>
#include <stdint.h>

uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);

int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);


void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);



+ 1
- 11
crypto_kem/kyber1024-90s/clean/LICENSE View File

@@ -1,14 +1,4 @@
kyber-20170627
Public Domain
Authors: Joppe Bos,
Léo Ducas,
Eike Kiltz ,
Tancrède Lepoint,
Vadim Lyubashevsky,
John Schanck,
Peter Schwabe,
Gregor Seiler,
Damien Stehlé
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in


+ 23
- 2
crypto_kem/kyber1024-90s/clean/Makefile View File

@@ -1,8 +1,29 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libkyber1024-90s_clean.a
HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h aes256ctr.h
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o aes256ctr.o
HEADERS= \
api.h \
cbd.h \
indcpa.h \
kem.h \
ntt.h \
params.h \
poly.h \
polyvec.h \
reduce.h \
symmetric-aes.h \
symmetric.h \
verify.h
OBJECTS= \
cbd.o \
indcpa.o \
kem.o \
ntt.o \
poly.o \
polyvec.o \
reduce.o \
verify.o \
symmetric-aes.o

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)



+ 1
- 1
crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake View File

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libkyber1024-90s_clean.lib
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj aes256ctr.obj
# Object files that make up the library. Every entry uses the .obj suffix,
# which is what NMAKE's built-in C inference rule produces; a ".o" entry would
# have no rule to build it.
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-aes.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as


+ 13
- 15
crypto_kem/kyber1024-90s/clean/cbd.c View File

@@ -1,7 +1,5 @@
#include "cbd.h"
#include "params.h"

#include <stddef.h>
#include "cbd.h"
#include <stdint.h>

/*************************************************
@@ -14,8 +12,8 @@
*
* Returns 32-bit unsigned integer loaded from x
**************************************************/
static uint32_t load32_littleendian(const uint8_t *x) {
uint32_t r;
static uint32_t load32_littleendian(const uint8_t x[4]) {
uint32_t r = 0;
r = (uint32_t)x[0];
r |= (uint32_t)x[1] << 8;
r |= (uint32_t)x[2] << 16;
@@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) {
}

/*************************************************
* Name: cbd
* Name: PQCLEAN_KYBER102490S_CLEAN_cbd
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* specialized for KYBER_ETA=2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t *buf) {
uint32_t d, t;
int16_t a, b;
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) {
unsigned int i = 0, j = 0;
uint32_t t = 0, d = 0;
int16_t a = 0, b = 0;

for (size_t i = 0; i < KYBER_N / 8; i++) {
t = load32_littleendian(buf + 4 * i);
for (i = 0; i < KYBER_N / 8; i++) {
t = load32_littleendian(buf + 4 * i);
d = t & 0x55555555;
d += (t >> 1) & 0x55555555;

for (size_t j = 0; j < 8; j++) {
a = (d >> 4 * j) & 0x3;
for (j = 0; j < 8; j++) {
a = (d >> (4 * j + 0)) & 0x3;
b = (d >> (4 * j + 2)) & 0x3;
r->coeffs[8 * i + j] = a - b;
}


+ 6
- 3
crypto_kem/kyber1024-90s/clean/cbd.h View File

@@ -1,8 +1,11 @@
#ifndef CBD_H
#define CBD_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_CBD_H
#define PQCLEAN_KYBER102490S_CLEAN_CBD_H

#include "params.h"
#include "poly.h"
#include <stdint.h>

void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t *buf);

void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);

#endif

+ 118
- 83
crypto_kem/kyber1024-90s/clean/indcpa.c View File

@@ -5,7 +5,7 @@
#include "polyvec.h"
#include "randombytes.h"
#include "symmetric.h"
#include <stddef.h>
#include <stdint.h>

/*************************************************
@@ -16,12 +16,15 @@
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* const poly *pk: pointer to the input public-key polynomial
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
polyvec *pk,
const uint8_t seed[KYBER_SYMBYTES]) {
size_t i = 0;
PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(r, pk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
r[i + KYBER_POLYVECBYTES] = seed[i];
}
}
@@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
* Description: De-serialize public key from a byte array;
* approximate inverse of pack_pk
*
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials
* - uint8_t *seed: pointer to output seed to generate matrix A
* Arguments: - polyvec *pk: pointer to output public-key
* polynomial vector
* - uint8_t *seed: pointer to output seed to generate
* matrix A
* - const uint8_t *packedpk: pointer to input serialized public key
**************************************************/
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
static void unpack_pk(polyvec *pk,
uint8_t seed[KYBER_SYMBYTES],
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) {
size_t i = 0;
PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(pk, packedpk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
seed[i] = packedpk[i + KYBER_POLYVECBYTES];
}
}
@@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - const polyvec *sk: pointer to input vector of polynomials (secret key)
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t *r, polyvec *sk) {
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(r, sk);
}

@@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) {
* Description: De-serialize the secret key;
* inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* Arguments: - polyvec *sk: pointer to output vector of
* polynomials (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(sk, packedsk);
}

@@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* const poly *pk: pointer to the input vector of polynomials b
* const uint8_t *seed: pointer to the input polynomial v
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* polyvec *b: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(r, b);
PQCLEAN_KYBER102490S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) {
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(b, c);
PQCLEAN_KYBER102490S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}
@@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) {
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - size_t len: requested number of 16-bit integers (uniform mod q)
* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes)
* - size_t buflen: length of input buffer in bytes
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
**************************************************/
static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) {
size_t ctr, pos;
uint16_t val;
static unsigned int rej_uniform(int16_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr = 0, pos = 0;
uint16_t val = 0;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;

if (val < 19 * KYBER_Q) {
val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction
val -= (val >> 12) * KYBER_Q; // Barrett reduction
r[ctr++] = (int16_t)val;
}
}
@@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf
return ctr;
}

#define gen_a(A,B) gen_matrix(A,B,0)
#define gen_at(A,B) gen_matrix(A,B,1)
#define gen_a(A,B) PQCLEAN_KYBER102490S_CLEAN_gen_matrix(A,B,0)
#define gen_at(A,B) PQCLEAN_KYBER102490S_CLEAN_gen_matrix(A,B,1)

/*************************************************
* Name: gen_matrix
* Name: PQCLEAN_KYBER102490S_CLEAN_gen_matrix
*
* Description: Deterministically generate matrix A (or the transpose of A)
* from a seed. Entries of the matrix are polynomials that look
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - polyvec *a: pointer to output matrix A
* Arguments: - polyvec *a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
* - int transposed: boolean deciding whether A or A^T
* is generated
**************************************************/
#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
size_t ctr;
uint8_t i, j;
uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1];
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
// Not static for benchmarking
void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) {
unsigned int ctr = 0, i = 0, j = 0;
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES];
xof_state state;

for (i = 0; i < KYBER_K; i++) {
@@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
xof_absorb(&state, seed, j, i);
}

xof_squeezeblocks(buf, MAXNBLOCKS, &state);
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES);
xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf));

while (ctr < KYBER_N) {
xof_squeezeblocks(buf, 1, &state);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf,
XOF_BLOCKBYTES);
}
xof_ctx_release(&state);
}
@@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
}

/*************************************************
* Name: indcpa_keypair
* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair
*
* Description: Generates public and private key for the CPA-secure
* public-key encryption scheme underlying Kyber
*
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
* Arguments: - uint8_t *pk: pointer to output public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
polyvec a[KYBER_K], e, pkpv, skpv;
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
unsigned int i = 0;
uint8_t buf[2 * KYBER_SYMBYTES];
uint8_t *publicseed = buf;
uint8_t *noiseseed = buf + KYBER_SYMBYTES;
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
uint8_t nonce = 0;
polyvec a[KYBER_K], e, pkpv, skpv;

randombytes(buf, KYBER_SYMBYTES);
hash_g(buf, buf, KYBER_SYMBYTES);

gen_a(a, publicseed);

for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++);
}
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++);
}

PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&skpv);
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&e);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_CLEAN_poly_frommont(&pkpv.vec[i]);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_CLEAN_poly_tomont(&pkpv.vec[i]);
}

PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&pkpv, &pkpv, &e);
@@ -217,34 +243,40 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
}

/*************************************************
* Name: indcpa_enc
* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_enc
*
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
* to deterministically generate all randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins) {
polyvec sp, pkpv, ep, at[KYBER_K], bp;
poly v, k, epp;
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[KYBER_SYMBYTES]) {
unsigned int i = 0;
uint8_t seed[KYBER_SYMBYTES];
uint8_t nonce = 0;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
poly v, k, epp;

unpack_pk(&pkpv, seed, pk);
PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(&k, m);
gen_at(at, seed);

for (size_t i = 0; i < KYBER_K; i++) {
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++);
}
for (size_t i = 0; i < KYBER_K; i++) {
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++);
}
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&epp, coins, nonce++);
@@ -252,14 +284,14 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c,
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&sp);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
}

PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(&bp);
PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&v);
PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&v);

PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &epp);
@@ -271,18 +303,21 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c,
}

/*************************************************
* Name: indcpa_dec
* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_dec
*
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t *m,
const uint8_t *c,
const uint8_t *sk) {
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
poly v, mp;

@@ -290,8 +325,8 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t *m,
unpack_sk(&skpv, sk);

PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&bp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp);
PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&mp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&mp);

PQCLEAN_KYBER102490S_CLEAN_poly_sub(&mp, &v, &mp);
PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&mp);


+ 9
- 14
crypto_kem/kyber1024-90s/clean/indcpa.h View File

@@ -1,21 +1,16 @@
#ifndef INDCPA_H
#define INDCPA_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_INDCPA_H
#define PQCLEAN_KYBER102490S_CLEAN_INDCPA_H

#include "params.h"
#include "polyvec.h"
#include <stdint.h>

void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(
uint8_t *pk,
uint8_t *sk);
void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);

void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(
uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins);
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(
uint8_t *m,
const uint8_t *c,
const uint8_t *sk);
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);

#endif

+ 60
- 34
crypto_kem/kyber1024-90s/clean/kem.c View File

@@ -1,99 +1,125 @@
#include "api.h"
#include "indcpa.h"
#include "kem.h"
#include "params.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
#include <stddef.h>
#include <stdint.h>

#include <stdlib.h>
/*************************************************
* Name: crypto_kem_keypair
* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
size_t i;
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
size_t i = 0;
PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i];
}
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */
/* Value z for pseudo-random output on reject */
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES);
return 0;
}

/*************************************************
* Name: crypto_kem_enc
* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
uint8_t kr[2 * KYBER_SYMBYTES];

randombytes(buf, KYBER_SYMBYTES);
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */
/* Don't release system RNG output */
hash_h(buf, buf, KYBER_SYMBYTES);

hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */
/* Multitarget countermeasure for coins + contributory KEM */
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr, buf, 2 * KYBER_SYMBYTES);

PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */
/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES);

hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */
/* overwrite coins in kr with H(c) */
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr, 2 * KYBER_SYMBYTES);
return 0;
}

/*************************************************
* Name: crypto_kem_dec
* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
*
* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
size_t i;
uint8_t fail;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
size_t i = 0;
int fail = 0;
uint8_t buf[2 * KYBER_SYMBYTES];
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
/* Will contain key, coins */
uint8_t kr[2 * KYBER_SYMBYTES];
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;

PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(buf, ct, sk);

for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */
/* Multitarget countermeasure for coins + contributory KEM */
for (i = 0; i < KYBER_SYMBYTES; i++) {
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
}
hash_g(kr, buf, 2 * KYBER_SYMBYTES);

PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */
/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES);

fail = PQCLEAN_KYBER102490S_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES);

hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */
/* overwrite coins in kr with H(c) */
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);

PQCLEAN_KYBER102490S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */
/* Overwrite pre-k with z on re-encryption failure */
PQCLEAN_KYBER102490S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail);

kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr, 2 * KYBER_SYMBYTES);
return 0;
}

+ 19
- 0
crypto_kem/kyber1024-90s/clean/kem.h View File

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_KYBER102490S_CLEAN_KEM_H
#define PQCLEAN_KYBER102490S_CLEAN_KEM_H

#include "params.h"


int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);


int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk);


int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk);

#endif

+ 61
- 58
crypto_kem/kyber1024-90s/clean/ntt.c View File

@@ -1,11 +1,9 @@
#include "ntt.h"
#include "params.h"
#include "ntt.h"
#include "reduce.h"

#include <stddef.h>
#include <stdint.h>

/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and PQCLEAN_KYBER102490S_CLEAN_zetas_inv used in the number-theoretic transform:

#define KYBER_ROOT_OF_UNITY 17

@@ -17,12 +15,8 @@ static const uint16_t tree[128] = {
1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127};


static int16_t fqmul(int16_t a, int16_t b) {
return montgomery_reduce((int32_t)a*b);
}
7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
};

void init_ntt() {
unsigned int i, j, k;
@@ -33,40 +27,44 @@ void init_ntt() {
tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q);

for(i = 0; i < 128; ++i)
zetas[i] = tmp[tree[i]];
PQCLEAN_KYBER102490S_CLEAN_zetas[i] = tmp[tree[i]];

k = 0;
for(i = 64; i >= 1; i >>= 1)
for(j = i; j < 2*i; ++j)
zetas_inv[k++] = -tmp[128 - tree[j]];
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]];

zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q;
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q;
}

*/

const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128] = {
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468,
573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758,
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469,
2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054,
2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653,
3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254,
817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962,
2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017,
732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047,
1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226,
430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574,
1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349,
418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193,
1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459,
478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628
};

const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128] = {
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185,
1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512,
75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246,
1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103,
1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853,
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106,
1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756,
1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535,
1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465,
1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685,
1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235,
3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652,
1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853,
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552,
2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871,
829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171,
3127, 3042, 1907, 1836, 1517, 359, 758, 1441
};


/*************************************************
* Name: fqmul
*
@@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) {
}

/*************************************************
* Name: ntt
* Name: PQCLEAN_KYBER102490S_CLEAN_ntt
*
* Description: Inplace number-theoretic transform (NTT) in Rq
* input is in standard order, output is in bitreversed order
*
* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t poly[256]) {
size_t j, k = 1;
int16_t t, zeta;
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) {
unsigned int len = 0, start = 0, j = 0, k = 0;
int16_t t = 0, zeta = 0;

for (size_t len = 128; len >= 2; len >>= 1) {
for (size_t start = 0; start < 256; start = j + len) {
k = 1;
for (len = 128; len >= 2; len >>= 1) {
for (start = 0; start < 256; start = j + len) {
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k++];
for (j = start; j < start + len; ++j) {
t = fqmul(zeta, poly[j + len]);
poly[j + len] = poly[j] - t;
poly[j] = poly[j] + t;
t = fqmul(zeta, r[j + len]);
r[j + len] = r[j] - t;
r[j] = r[j] + t;
}
}
}
}

/*************************************************
* Name: invntt
* Name: PQCLEAN_KYBER102490S_CLEAN_invntt
*
* Description: Inplace inverse number-theoretic transform in Rq
* input is in bitreversed order, output is in standard order
* Description: Inplace inverse number-theoretic transform in Rq and
* multiplication by Montgomery factor 2^16.
* Input is in bitreversed order, output is in standard order
*
* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t poly[256]) {
size_t j, k = 0;
int16_t t, zeta;
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) {
unsigned int start = 0, len = 0, j = 0, k = 0;
int16_t t = 0, zeta = 0;

for (size_t len = 2; len <= 128; len <<= 1) {
for (size_t start = 0; start < 256; start = j + len) {
k = 0;
for (len = 2; len <= 128; len <<= 1) {
for (start = 0; start < 256; start = j + len) {
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++];
for (j = start; j < start + len; ++j) {
t = poly[j];
poly[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + poly[j + len]);
poly[j + len] = t - poly[j + len];
poly[j + len] = fqmul(zeta, poly[j + len]);
t = r[j];
r[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + r[j + len]);
r[j + len] = t - r[j + len];
r[j + len] = fqmul(zeta, r[j + len]);
}
}
}

for (j = 0; j < 256; ++j) {
poly[j] = fqmul(poly[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]);
r[j] = fqmul(r[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]);
}
}

/*************************************************
* Name: basemul
* Name: PQCLEAN_KYBER102490S_CLEAN_basemul
*
* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta))
* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
* used for multiplication of elements in Rq in NTT domain
*
* Arguments: - int16_t r[2]: pointer to the output polynomial
* Arguments: - int16_t r[2]: pointer to the output polynomial
* - const int16_t a[2]: pointer to the first factor
* - const int16_t b[2]: pointer to the second factor
* - int16_t zeta: integer defining the reduction polynomial
* - int16_t zeta: integer defining the reduction polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
r[0] = fqmul(a[1], b[1]);


+ 14
- 5
crypto_kem/kyber1024-90s/clean/ntt.h View File

@@ -1,13 +1,22 @@
#ifndef NTT_H
#define NTT_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_NTT_H
#define PQCLEAN_KYBER102490S_CLEAN_NTT_H

#include "params.h"
#include <stdint.h>


extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128];
extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetasinv[128];

void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t *poly);
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t *poly);

extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128];


void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]);


void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]);


void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);

#endif

+ 10
- 10
crypto_kem/kyber1024-90s/clean/params.h View File

@@ -1,8 +1,5 @@
#ifndef PARAMS_H
#define PARAMS_H


/* Don't change parameters below this line */
#ifndef PQCLEAN_KYBER102490S_CLEAN_PARAMS_H
#define PQCLEAN_KYBER102490S_CLEAN_PARAMS_H

#define KYBER_N 256
#define KYBER_Q 3329
@@ -12,9 +9,8 @@
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

#define KYBER_POLYBYTES 384
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_POLYBYTES 384
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_POLYCOMPRESSEDBYTES 160
@@ -23,10 +19,14 @@
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \
+ KYBER_POLYCOMPRESSEDBYTES)

#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */
/* 32 bytes of additional space to save H(pk) */
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \
+ KYBER_INDCPA_PUBLICKEYBYTES \
+ 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES

#endif

+ 146
- 132
crypto_kem/kyber1024-90s/clean/poly.c View File

@@ -1,120 +1,177 @@
#include "params.h"
#include "cbd.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "symmetric.h"

#include <stdint.h>

/*************************************************
* Name: poly_compress
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_compress
*
* Description: Compression and subsequent serialization of a polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes)
* - const poly *a: pointer to input polynomial
* Arguments: - uint8_t *r: pointer to output byte array
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t *r, poly *a) {
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) {
unsigned int i = 0, j = 0;
uint8_t t[8];
size_t k = 0;

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (size_t i = 0; i < KYBER_N; i += 8) {
for (size_t j = 0; j < 8; j++) {
t[j] = ((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}

r[k] = (uint8_t)( t[0] | (t[1] << 5));
r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4));
r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3));
k += 5;
r[0] = (t[0] >> 0) | (t[1] << 5);
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
r[2] = (t[3] >> 1) | (t[4] << 4);
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
r[4] = (t[6] >> 2) | (t[7] << 3);
r += 5;
}
}

/*************************************************
* Name: poly_decompress
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_decompress
*
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of poly_compress
* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_N; i += 8) {
r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5);
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) {
unsigned int i = 0;

unsigned int j = 0;
uint8_t t[8];
for (i = 0; i < KYBER_N / 8; i++) {
t[0] = (a[0] >> 0);
t[1] = (a[0] >> 5) | (a[1] << 3);
t[2] = (a[1] >> 2);
t[3] = (a[1] >> 7) | (a[2] << 1);
t[4] = (a[2] >> 4) | (a[3] << 4);
t[5] = (a[3] >> 1);
t[6] = (a[3] >> 6) | (a[4] << 2);
t[7] = (a[4] >> 3);
a += 5;

for (j = 0; j < 8; j++) {
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5;
}
}
}

/*************************************************
* Name: poly_tobytes
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tobytes
*
* Description: Serialization of a polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
* - const poly *a: pointer to input polynomial
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t *r, poly *a) {
int16_t t0, t1;
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
unsigned int i = 0;
uint16_t t0 = 0, t1 = 0;

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (size_t i = 0; i < KYBER_N / 2; i++) {
for (i = 0; i < KYBER_N / 2; i++) {
t0 = a->coeffs[2 * i];
t1 = a->coeffs[2 * i + 1];
r[3 * i] = t0 & 0xff;
r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4));
r[3 * i + 2] = (uint8_t)(t1 >> 4);
r[3 * i + 0] = (t0 >> 0);
r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
r[3 * i + 2] = (t1 >> 4);
}
}

/*************************************************
* Name: poly_frombytes
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_frombytes
*
* Description: De-serialization of a polynomial;
* inverse of poly_tobytes
* inverse of PQCLEAN_KYBER102490S_CLEAN_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_N / 2; i++) {
r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8);
r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4);
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) {
unsigned int i = 0;
for (i = 0; i < KYBER_N / 2; i++) {
r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF;
r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF;
}
}

/*************************************************
* Name: poly_getnoise
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_frommsg
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
unsigned int i = 0, j = 0;
int16_t mask = 0;

for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
mask = -(int16_t)((msg[i] >> j) & 1);
r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2);
}
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) {
unsigned int i = 0, j = 0;
uint16_t t = 0;

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (i = 0; i < KYBER_N / 8; i++) {
msg[i] = 0;
for (j = 0; j < 8; j++) {
t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
msg[i] |= t << j;
}
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) {
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
uint8_t buf[KYBER_ETA * KYBER_N / 4];

prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce);
prf(buf, sizeof(buf), seed, nonce);
PQCLEAN_KYBER102490S_CLEAN_cbd(r, buf);
}

/*************************************************
* Name: poly_ntt
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
@@ -128,20 +185,20 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r) {
}

/*************************************************
* Name: poly_invntt
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place;
* inputs assumed to be in bitreversed order, output in normal order
*
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r) {
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r) {
PQCLEAN_KYBER102490S_CLEAN_invntt(r->coeffs);
}

/*************************************************
* Name: poly_basemul
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery
*
* Description: Multiplication of two polynomials in NTT domain
*
@@ -149,68 +206,64 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r) {
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) {
for (size_t i = 0; i < KYBER_N / 4; ++i) {
PQCLEAN_KYBER102490S_CLEAN_basemul(
r->coeffs + 4 * i,
a->coeffs + 4 * i,
b->coeffs + 4 * i,
PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
PQCLEAN_KYBER102490S_CLEAN_basemul(
r->coeffs + 4 * i + 2,
a->coeffs + 4 * i + 2,
b->coeffs + 4 * i + 2,
-PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) {
unsigned int i = 0;
for (i = 0; i < KYBER_N / 4; i++) {
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2],
-PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
}
}

/*************************************************
* Name: poly_frommont
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tomont
*
* Description: Inplace conversion of all coefficients of a polynomial
* from Montgomery domain to normal domain
* from normal domain to Montgomery domain
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_frommont(poly *r) {
void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r) {
unsigned int i = 0;
const int16_t f = (1ULL << 32) % KYBER_Q;

for (size_t i = 0; i < KYBER_N; i++) {
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(
(int32_t)r->coeffs[i] * f);
for (i = 0; i < KYBER_N; i++) {
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f);
}
}

/*************************************************
* Name: poly_reduce
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_reduce
*
* Description: Applies Barrett reduction to all coefficients of a polynomial
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r) {
for (size_t i = 0; i < KYBER_N; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_N; i++) {
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(r->coeffs[i]);
}
}

/*************************************************
* Name: poly_csubq
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_csubq
*
* Description: Applies conditional subtraction of q to each coefficient of a polynomial
* for details of conditional subtraction of q see comments in reduce.c
* Description: Applies conditional subtraction of q to each coefficient
* of a polynomial. For details of conditional subtraction
* of q see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) {
for (size_t i = 0; i < KYBER_N; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_N; i++) {
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_csubq(r->coeffs[i]);
}
}

/*************************************************
* Name: poly_add
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_add
*
* Description: Add two polynomials
*
@@ -219,13 +272,14 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) {
for (size_t i = 0; i < KYBER_N; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_N; i++) {
r->coeffs[i] = a->coeffs[i] + b->coeffs[i];
}
}

/*************************************************
* Name: poly_sub
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_sub
*
* Description: Subtract two polynomials
*
@@ -234,48 +288,8 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b)
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) {
for (size_t i = 0; i < KYBER_N; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_N; i++) {
r->coeffs[i] = a->coeffs[i] - b->coeffs[i];
}
}

/*************************************************
* Name: poly_frommsg
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) {
uint16_t mask;

for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (size_t j = 0; j < 8; j++) {
mask = -((msg[i] >> j) & 1);
r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2);
}
}
}

/*************************************************
* Name: poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) {
uint16_t t;

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
msg[i] = 0;
for (size_t j = 0; j < 8; j++) {
t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
msg[i] |= t << j;
}
}
}

+ 28
- 13
crypto_kem/kyber1024-90s/clean/poly.h View File

@@ -1,9 +1,9 @@
#ifndef POLY_H
#define POLY_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_POLY_H
#define PQCLEAN_KYBER102490S_CLEAN_POLY_H

#include "params.h"

#include <stdint.h>

/*
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
@@ -12,26 +12,41 @@ typedef struct {
int16_t coeffs[KYBER_N];
} poly;

void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t *r, poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t *a);

void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t *r, poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);

void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);


void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);

void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);


void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);

void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]);
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a);

void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce);
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);


void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r);
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r);
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER102490S_CLEAN_poly_frommont(poly *r);

void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r);

void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b);

void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r);


void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r);

void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r);


void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b);

void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b);

#endif

+ 100
- 71
crypto_kem/kyber1024-90s/clean/polyvec.c View File

@@ -1,138 +1,163 @@
#include "polyvec.h"

#include "params.h"
#include "poly.h"

#include <stddef.h>
#include "polyvec.h"
#include <stdint.h>

/*************************************************
* Name: polyvec_compress
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_compress
*
* Description: Compress and serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - const polyvec *a: pointer to input vector of polynomials
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) {
unsigned int i = 0, j = 0, k = 0;

PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(a);

uint16_t t[8];
for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_N / 8; j++) {
for (size_t k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff;
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
{
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
}
}

r[11 * j + 0] = (uint8_t)t[0];
r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3));
r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6));
r[11 * j + 3] = (uint8_t)((t[2] >> 2));
r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1));
r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4));
r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7));
r[11 * j + 7] = (uint8_t)((t[5] >> 1));
r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2));
r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5));
r[11 * j + 10] = (uint8_t)((t[7] >> 3));
r[ 0] = (t[0] >> 0);
r[ 1] = (t[0] >> 8) | (t[1] << 3);
r[ 2] = (t[1] >> 5) | (t[2] << 6);
r[ 3] = (t[2] >> 2);
r[ 4] = (t[2] >> 10) | (t[3] << 1);
r[ 5] = (t[3] >> 7) | (t[4] << 4);
r[ 6] = (t[4] >> 4) | (t[5] << 7);
r[ 7] = (t[5] >> 1);
r[ 8] = (t[5] >> 9) | (t[6] << 2);
r[ 9] = (t[6] >> 6) | (t[7] << 5);
r[10] = (t[7] >> 3);
r += 11;
}
r += 352;
}
}

/*************************************************
* Name: polyvec_decompress
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress
*
* Description: De-serialize and decompress vector of polynomials;
* approximate inverse of polyvec_compress
* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_polyvec_compress
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES)
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_N / 8; j++) {
r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
unsigned int i = 0, j = 0, k = 0;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
a += 11;

for (k = 0; k < 8; k++) {
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
}
}
a += 352;
}
}

/*************************************************
* Name: polyvec_tobytes
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes
*
* Description: Serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES)
* - const polyvec *a: pointer to input vector of polynomials
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYVECBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) {
for (size_t i = 0; i < KYBER_K; i++) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
}
}

/*************************************************
* Name: polyvec_frombytes
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes
*
* Description: De-serialize vector of polynomials;
* inverse of polyvec_tobytes
* inverse of PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES)
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_K; i++) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES);
}
}

/*************************************************
* Name: polyvec_ntt
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt
*
* Description: Apply forward NTT to all elements of a vector of polynomials
*
* Arguments: - polyvec *r: pointer to in/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_ntt(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_invntt
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont
*
* Description: Apply inverse NTT to all elements of a vector of polynomials
* and multiply by Montgomery factor 2^16
*
* Arguments: - polyvec *r: pointer to in/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&r->vec[i]);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_pointwise_acc
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery
*
* Description: Pointwise multiply elements of a and b and accumulate into r
* Description: Pointwise multiply elements of a and b, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
unsigned int i = 0;
poly t;

PQCLEAN_KYBER102490S_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]);
for (size_t i = 1; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]);
PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
for (i = 1; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
PQCLEAN_KYBER102490S_CLEAN_poly_add(r, r, &t);
}

@@ -140,37 +165,40 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a,
}

/*************************************************
* Name: polyvec_reduce
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_csubq
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of each element of a vector of polynomials
* for details of conditional subtraction of q see comments in reduce.c
* for details of conditional subtraction of q see comments in
* reduce.c
*
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_add
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_add
*
* Description: Add vectors of polynomials
*
@@ -179,7 +207,8 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) {
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
}
}

+ 21
- 9
crypto_kem/kyber1024-90s/clean/polyvec.h View File

@@ -1,29 +1,41 @@
#ifndef POLYVEC_H
#define POLYVEC_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_POLYVEC_H
#define PQCLEAN_KYBER102490S_CLEAN_POLYVEC_H

#include "params.h"
#include "poly.h"

#include <stdint.h>

typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);


void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);


void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(polyvec *r);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r);


void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);


void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r);


void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);

#endif

+ 15
- 16
crypto_kem/kyber1024-90s/clean/reduce.c View File

@@ -1,32 +1,32 @@
#include "reduce.h"

#include "params.h"
#include "reduce.h"
#include <stdint.h>

/*************************************************
* Name: montgomery_reduce
* Name: PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce
*
* Description: Montgomery reduction; given a 32-bit integer a, computes
* 16-bit integer congruent to a * R^-1 mod q,
* where R=2^16
*
* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1}
* Arguments: - int32_t a: input integer to be reduced;
* has to be in {-q2^15,...,q2^15-1}
*
* Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q.
**************************************************/
int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) {
int32_t t;
int16_t u;
int32_t t = 0;
int16_t u = 0;

u = (int16_t)(a * (int64_t)QINV);
t = (int32_t)u * KYBER_Q;
t = a - t;
t >>= 16;
return (int16_t)t;
return t;
}

/*************************************************
* Name: barrett_reduce
* Name: PQCLEAN_KYBER102490S_CLEAN_barrett_reduce
*
* Description: Barrett reduction; given a 16-bit integer a, computes
* 16-bit integer congruent to a mod q in {0,...,q}
@@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) {
* Returns: integer in {0,...,q} congruent to a modulo q.
**************************************************/
int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a) {
int32_t t;
const int32_t v = (1U << 26) / KYBER_Q + 1;
int16_t t = 0;
const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q;

t = v * a;
t >>= 26;
t = (int32_t)v * a >> 26;
t *= KYBER_Q;
return a - (int16_t)t;
return a - t;
}

/*************************************************
* Name: csubq
* Name: PQCLEAN_KYBER102490S_CLEAN_csubq
*
* Description: Conditionally subtract q
*
* Arguments: - int16_t a: input integer
* Arguments: - int16_t x: input integer
*
* Returns: a - q if a >= q, else a
**************************************************/


+ 8
- 4
crypto_kem/kyber1024-90s/clean/reduce.h View File

@@ -1,15 +1,19 @@
#ifndef REDUCE_H
#define REDUCE_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_REDUCE_H
#define PQCLEAN_KYBER102490S_CLEAN_REDUCE_H

#include "params.h"
#include <stdint.h>

#define MONT 2285 // 2^16 % Q
#define QINV 62209 // q^(-1) mod 2^16
#define MONT 2285 // 2^16 mod q
#define QINV 62209 // q^-1 mod 2^16


int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a);


int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a);


int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a);

#endif

crypto_kem/kyber1024-90s/clean/aes256ctr.c → crypto_kem/kyber1024-90s/clean/symmetric-aes.c View File

@@ -1,4 +1,4 @@
#include "aes256ctr.h"
#include "symmetric-aes.h"
#include "aes.h"
#include <stddef.h>
#include <stdint.h>
@@ -14,7 +14,7 @@ static inline void br_enc32be(unsigned char *dst, uint32_t x) {
static void aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) {
uint8_t ivw[16];
uint8_t buf[AES_BLOCKBYTES];
size_t i;
size_t i = 0;

memcpy(ivw, iv, AESCTR_NONCEBYTES);
br_enc32be(ivw + AESCTR_NONCEBYTES, ctr);
@@ -94,7 +94,6 @@ void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nbl
s->ctr += (uint32_t) (4 * nblocks);
}

/**
 * Free the AES context held by an XOF state.
 *
 * Arguments: aes256xof_ctx *s: pointer to the XOF state; its sk_exp member
 *                              is handed to aes256_ctx_release()
 **/
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) {
aes256_ctx_release(&s->sk_exp);
}

crypto_kem/kyber1024-90s/clean/aes256ctr.h → crypto_kem/kyber1024-90s/clean/symmetric-aes.h View File


+ 8
- 6
crypto_kem/kyber1024-90s/clean/symmetric.h View File

@@ -2,22 +2,24 @@
#define SYMMETRIC_H

#include "params.h"
#include <stddef.h>
#include <stdint.h>


#include "aes256ctr.h"
#include "sha2.h"
#include "symmetric-aes.h"

typedef aes256xof_ctx xof_state;

#define XOF_BLOCKBYTES 64

#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES)
#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, IN, X, Y)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)

#define XOF_BLOCKBYTES 64

typedef aes256xof_ctx xof_state;


#endif /* SYMMETRIC_H */

+ 10
- 13
crypto_kem/kyber1024-90s/clean/verify.c View File

@@ -1,34 +1,31 @@
#include "verify.h"

#include <stddef.h>
#include <stdint.h>

/*************************************************
* Name: verify
* Name: PQCLEAN_KYBER102490S_CLEAN_verify
*
* Description: Compare two arrays for equality in constant time.
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
* size_t len: length of the byte arrays
* size_t len: length of the byte arrays
*
* Returns 0 if the byte arrays are equal, 1 otherwise
**************************************************/
uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) {
uint64_t r;
size_t i;
r = 0;
int PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t i = 0;
uint8_t r = 0;

for (i = 0; i < len; i++) {
r |= a[i] ^ b[i];
}

r = (-r) >> 63;
return (uint8_t)r;
return (-(uint64_t)r) >> 63;
}

/*************************************************
* Name: cmov
* Name: PQCLEAN_KYBER102490S_CLEAN_cmov
*
* Description: Copy len bytes from x to r if b is 1;
* don't modify r if b is 0. Requires b to be in {0,1};
@@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, si
*
* Arguments: uint8_t *r: pointer to output byte array
* const uint8_t *x: pointer to input byte array
* size_t len: Amount of bytes to be copied
* size_t len: Amount of bytes to be copied
* uint8_t b: Condition bit; has to be in {0,1}
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
size_t i;
size_t i = 0;

b = -b;
for (i = 0; i < len; i++) {
r[i] ^= b & (x[i] ^ r[i]);
r[i] ^= b & (r[i] ^ x[i]);
}
}

+ 6
- 3
crypto_kem/kyber1024-90s/clean/verify.h View File

@@ -1,10 +1,13 @@
#ifndef VERIFY_H
#define VERIFY_H
#ifndef PQCLEAN_KYBER102490S_CLEAN_VERIFY_H
#define PQCLEAN_KYBER102490S_CLEAN_VERIFY_H

#include "params.h"
#include <stddef.h>
#include <stdint.h>

uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len);

int PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len);


void PQCLEAN_KYBER102490S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);



+ 1
- 0
crypto_kem/kyber1024/META.yml View File

@@ -28,6 +28,7 @@ implementations:
- architecture: x86_64
operating_systems:
- Linux
- Darwin
required_flags:
- avx2
- bmi2


+ 1
- 11
crypto_kem/kyber1024/avx2/LICENSE View File

@@ -1,14 +1,4 @@
kyber-20170627
Public Domain
Authors: Joppe Bos,
Léo Ducas,
Eike Kiltz ,
Tancrède Lepoint,
Vadim Lyubashevsky,
John Schanck,
Peter Schwabe,
Gregor Seiler,
Damien Stehlé
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in


+ 40
- 8
crypto_kem/kyber1024/avx2/Makefile View File

@@ -1,26 +1,58 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libkyber1024_avx2.a
HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h fips202x4.h
OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \
verify.o indcpa.o rejsample.o fips202x4.o symmetric-fips202.o
HEADERS= \
align.h \
api.h \
cbd.h \
cdecl.inc \
consts.h \
fips202x4.h \
fq.inc \
indcpa.h \
kem.h \
ntt.h \
params.h \
poly.h \
polyvec.h \
reduce.h \
rejsample.h \
shuffle.inc \
symmetric.h \
verify.h
OBJECTS= \
basemul.o \
cbd.o \
consts.o \
fips202x4.o \
fq.o \
indcpa.o \
invntt.o \
kem.o \
ntt.o \
poly.o \
polyvec.o \
rejsample.o \
shuffle.o \
symmetric-shake.o \
verify.o

KECCAK4XDIR=../../../common/keccak4x
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o
KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)

CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \
-Wmissing-prototypes -Wredundant-decls \
-Wpointer-arith -Wshadow \
-std=c99 -I../../../common $(EXTRAFLAGS)

all: $(LIB)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.s $(HEADERS)
$(AS) -o $@ $<

%.o: %.S $(HEADERS)
$(AS) -c -o $@ $<
$(CC) $(CFLAGS) -c -o $@ $<

$(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $(OBJECTS) $(KECCAK4X)


+ 22
- 0
crypto_kem/kyber1024/avx2/align.h View File

@@ -0,0 +1,22 @@
#ifndef PQCLEAN_KYBER1024_AVX2_ALIGN_H
#define PQCLEAN_KYBER1024_AVX2_ALIGN_H
#include <immintrin.h>

/*
 * Helper macros that expand to anonymous union types. Each union pairs the
 * payload with an SSE/AVX vector member, so the payload shares storage with
 * a __m128i / __m256i and the union takes on that vector type's alignment.
 * The vector member comes first, so a brace initializer without a
 * designator initializes the vector, not the payload.
 */

/* Union holding one object of type t (member orig) overlaid with a
 * __m128i (member vec). */
#define ALIGN16_TYPE(t) \
union { \
__m128i vec; \
t orig; \
}

/* Union holding an array of s elements of type t (member arr) overlaid
 * with a __m256i (member vec). */
#define ALIGN32_ARRAY(t, s) \
union { \
__m256i vec; \
t arr[(s)]; \
}

/* Union holding an n-by-m array of type t (member arr) overlaid with a
 * __m256i (member vec). */
#define ALIGN32_ARRAY_2D(t, n, m) \
union { \
__m256i vec; \
t arr[(n)][(m)]; \
}
#endif

+ 65
- 17
crypto_kem/kyber1024/avx2/basemul.S View File

@@ -1,4 +1,5 @@
#include "params.h"
#include "cdecl.inc"

.macro schoolbook off,sign
#load
@@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0
vpaddd %ymm7,%ymm8,%ymm7 # y1
.endm

.macro red a0,a1,b0,b1 x,y,z
.macro red a0,a1,b0,b1,x,y,z
#pack
vpxor %ymm\x,%ymm\x,%ymm\x
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y
@@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0
vpsubw %ymm\y,%ymm\b0,%ymm\b0
.endm

.global PQCLEAN_KYBER1024_AVX2_basemul_acc_avx
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx:
#consts
vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER1024_AVX2_16xqinv(%rip),%ymm1
vmovdqu (%rcx),%ymm2

.text
basemul64_acc_avx:
poly0.0:
schoolbook 0,0

@@ -117,7 +113,7 @@ vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6 7,8,9
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,(%rdi)
@@ -160,7 +156,7 @@ vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6 7,8,9
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,64(%rdi)
@@ -168,17 +164,40 @@ vmovdqa %ymm5,96(%rdi)

ret

.global PQCLEAN_KYBER1024_AVX2_basemul_avx
PQCLEAN_KYBER1024_AVX2_basemul_avx:
.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx):
#consts
vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER1024_AVX2_16xqinv(%rip),%ymm1
vmovdqu (%rcx),%ymm2
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

ret

basemul64_avx:
schoolbook 0,0

#reduce
red 14,9,12,7 8,10,11
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,(%rdi)
@@ -187,10 +206,39 @@ vmovdqa %ymm12,32(%rdi)
schoolbook 64,1

#reduce
red 14,9,12,7 8,10,11
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,64(%rdi)
vmovdqa %ymm12,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

ret

+ 7
- 7
crypto_kem/kyber1024/avx2/cbd.c View File

@@ -1,27 +1,27 @@
#include "cbd.h"
#include "params.h"
#include "cbd.h"
#include <immintrin.h>
#include <stdint.h>

/*************************************************
* Name: cbd
* Name: PQCLEAN_KYBER1024_AVX2_cbd
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
* - const unsigned char *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t *buf) {
void PQCLEAN_KYBER1024_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) {
unsigned int i = 0;
__m256i vec0, vec1, vec2, vec3, tmp;
const __m256i mask55 = _mm256_set1_epi32(0x55555555);
const __m256i mask33 = _mm256_set1_epi32(0x33333333);
const __m256i mask03 = _mm256_set1_epi32(0x03030303);

for (size_t i = 0; i < KYBER_N / 64; i++) {
vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]);
for (i = 0; i < KYBER_N / 64; i++) {
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]);

vec1 = _mm256_srli_epi32(vec0, 1);
vec0 = _mm256_and_si256(mask55, vec0);


+ 6
- 3
crypto_kem/kyber1024/avx2/cbd.h View File

@@ -1,8 +1,11 @@
#ifndef CBD_H
#define CBD_H
#ifndef PQCLEAN_KYBER1024_AVX2_CBD_H
#define PQCLEAN_KYBER1024_AVX2_CBD_H

#include "params.h"
#include "poly.h"
#include <stdint.h>

void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t *buf);

void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);

#endif

+ 30
- 0
crypto_kem/kyber1024/avx2/cdecl.inc View File

@@ -0,0 +1,30 @@
#ifndef PQCLEAN_KYBER1024_AVX2_CDECL_INC
#define PQCLEAN_KYBER1024_AVX2_CDECL_INC

/* This file is included from C sources and from preprocessed assembly
 * (.S) sources alike, so it holds preprocessor directives and block
 * comments only.
 *
 * The include guard carries the name of this implementation. It used to
 * be PQCLEAN_DILITHIUM2_AVX2_CDECL, which would have suppressed the
 * definitions below whenever another file using that same guard had
 * been included first.
 */

/* Offsets of the constant groups inside PQCLEAN_KYBER1024_AVX2_qdata
 * (defined in consts.c), counted in 16-bit elements. Assembly code
 * multiplies them by 2 to obtain byte offsets.
 */
#define _16XQ            0
#define _16XQINV        16
#define _16XV           32
#define _16XFLO         48
#define _16XFHI         64
#define _16XMONTSQLO    80
#define _16XMONTSQHI    96
#define _16XMASK       112
#define _ZETAS_EXP     128
#define _ZETAS_INV_EXP 528


/* The C ABI on MacOS exports all symbols with a leading underscore.
 * Assembly code therefore has to use the underscored name both when it
 * defines a function that is called from C and when it refers to a
 * symbol that is defined in C (such as the constants in consts.c).
 *
 * cdecl(s) expands to the name the assembler must use for the C symbol
 * s: _s when __WIN32__ or __APPLE__ is defined, plain s otherwise.
 */

#if defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif

+ 149
- 28
crypto_kem/kyber1024/avx2/consts.c View File

@@ -1,34 +1,155 @@
#include "consts.h"
#include "params.h"

const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 
36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628};
const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 
48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932};
#include "consts.h"
#include <stdint.h>

#define Q KYBER_Q
#define MONT ((1U << 16) % KYBER_Q)
#define MONT ((1U << 16) % Q)
#define QINV 62209 // q^-1 mod 2^16
#define V ((1U << 26)/KYBER_Q + 1)
#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q)
#define FLO (FHI * QINV % 65536)
#define MONTSQHI (MONT * MONT % KYBER_Q)
#define MONTSQLO (MONTSQHI * QINV % 65536)
#define V (((1U << 26) + Q/2)/Q)
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q)
#define FLO (FHI*QINV % 65536)
#define MONTSQHI (MONT*MONT % Q)
#define MONTSQLO (MONTSQHI*QINV % 65536)
#define MASK 4095

const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}};
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}};

#undef Q
#undef QINV
#undef MONT
#undef V
#undef FLO
#undef FHI
#undef MONTSQLO
#undef MONTSQHI
#undef MASK

const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.as_arr = {
#define _16XQ 0
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,

#define _16XQINV 16
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

#define _16XV 32
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,

#define _16XFLO 48
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,

#define _16XFHI 64
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,

#define _16XMONTSQLO 80
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,

#define _16XMONTSQHI 96
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,

#define _16XMASK 112
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,

#define _ZETAS_EXP 128
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970,
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134,
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758,
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846,
3158, 3158, 3158, 3158, 622, 622, 622, 622,
1577, 1577, 1577, 1577, 182, 182, 182, 182,
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479,
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295,
573, 573, 2004, 2004, 264, 264, 383, 383,
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199,
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081,
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837,
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785,
516, 3321, 3009, 2663, 1711, 2167, 126, 1469,
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182,
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261,
2226, 555, 2078, 1550, 422, 177, 3038, 1574,
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173,
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493,
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918,
430, 843, 871, 105, 587, 3094, 2869, 1653,
778, 3182, 1483, 1119, 644, 349, 329, 3254,
788, 788, 1812, 1812, 28191, 28191, 28191, 28191,
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842,
48842, 48842, 48842, 48842, 287, 287, 287, 287,
287, 287, 287, 287, 202, 202, 202, 202,
202, 202, 202, 202, 10690, 10690, 10690, 10690,
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335,
31164, 31164, 31164, 31164, 962, 962, 962, 962,
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855,
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313,
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859,
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017,
732, 732, 608, 608, 1787, 1787, 411, 411,
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638,
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780,
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604,
448, 2264, 677, 2054, 34353, 25435, 58154, 24392,
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907,
31637, 28644, 23998, 48114, 817, 603, 1322, 1864,
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459,
3221, 996, 958, 1522, 20297, 2146, 15356, 33152,
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094,
41677, 45279, 7757, 23132, 1097, 610, 2044, 384,
3193, 1994, 220, 1670, 1799, 794, 2475, 478,
3021, 991, 1869, 1628, 0, 0, 0, 0,

#define _ZETAS_INV_EXP 528
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498,
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240,
1701, 1460, 2338, 308, 2851, 854, 2535, 1530,
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232,
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201,
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184,
1807, 2371, 2333, 108, 870, 1510, 1278, 1185,
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512,
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110,
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653,
1275, 2652, 1065, 2881, 725, 1508, 2368, 398,
951, 247, 1421, 3222, 2499, 271, 90, 853,
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110,
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073,
1571, 1571, 205, 205, 2918, 2918, 1542, 1542,
2721, 2721, 2597, 2597, 2312, 2312, 681, 681,
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202,
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847,
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474,
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367,
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695,
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346,
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127,
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042,
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437,
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406,
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685,
2210, 1846, 147, 2551, 1676, 460, 235, 2742,
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486,
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739,
45043, 32227, 11478, 335, 156, 2911, 872, 1590,
602, 777, 2170, 246, 1755, 291, 3152, 2907,
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402,
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565,
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618,
666, 320, 8, 2813, 1544, 282, 1838, 1293,
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098,
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361,
48173, 48173, 5828, 5828, 130, 130, 1602, 1602,
1871, 1871, 829, 829, 2946, 2946, 3065, 3065,
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691,
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779,
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147,
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707,
171, 171, 171, 171, 12403, 12403, 12403, 12403,
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012,
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907,
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836,
1836, 1836, 1836, 1836, 50791, 50791, 359, 359,
60300, 60300, 1932, 1932, 0, 0, 0, 0
}
};

+ 12
- 16
crypto_kem/kyber1024/avx2/consts.h View File

@@ -1,24 +1,20 @@
#ifndef CONSTS_H
#define CONSTS_H
#ifndef PQCLEAN_KYBER1024_AVX2_CONSTS_H
#define PQCLEAN_KYBER1024_AVX2_CONSTS_H

#include "cdecl.inc"

#include "params.h"
#include <immintrin.h>
#include <stdint.h>

typedef union {
uint16_t as_arr[16];
__m256i as_vec;
} aligned_uint16_t;
#define ALIGNED_UINT16_T(N) \
union { \
__m256i as_vec; \
uint16_t as_arr[(N)]; \
}

extern const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_exp[396];
extern const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[396];
typedef ALIGNED_UINT16_T(928) qdata_t;

extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xq;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xqinv;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xv;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xflo;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xfhi;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqlo;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqhi;
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmask;
extern const qdata_t PQCLEAN_KYBER1024_AVX2_qdata;

#endif

+ 140
- 181
crypto_kem/kyber1024/avx2/fips202x4.c View File

@@ -1,148 +1,111 @@
#include "fips202.h"
#include "fips202x4.h"
#include "params.h"

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Use implementation from the Keccak Code Package */
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
extern void KeccakF1600_StatePermute4x(__m256i *s);

#define NROUNDS 24
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64-(offset))))

static uint64_t load64(const uint8_t *x) {
unsigned long long r = 0, i;

for (i = 0; i < 8; ++i) {
r |= (unsigned long long)x[i] << 8 * i;
}
return r;
}

static void store64(uint8_t *x, uint64_t u) {
size_t i;
static inline void store64(uint8_t x[8], uint64_t u) {
unsigned int i = 0;

for (i = 0; i < 8; ++i) {
x[i] = (uint8_t)u;
u >>= 8;
for (i = 0; i < 8; i++) {
x[i] = u >> 8 * i;
}
}

/* Use implementation from the Keccak Code Package */
extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s);
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds

static void keccak_absorb4x(__m256i *s,
static void keccakx4_absorb(__m256i s[25],
unsigned int r,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
size_t mlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen,
uint8_t p) {
size_t i;
uint8_t t0[200] = {0};
uint8_t t1[200] = {0};
uint8_t t2[200] = {0};
uint8_t t3[200] = {0};
size_t i = 0, pos = 0;
__m256i t, idx;

unsigned long long *ss = (unsigned long long *)s;
for (i = 0; i < 25; ++i) {
s[i] = _mm256_setzero_si256();
}

while (mlen >= r) {
idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0);
while (inlen >= r) {
for (i = 0; i < r / 8; ++i) {
ss[4 * i + 0] ^= load64(m0 + 8 * i);
ss[4 * i + 1] ^= load64(m1 + 8 * i);
ss[4 * i + 2] ^= load64(m2 + 8 * i);
ss[4 * i + 3] ^= load64(m3 + 8 * i);
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
s[i] = _mm256_xor_si256(s[i], t);
pos += 8;
}

KeccakF1600_StatePermute4x(s);
mlen -= r;
m0 += r;
m1 += r;
m2 += r;
m3 += r;
inlen -= r;
}

for (i = 0; i < mlen; ++i) {
t0[i] = m0[i];
t1[i] = m1[i];
t2[i] = m2[i];
t3[i] = m3[i];
i = 0;
while (inlen >= 8) {
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
s[i] = _mm256_xor_si256(s[i], t);

i++;
pos += 8;
inlen -= 8;
}

t0[i] = p;
t1[i] = p;
t2[i] = p;
t3[i] = p;

t0[r - 1] |= 128;
t1[r - 1] |= 128;
t2[r - 1] |= 128;
t3[r - 1] |= 128;

for (i = 0; i < r / 8; ++i) {
ss[4 * i + 0] ^= load64(t0 + 8 * i);
ss[4 * i + 1] ^= load64(t1 + 8 * i);
ss[4 * i + 2] ^= load64(t2 + 8 * i);
ss[4 * i + 3] ^= load64(t3 + 8 * i);
if (inlen) {
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1));
t = _mm256_and_si256(t, idx);
s[i] = _mm256_xor_si256(s[i], t);
}

t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen);
s[i] = _mm256_xor_si256(s[i], t);
t = _mm256_set1_epi64x((long long)(1ULL << 63));
s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t);
}

static void keccak_squeezeblocks4x(uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
static void keccakx4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
__m256i *s,
unsigned int r) {
unsigned long long *ss = (unsigned long long *)s;
unsigned int r,
__m256i s[25]) {
unsigned int i = 0;
uint64_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;

while (nblocks > 0) {
KeccakF1600_StatePermute4x(s);
for (size_t i = 0; i < (r >> 3); i++) {
store64(h0 + 8 * i, ss[4 * i + 0]);
store64(h1 + 8 * i, ss[4 * i + 1]);
store64(h2 + 8 * i, ss[4 * i + 2]);
store64(h3 + 8 * i, ss[4 * i + 3]);
for (i = 0; i < r / 8; ++i) {
f0 = _mm256_extract_epi64(s[i], 0);
f1 = _mm256_extract_epi64(s[i], 1);
f2 = _mm256_extract_epi64(s[i], 2);
f3 = _mm256_extract_epi64(s[i], 3);
store64(out0, f0);
store64(out1, f1);
store64(out2, f2);
store64(out3, f3);

out0 += 8;
out1 += 8;
out2 += 8;
out3 += 8;
}
h0 += r;
h1 += r;
h2 += r;
h3 += r;
nblocks--;
}
}

void PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(keccak4x_state *state,
const uint8_t *seed,
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3) {
uint8_t extseed[4][KYBER_SYMBYTES + 2];

for (size_t i = 0; i < KYBER_SYMBYTES; ++i) {
extseed[0][i] = seed[i];
extseed[1][i] = seed[i];
extseed[2][i] = seed[i];
extseed[3][i] = seed[i];
}
extseed[0][KYBER_SYMBYTES + 0] = (uint8_t)nonce0;
extseed[0][KYBER_SYMBYTES + 1] = (uint8_t)(nonce0 >> 8);
extseed[1][KYBER_SYMBYTES + 0] = (uint8_t)nonce1;
extseed[1][KYBER_SYMBYTES + 1] = (uint8_t)(nonce1 >> 8);
extseed[2][KYBER_SYMBYTES + 0] = (uint8_t)nonce2;
extseed[2][KYBER_SYMBYTES + 1] = (uint8_t)(nonce2 >> 8);
extseed[3][KYBER_SYMBYTES + 0] = (uint8_t)nonce3;
extseed[3][KYBER_SYMBYTES + 1] = (uint8_t)(nonce3 >> 8);

/* zero state */
for (size_t i = 0; i < 25; i++) {
state->s[i] = _mm256_xor_si256(state->s[i], state->s[i]);
--nblocks;
}
}

/* absorb 4 messages of identical length in parallel */
keccak_absorb4x(state->s, SHAKE128_RATE, extseed[0], extseed[1], extseed[2], extseed[3], KYBER_SYMBYTES + 2, 0x1F);
/*
 * Absorb step of the 4-way SHAKE128 instance.
 *
 * Forwards the four input pointers (each read for inlen bytes) to
 * keccakx4_absorb, selecting SHAKE128_RATE as the block size and 0x1F as the
 * byte that keccakx4_absorb XORs into the state directly after the message.
 *
 * state: 4-way Keccak state that receives the absorbed data
 * in0..in3: the four input byte strings
 * inlen: length in bytes shared by all four inputs
 */
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    const unsigned int rate = SHAKE128_RATE;
    const uint8_t suffix = 0x1F;

    keccakx4_absorb(state->s, rate, in0, in1, in2, in3, inlen, suffix);
}

void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
@@ -150,82 +113,78 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccak4x_state *state) {
keccak_squeezeblocks4x(out0, out1, out2, out3, nblocks, state->s, SHAKE128_RATE);
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE,
state->s);
}

static void shake256x4(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3, size_t outlen,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3, size_t inlen) {
__m256i s[25];
uint8_t t0[SHAKE256_RATE];
uint8_t t1[SHAKE256_RATE];
uint8_t t2[SHAKE256_RATE];
uint8_t t3[SHAKE256_RATE];

/* zero state */
for (size_t i = 0; i < 25; i++) {
s[i] = _mm256_xor_si256(s[i], s[i]);
}

/* absorb 4 messages of identical length in parallel */
keccak_absorb4x(s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);

/* Squeeze output */
keccak_squeezeblocks4x(out0, out1, out2, out3, outlen / SHAKE256_RATE, s, SHAKE256_RATE);
/*
 * Absorb step of the 4-way SHAKE256 instance.
 *
 * Forwards the four input pointers (each read for inlen bytes) to
 * keccakx4_absorb, selecting SHAKE256_RATE as the block size and 0x1F as the
 * byte that keccakx4_absorb XORs into the state directly after the message.
 *
 * state: 4-way Keccak state that receives the absorbed data
 * in0..in3: the four input byte strings
 * inlen: length in bytes shared by all four inputs
 */
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    const unsigned int rate = SHAKE256_RATE;
    const uint8_t suffix = 0x1F;

    keccakx4_absorb(state->s, rate, in0, in1, in2, in3, inlen, suffix);
}

out0 += (outlen / SHAKE256_RATE) * SHAKE256_RATE;
out1 += (outlen / SHAKE256_RATE) * SHAKE256_RATE;
out2 += (outlen / SHAKE256_RATE) * SHAKE256_RATE;
out3 += (outlen / SHAKE256_RATE) * SHAKE256_RATE;
/*
 * Squeeze step of the 4-way SHAKE256 instance.
 *
 * Hands the request to keccakx4_squeezeblocks with SHAKE256_RATE as the
 * block size, so each of out0..out3 must have room for
 * nblocks * SHAKE256_RATE bytes.
 *
 * out0..out3: the four output buffers
 * nblocks: number of rate-sized blocks to produce per output
 * state: 4-way Keccak state to squeeze from (updated in place)
 */
void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
        uint8_t *out1,
        uint8_t *out2,
        uint8_t *out3,
        size_t nblocks,
        keccakx4_state *state) {
    const unsigned int rate = SHAKE256_RATE;

    keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, rate, state->s);
}

if (outlen % SHAKE256_RATE) {
keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE256_RATE);
for (size_t i = 0; i < outlen % SHAKE256_RATE; i++) {
out0[i] = t0[i];
out1[i] = t1[i];
out2[i] = t2[i];
out3[i] = t3[i];
/*
 * One-shot 4-way SHAKE128: absorbs four inputs of inlen bytes each and writes
 * outlen bytes to each of the four outputs.
 *
 * Whole rate-sized blocks are squeezed straight into the caller's buffers.
 * A trailing partial block, if any, is squeezed into a local scratch array
 * and only the bytes still owed are copied out, so nothing is written past
 * outX[outlen - 1].
 */
void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) {
    const size_t whole_blocks = outlen / SHAKE128_RATE;
    const size_t whole_bytes = whole_blocks * SHAKE128_RATE;
    const size_t tail = outlen - whole_bytes;
    uint8_t scratch[4][SHAKE128_RATE];
    keccakx4_state state;
    size_t k;

    PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen);
    PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, whole_blocks, &state);

    /* Output length was a multiple of the rate: nothing left to do. */
    if (tail == 0) {
        return;
    }

    /* Produce one more full block off to the side and keep only its prefix. */
    PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(scratch[0], scratch[1], scratch[2], scratch[3], 1, &state);
    for (k = 0; k < tail; k++) {
        out0[whole_bytes + k] = scratch[0][k];
        out1[whole_bytes + k] = scratch[1][k];
        out2[whole_bytes + k] = scratch[2][k];
        out3[whole_bytes + k] = scratch[3][k];
    }
}

void PQCLEAN_KYBER1024_AVX2_shake256x4_prf(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *key,
uint8_t nonce0,
uint8_t nonce1,
uint8_t nonce2,
uint8_t nonce3) {
uint8_t extseed[4][KYBER_SYMBYTES + 1];

for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
extseed[0][i] = key[i];
extseed[1][i] = key[i];
extseed[2][i] = key[i];
extseed[3][i] = key[i];
void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) {
unsigned int i = 0;
size_t nblocks = outlen / SHAKE256_RATE;
uint8_t t[4][SHAKE256_RATE];
keccakx4_state state;

PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen);
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);

out0 += nblocks * SHAKE256_RATE;
out1 += nblocks * SHAKE256_RATE;
out2 += nblocks * SHAKE256_RATE;
out3 += nblocks * SHAKE256_RATE;
outlen -= nblocks * SHAKE256_RATE;

if (outlen) {
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state);
for (i = 0; i < outlen; ++i) {
out0[i] = t[0][i];
out1[i] = t[1][i];
out2[i] = t[2][i];
out3[i] = t[3][i];
}
}
extseed[0][KYBER_SYMBYTES] = nonce0;
extseed[1][KYBER_SYMBYTES] = nonce1;
extseed[2][KYBER_SYMBYTES] = nonce2;
extseed[3][KYBER_SYMBYTES] = nonce3;

shake256x4(out0,
out1,
out2,
out3,
outlen,
extseed[0],
extseed[1],
extseed[2],
extseed[3],
KYBER_SYMBYTES + 1);
}

+ 14
- 26
crypto_kem/kyber1024/avx2/fips202x4.h View File

@@ -7,31 +7,19 @@

typedef struct {
__m256i s[25];
} keccak4x_state;

void PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(keccak4x_state *state,
const uint8_t *seed,
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);

void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t nblocks,
keccak4x_state *state);

void PQCLEAN_KYBER1024_AVX2_shake256x4_prf(uint8_t *out0,
uint8_t *out1,
uint8_t *out2,
uint8_t *out3,
size_t outlen,
const uint8_t *key,
uint8_t nonce0,
uint8_t nonce1,
uint8_t nonce2,
uint8_t nonce3);
} keccakx4_state;

/*
 * 4-way SHAKE128 absorb: clears the state, then absorbs the four inputs
 * in0..in3 (inlen bytes each) using the SHAKE128 rate.
 */
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen);

/*
 * 4-way SHAKE128 squeeze: writes nblocks blocks of SHAKE128_RATE bytes to
 * each of out0..out3, running the permutation before every block.
 */
void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state);

/*
 * 4-way SHAKE256 absorb: same as the SHAKE128 variant but with the
 * SHAKE256 rate.
 */
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen);

/*
 * 4-way SHAKE256 squeeze: writes nblocks blocks of SHAKE256_RATE bytes to
 * each of out0..out3.
 */
void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks,
keccakx4_state *state);

/*
 * One-shot 4-way SHAKE128: absorbs inlen bytes from each input and writes
 * outlen bytes to each output (outlen need not be a multiple of the rate).
 */
void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen);

/*
 * One-shot 4-way SHAKE256: absorbs inlen bytes from each input and writes
 * outlen bytes to each output (outlen need not be a multiple of the rate).
 */
void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen);

#endif

crypto_kem/kyber512/avx2/fq.s → crypto_kem/kyber1024/avx2/fq.S View File

@@ -1,11 +1,8 @@
#include "cdecl.inc"
.include "fq.inc"

.global PQCLEAN_KYBER512_AVX2_reduce_avx
PQCLEAN_KYBER512_AVX2_reduce_avx:
#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1

.text
reduce128_avx:
#load
vmovdqa (%rdi),%ymm2
vmovdqa 32(%rdi),%ymm3
@@ -16,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7
vmovdqa 192(%rdi),%ymm8
vmovdqa 224(%rdi),%ymm9

red16 2 10
red16 3 11
red16 4 12
red16 5 13
red16 6 14
red16 7 15
red16 8 10
red16 9 11
red16 2,10
red16 3,11
red16 4,12
red16 5,13
red16 6,14
red16 7,15
red16 8,10
red16 9,11

#store
vmovdqa %ymm2,(%rdi)
@@ -37,11 +34,17 @@ vmovdqa %ymm9,224(%rdi)

ret

.global PQCLEAN_KYBER512_AVX2_csubq_avx
PQCLEAN_KYBER512_AVX2_csubq_avx:
.global cdecl(PQCLEAN_KYBER1024_AVX2_reduce_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_reduce_avx):
#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XV*2(%rsi),%ymm1
call reduce128_avx
add $256,%rdi
call reduce128_avx
ret

csubq128_avx:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm2
@@ -52,14 +55,14 @@ vmovdqa 160(%rdi),%ymm6
vmovdqa 192(%rdi),%ymm7
vmovdqa 224(%rdi),%ymm8

csubq 1 9
csubq 2 10
csubq 3 11
csubq 4 12
csubq 5 13
csubq 6 14
csubq 7 15
csubq 8 9
csubq 1,9
csubq 2,10
csubq 3,11
csubq 4,12
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,9

#store
vmovdqa %ymm1,(%rdi)
@@ -73,13 +76,16 @@ vmovdqa %ymm8,224(%rdi)

ret

.global PQCLEAN_KYBER512_AVX2_frommont_avx
PQCLEAN_KYBER512_AVX2_frommont_avx:
.global cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx):
#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER512_AVX2_16xmontsqlo(%rip),%ymm1
vmovdqa PQCLEAN_KYBER512_AVX2_16xmontsqhi(%rip),%ymm2
vmovdqa _16XQ*2(%rsi),%ymm0
call csubq128_avx
add $256,%rdi
call csubq128_avx
ret

tomont128_avx:
#load
vmovdqa (%rdi),%ymm3
vmovdqa 32(%rdi),%ymm4
@@ -90,14 +96,14 @@ vmovdqa 160(%rdi),%ymm8
vmovdqa 192(%rdi),%ymm9
vmovdqa 224(%rdi),%ymm10

fqmulprecomp 1,2,3 11
fqmulprecomp 1,2,4 12
fqmulprecomp 1,2,5 13
fqmulprecomp 1,2,6 14
fqmulprecomp 1,2,7 15
fqmulprecomp 1,2,8 11
fqmulprecomp 1,2,9 12
fqmulprecomp 1,2,10 13
fqmulprecomp 1,2,3,11
fqmulprecomp 1,2,4,12
fqmulprecomp 1,2,5,13
fqmulprecomp 1,2,6,14
fqmulprecomp 1,2,7,15
fqmulprecomp 1,2,8,11
fqmulprecomp 1,2,9,12
fqmulprecomp 1,2,10,13

#store
vmovdqa %ymm3,(%rdi)
@@ -110,3 +116,14 @@ vmovdqa %ymm9,192(%rdi)
vmovdqa %ymm10,224(%rdi)

ret

#exported entry point: applies tomont128_avx to two consecutive 256-byte
#halves of the buffer at %rdi, with constants read from the table at %rsi
#NOTE(review): %rdi and %rsi are presumably the first and second C arguments,
#and the symbol*2 offsets presumably scale a 16-bit index to bytes; confirm
.global cdecl(PQCLEAN_KYBER1024_AVX2_tomont_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_tomont_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
vmovdqa _16XMONTSQHI*2(%rsi),%ymm2
#first half: eight 32-byte vectors starting at %rdi
call tomont128_avx
#second half: step %rdi past the bytes just processed and repeat
add $256,%rdi
call tomont128_avx
ret

+ 7
- 4
crypto_kem/kyber1024/avx2/fq.inc View File

@@ -1,24 +1,27 @@
.macro red16 r x=12
.macro red16 r,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
vpsraw $10,%ymm\x,%ymm\x
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm

.macro csubq r x=12
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
#vpcmpgtw %ymm0,%ymm\r,%ymm\x
#vpand %ymm0,%ymm\x,%ymm\x
#vpsubw %ymm\x,%ymm\r,%ymm\r
.endm

.macro caddq r x=12
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm

.macro fqmulprecomp al,ah,b x=12
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x


+ 174
- 106
crypto_kem/kyber1024/avx2/indcpa.c View File

@@ -1,26 +1,33 @@
#include "align.h"
#include "cbd.h"
#include "indcpa.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "rejsample.h"
#include "symmetric.h"
#include <stddef.h>
#include <stdint.h>

/*************************************************
* Name: pack_pk
*
* Description: Serialize the public key as concatenation of the
* compressed and serialized vector of polynomials pk
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* const poly *pk: pointer to the input public-key polynomial
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
polyvec *pk,
const uint8_t seed[KYBER_SYMBYTES]) {
size_t i = 0;
PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(r, pk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
r[i + KYBER_POLYVECBYTES] = seed[i];
}
}
@@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
/*************************************************
* Name: unpack_pk
*
* Description: De-serialize and decompress public key from a byte array;
* Description: De-serialize public key from a byte array;
* approximate inverse of pack_pk
*
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials
* - uint8_t *seed: pointer to output seed to generate matrix A
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector
* - uint8_t *seed: pointer to output seed to generate matrix A
* - const uint8_t *packedpk: pointer to input serialized public key
**************************************************/
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
static void unpack_pk(polyvec *pk,
uint8_t seed[KYBER_SYMBYTES],
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) {
size_t i = 0;
PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(pk, packedpk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
seed[i] = packedpk[i + KYBER_POLYVECBYTES];
}
}
@@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - const polyvec *sk: pointer to input vector of polynomials (secret key)
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t *r, polyvec *sk) {
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES],
                    polyvec *sk) {
    /* The serialized secret key is just the byte encoding of the vector. */
    uint8_t *const dst = r;

    PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(dst, sk);
}

@@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) {
* Description: De-serialize the secret key;
* inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* Arguments: - polyvec *sk: pointer to output vector of polynomials
* (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
    /* Inverse of pack_sk: decode the byte string back into the vector. */
    const uint8_t *const src = packedsk;

    PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(sk, src);
}

@@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* const poly *pk: pointer to the input vector of polynomials b
* const uint8_t *seed: pointer to the input polynomial v
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* polyvec *b: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) {
    /* Layout of r: compressed vector b first, compressed polynomial v after. */
    uint8_t *const b_out = r;
    uint8_t *const v_out = r + KYBER_POLYVECCOMPRESSEDBYTES;

    PQCLEAN_KYBER1024_AVX2_polyvec_compress(b_out, b);
    PQCLEAN_KYBER1024_AVX2_poly_compress(v_out, v);
}
@@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) {
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) {
    /* Layout of c: compressed vector b first, compressed polynomial v after. */
    const uint8_t *const b_in = c;
    const uint8_t *const v_in = c + KYBER_POLYVECCOMPRESSEDBYTES;

    PQCLEAN_KYBER1024_AVX2_polyvec_decompress(b, b_in);
    PQCLEAN_KYBER1024_AVX2_poly_decompress(v, v_in);
}

static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) {
unsigned int ctr, pos;
uint16_t val;
/*************************************************
* Name: rej_uniform
*
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
**************************************************/
static unsigned int rej_uniform(int16_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr = 0, pos = 0;
uint16_t val = 0;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;

if (val < 19 * KYBER_Q) {
@@ -116,57 +150,76 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t
return ctr;
}

#define gen_a(A,B) gen_matrix(A,B,0)
#define gen_at(A,B) gen_matrix(A,B,1)
#define gen_a(A,B) PQCLEAN_KYBER1024_AVX2_gen_matrix(A,B,0)
#define gen_at(A,B) PQCLEAN_KYBER1024_AVX2_gen_matrix(A,B,1)

/*************************************************
* Name: gen_matrix
* Name: PQCLEAN_KYBER1024_AVX2_gen_matrix
*
* Description: Deterministically generate matrix A (or the transpose of A)
* from a seed. Entries of the matrix are polynomials that look
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - polyvec *a: pointer to output matrix A
* Arguments: - polyvec *a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
* - int transposed: boolean deciding whether A or A^T is generated
**************************************************/
#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
uint16_t i;
size_t ctr0, ctr1, ctr2, ctr3, bufbytes;
union {
uint8_t x[4][XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS];
__m256i _dummy;
} buf;
keccak4x_state state;
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) {
unsigned int i = 0, ctr0 = 0, ctr1 = 0, ctr2 = 0, ctr3 = 0;
ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf;
__m256i f;
keccakx4_state state;

for (i = 0; i < 4; i++) {
f = _mm256_load_si256((__m256i *)seed);
_mm256_store_si256((__m256i *)buf.arr[0], f);
_mm256_store_si256((__m256i *)buf.arr[1], f);
_mm256_store_si256((__m256i *)buf.arr[2], f);
_mm256_store_si256((__m256i *)buf.arr[3], f);

if (transposed) {
PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(
&state, seed, i + 0, i + 256, i + 512, i + 768);
buf.arr[0][KYBER_SYMBYTES + 0] = i;
buf.arr[0][KYBER_SYMBYTES + 1] = 0;
buf.arr[1][KYBER_SYMBYTES + 0] = i;
buf.arr[1][KYBER_SYMBYTES + 1] = 1;
buf.arr[2][KYBER_SYMBYTES + 0] = i;
buf.arr[2][KYBER_SYMBYTES + 1] = 2;
buf.arr[3][KYBER_SYMBYTES + 0] = i;
buf.arr[3][KYBER_SYMBYTES + 1] = 3;
} else {
PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(
&state, seed, 256 * i + 0, 256 * i + 1, 256 * i + 2, 256 * i + 3);
buf.arr[0][KYBER_SYMBYTES + 0] = 0;
buf.arr[0][KYBER_SYMBYTES + 1] = i;
buf.arr[1][KYBER_SYMBYTES + 0] = 1;
buf.arr[1][KYBER_SYMBYTES + 1] = i;
buf.arr[2][KYBER_SYMBYTES + 0] = 2;
buf.arr[2][KYBER_SYMBYTES + 1] = i;
buf.arr[3][KYBER_SYMBYTES + 0] = 3;
buf.arr[3][KYBER_SYMBYTES + 1] = i;
}

PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(
buf.x[0], buf.x[1], buf.x[2], buf.x[3], GEN_MATRIX_MAXNBLOCKS, &state);
bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES;
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2);
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3],
GEN_MATRIX_NBLOCKS, &state);

ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[0].coeffs, KYBER_N, buf.x[0], bufbytes);
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[1].coeffs, KYBER_N, buf.x[1], bufbytes);
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[2].coeffs, KYBER_N, buf.x[2], bufbytes);
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[3].coeffs, KYBER_N, buf.x[3], bufbytes);
ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf.arr[0]);
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf.arr[1]);
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf.arr[2]);
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf.arr[3]);

while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], 1, &state);
bufbytes = XOF_BLOCKBYTES;

ctr0 += rej_uniform_ref(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes);
ctr1 += rej_uniform_ref(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.x[1], bufbytes);
ctr2 += rej_uniform_ref(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.x[2], bufbytes);
ctr3 += rej_uniform_ref(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.x[3], bufbytes);
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state);

ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0],
XOF_BLOCKBYTES);
ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1],
XOF_BLOCKBYTES);
ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2],
XOF_BLOCKBYTES);
ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3],
XOF_BLOCKBYTES);
}

PQCLEAN_KYBER1024_AVX2_poly_nttunpack(&a[i].vec[0]);
@@ -177,36 +230,41 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
}

/*************************************************
* Name: indcpa_keypair
* Name: PQCLEAN_KYBER1024_AVX2_indcpa_keypair
*
* Description: Generates public and private key for the CPA-secure
* public-key encryption scheme underlying Kyber
*
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
* Arguments: - uint8_t *pk: pointer to output public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
polyvec a[KYBER_K], skpv, e, pkpv;
uint8_t buf[2 * KYBER_SYMBYTES];
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
uint8_t nonce = 0;
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
unsigned int i = 0;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
const uint8_t *publicseed = buf.arr;
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES;
polyvec a[KYBER_K], e, pkpv, skpv;

randombytes(buf, KYBER_SYMBYTES);
hash_g(buf, buf, KYBER_SYMBYTES);
randombytes(buf.arr, KYBER_SYMBYTES);
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES);

gen_a(a, publicseed);

PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, nonce + 0, nonce + 1, nonce + 2, nonce + 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, nonce + 4, nonce + 5, nonce + 6, nonce + 7);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed,
0, 1, 2, 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed,
4, 5, 6, 7);

PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&skpv);
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&e);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv);
PQCLEAN_KYBER1024_AVX2_poly_frommont(pkpv.vec + i);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_AVX2_poly_tomont(&pkpv.vec[i]);
}

PQCLEAN_KYBER1024_AVX2_polyvec_add(&pkpv, &pkpv, &e);
@@ -217,45 +275,52 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
}

/*************************************************
* Name: indcpa_enc
* Name: PQCLEAN_KYBER1024_AVX2_indcpa_enc
*
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
* to deterministically generate all randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
**************************************************/
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins) {
polyvec at[KYBER_K], pkpv, sp, ep, bp;
poly k, v, epp;
uint8_t seed[KYBER_SYMBYTES];
uint8_t nonce = 0;
unpack_pk(&pkpv, seed, pk);
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[KYBER_SYMBYTES]) {
unsigned int i = 0;
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
poly v, k, epp;
unpack_pk(&pkpv, seed.arr, pk);
PQCLEAN_KYBER1024_AVX2_poly_frommsg(&k, m);
gen_at(at, seed);
gen_at(at, seed.arr);

PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, nonce + 0, nonce + 1, nonce + 2, nonce + 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, nonce + 4, nonce + 5, nonce + 6, nonce + 7);
PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, nonce + 8);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins,
0, 1, 2, 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins,
4, 5, 6, 7);
PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, 8);

PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&sp);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
}

PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp);
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER1024_AVX2_polyvec_invntt(&bp);
PQCLEAN_KYBER1024_AVX2_poly_invntt(&v);
PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&v);

PQCLEAN_KYBER1024_AVX2_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &epp);
@@ -267,18 +332,21 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t *c,
}

/*************************************************
* Name: indcpa_dec
* Name: PQCLEAN_KYBER1024_AVX2_indcpa_dec
*
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t *m,
const uint8_t *c,
const uint8_t *sk) {
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
poly v, mp;

@@ -286,8 +354,8 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t *m,
unpack_sk(&skpv, sk);

PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&bp);
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp);
PQCLEAN_KYBER1024_AVX2_poly_invntt(&mp);
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&mp);

PQCLEAN_KYBER1024_AVX2_poly_sub(&mp, &v, &mp);
PQCLEAN_KYBER1024_AVX2_poly_reduce(&mp);


+ 9
- 14
crypto_kem/kyber1024/avx2/indcpa.h View File

@@ -1,21 +1,16 @@
#ifndef INDCPA_H
#define INDCPA_H
#ifndef PQCLEAN_KYBER1024_AVX2_INDCPA_H
#define PQCLEAN_KYBER1024_AVX2_INDCPA_H

#include "params.h"
#include "polyvec.h"
#include <stdint.h>

void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(
uint8_t *pk,
uint8_t *sk);
void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed);

void PQCLEAN_KYBER1024_AVX2_indcpa_enc(
uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins);
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);

void PQCLEAN_KYBER1024_AVX2_indcpa_dec(
uint8_t *m,
const uint8_t *c,
const uint8_t *sk);
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]);

void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]);

#endif

crypto_kem/kyber512/avx2/invntt.s → crypto_kem/kyber1024/avx2/invntt.S View File

@@ -1,7 +1,8 @@
#include "cdecl.inc"
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2
#update & mul
vpsubw %ymm\rh0,%ymm\rl0,%ymm12
vpsubw %ymm\rh1,%ymm\rl1,%ymm13
@@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm

.global PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx
.p2align 5
PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx:
#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0

.text
invntt_levels0t5_avx:
level0:
#zetas
vmovdqu (%rsi),%ymm15
@@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly 4,5,8,9,6,7,10,11 15,3,1,2
butterfly 4,5,8,9,6,7,10,11,15,3,1,2

level1:
#zetas
vmovdqu 128(%rsi),%ymm3
vmovdqu 160(%rsi),%ymm2

butterfly 4,5,6,7,8,9,10,11 3,3,2,2
butterfly 4,5,6,7,8,9,10,11,3,3,2,2

shuffle1 4,5,3,5
shuffle1 6,7,4,7
@@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10
vmovdqu 224(%rsi),%ymm2

#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1
vmovdqa _16XV*2(%rdx),%ymm1

butterfly 3,4,6,8,5,7,9,11 10,10,2,2
butterfly 3,4,6,8,5,7,9,11,10,10,2,2

red16 3

@@ -95,7 +92,7 @@ level3:
vmovdqu 256(%rsi),%ymm9
vmovdqu 288(%rsi),%ymm2

butterfly 10,3,6,5,4,8,7,11 9,9,2,2
butterfly 10,3,6,5,4,8,7,11,9,9,2,2

red16 10

@@ -109,7 +106,7 @@ level4:
vmovdqu 320(%rsi),%ymm7
vmovdqu 352(%rsi),%ymm2

butterfly 9,10,6,4,3,5,8,11 7,7,2,2
butterfly 9,10,6,4,3,5,8,11,7,7,2,2

red16 9

@@ -123,7 +120,7 @@ level5:
vpbroadcastd 384(%rsi),%ymm8
vpbroadcastd 388(%rsi),%ymm2

butterfly 7,9,6,3,10,4,5,11 8,8,2,2
butterfly 7,9,6,3,10,4,5,11,8,8,2,2

red16 7

@@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi)

ret

.global PQCLEAN_KYBER512_AVX2_invntt_level6_avx
PQCLEAN_KYBER512_AVX2_invntt_level6_avx:
#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0

invntt_level6_avx:
#zetas
vpbroadcastd (%rsi),%ymm1
vpbroadcastd 4(%rsi),%ymm2
@@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11
butterfly 4,5,6,7,8,9,10,11

#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xflo(%rip),%ymm12
vmovdqa PQCLEAN_KYBER512_AVX2_16xfhi(%rip),%ymm13
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,256(%rdi)
@@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)

fqmulprecomp 12,13,4 8
fqmulprecomp 12,13,5 9
fqmulprecomp 12,13,6 10
fqmulprecomp 12,13,7 11
fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,(%rdi)
@@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11
butterfly 4,5,6,7,8,9,10,11

#consts
vmovdqa PQCLEAN_KYBER512_AVX2_16xflo(%rip),%ymm12
vmovdqa PQCLEAN_KYBER512_AVX2_16xfhi(%rip),%ymm13
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,384(%rdi)
@@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi)
vmovdqa %ymm10,448(%rdi)
vmovdqa %ymm11,480(%rdi)

fqmulprecomp 12,13,4 8
fqmulprecomp 12,13,5 9
fqmulprecomp 12,13,6 10
fqmulprecomp 12,13,7 11
fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,128(%rdi)
@@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi)
vmovdqa %ymm7,224(%rdi)

ret

#exported driver for the inverse NTT: calls the levels 0-5 routine once per
#256-byte half of the coefficient array, then the level-6 routine from the start.
#review note: presumably rdi = r (coefficients) and rsi = qdata, following the
#prototype in ntt.h and the System V AMD64 argument order -- confirm.
.global cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
#keep the table base in rdx; the level routines load further constants relative to it
mov %rsi,%rdx
#advance rsi to the inverse zeta table read by the level routines
add $_ZETAS_INV_EXP*2,%rsi
call invntt_levels0t5_avx
#upper half: coefficients +256 bytes, zetas +392 bytes
add $256,%rdi
add $392,%rsi
call invntt_levels0t5_avx
#rewind the coefficient pointer and move on another 392 bytes of zetas for level 6
sub $256,%rdi
add $392,%rsi
call invntt_level6_avx
ret

+ 68
- 44
crypto_kem/kyber1024/avx2/kem.c View File

@@ -1,103 +1,127 @@
#include "api.h"
#include "align.h"
#include "indcpa.h"
#include "kem.h"
#include "params.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
#include <stddef.h>
#include <stdint.h>


#include <stdlib.h>
/*************************************************
* Name: crypto_kem_keypair
* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
size_t i;
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
size_t i = 0;
PQCLEAN_KYBER1024_AVX2_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i];
}
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */
/* Value z for pseudo-random output on reject */
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES);
return 0;
}

/*************************************************
* Name: crypto_kem_enc
* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
uint8_t buf[2 * KYBER_SYMBYTES];
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;

randombytes(buf, KYBER_SYMBYTES);
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */
randombytes(buf.arr, KYBER_SYMBYTES);
/* Don't release system RNG output */
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES);

hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */
hash_g(kr, buf, 2 * KYBER_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);

PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */
/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES);

hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */
/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
return 0;
}

/*************************************************
* Name: crypto_kem_dec
* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
*
* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
size_t i;
uint8_t fail;
union {
uint8_t x[KYBER_CIPHERTEXTBYTES];
__m256i __dummy;
} _cmp;
uint8_t *cmp = _cmp.x;
uint8_t buf[2 * KYBER_SYMBYTES];
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
size_t i = 0;
int fail = 0;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;

PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf, ct, sk);
PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf.arr, ct, sk);

for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */
/* Multitarget countermeasure for coins + contributory KEM */
for (i = 0; i < KYBER_SYMBYTES; i++) {
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
}
hash_g(kr, buf, 2 * KYBER_SYMBYTES);
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);

PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */
/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES);

fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES);

hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */
/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);

PQCLEAN_KYBER1024_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */
/* Overwrite pre-k with z on re-encryption failure */
PQCLEAN_KYBER1024_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail);

kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
return 0;
}

+ 19
- 0
crypto_kem/kyber1024/avx2/kem.h View File

@@ -0,0 +1,19 @@
#ifndef PQCLEAN_KYBER1024_AVX2_KEM_H
#define PQCLEAN_KYBER1024_AVX2_KEM_H

#include "params.h"


/*
 * Generates a public/private key pair for the CCA-secure Kyber KEM.
 *   pk: output public key  (caller-allocated, CRYPTO_PUBLICKEYBYTES bytes)
 *   sk: output private key (caller-allocated, CRYPTO_SECRETKEYBYTES bytes)
 * Returns 0.
 */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);


/*
 * Generates a ciphertext and a shared secret for the given public key.
 *   ct: output ciphertext    (caller-allocated, CRYPTO_CIPHERTEXTBYTES bytes)
 *   ss: output shared secret (caller-allocated, CRYPTO_BYTES bytes)
 *   pk: input public key     (CRYPTO_PUBLICKEYBYTES bytes)
 * Returns 0.
 */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk);


/*
 * Recovers the shared secret from a ciphertext and a private key.
 *   ss: output shared secret (caller-allocated, CRYPTO_BYTES bytes)
 *   ct: input ciphertext     (CRYPTO_CIPHERTEXTBYTES bytes)
 *   sk: input private key    (CRYPTO_SECRETKEYBYTES bytes)
 * Always returns 0; when the re-encryption check fails, ss is filled with a
 * pseudo-random value instead of signalling an error.
 */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk);

#endif

crypto_kem/kyber1024-90s/avx2/ntt.s → crypto_kem/kyber1024/avx2/ntt.S View File

@@ -1,7 +1,8 @@
#include "cdecl.inc"
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1
#mul
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
@@ -36,7 +37,7 @@ vpaddw %ymm15,%ymm\rl3,%ymm\rl3
# We break the dependency chains with the cost of slightly more additions.
# But they can be run in parallel to the multiplications on execution port 5
# (multiplications only go to ports 0 and 1)
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1
#mul
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x
@@ -73,11 +74,8 @@ vpaddw %ymm15,%ymm\rh3,%ymm\rh3
vpsubw %ymm15,%ymm\rl3,%ymm\rl3
.endm

.global PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx
PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx:
#consts
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0

.text
ntt_level0_avx:
level0:
#zetas
vpbroadcastd (%rsi),%ymm15
@@ -107,11 +105,7 @@ vmovdqa %ymm11,352(%rdi)

ret

.global PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx
PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx:
#consts
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0

ntt_levels1t6_avx:
level1:
#zetas
vpbroadcastd (%rsi),%ymm15
@@ -127,7 +121,7 @@ vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly2 4,5,6,7,8,9,10,11 3
butterfly2 4,5,6,7,8,9,10,11,3

level2:
#zetas
@@ -139,7 +133,7 @@ shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly2 3,8,4,9,5,10,6,11 7
butterfly2 3,8,4,9,5,10,6,11,7

level3:
#zetas
@@ -151,7 +145,7 @@ shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly2 7,5,3,10,8,6,4,11 9
butterfly2 7,5,3,10,8,6,4,11,9

level4:
#zetas
@@ -163,7 +157,7 @@ shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

butterfly2 9,8,7,6,5,4,3,11 10
butterfly2 9,8,7,6,5,4,3,11,10

level5:
#zetas
@@ -175,7 +169,7 @@ shuffle1 8,4,9,4
shuffle1 7,3,8,3
shuffle1 6,11,7,11

butterfly2 10,5,9,4,8,3,7,11 6
butterfly2 10,5,9,4,8,3,7,11,6

level6:
#zetas
@@ -184,17 +178,17 @@ vmovdqu 328(%rsi),%ymm15
vmovdqu 296(%rsi),%ymm1
vmovdqu 360(%rsi),%ymm2

butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2
butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2

vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1
red16 10 12
red16 5 13
red16 9 14
red16 4 15
red16 8 2
red16 3 6
red16 7 12
red16 11 13
vmovdqa _16XV*2(%rdx),%ymm1
red16 10,12
red16 5,13
red16 9,14
red16 4,15
red16 8,2
red16 3,6
red16 7,12
red16 11,13

#store
vmovdqa %ymm10,(%rdi)
@@ -207,3 +201,20 @@ vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

#exported driver for the forward NTT: calls the level-0 routine twice (coefficient
#pointer at +0 and +128 bytes), then the levels 1-6 routine once per 256-byte half.
#review note: presumably rdi = r (coefficients) and rsi = qdata, following the
#prototype in ntt.h and the System V AMD64 argument order -- confirm.
.global cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
#keep the table base in rdx; the level routines load further constants relative to it
mov %rsi,%rdx
#advance rsi to the zeta table read by the level routines
add $_ZETAS_EXP*2,%rsi
call ntt_level0_avx
#second level-0 call, 128 bytes further into the coefficients, same zetas
add $128,%rdi
call ntt_level0_avx
#rewind the coefficient pointer and skip 8 bytes of zetas (presumably the level-0 entries)
sub $128,%rdi
add $8,%rsi
call ntt_levels1t6_avx
#upper half: coefficients +256 bytes, zetas +392 bytes
add $256,%rdi
add $392,%rsi
call ntt_levels1t6_avx
ret

+ 20
- 12
crypto_kem/kyber1024/avx2/ntt.h View File

@@ -2,19 +2,27 @@
#define NTT_H

#include "consts.h"
#include "params.h"
#include <stdint.h>

void PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas);
void PQCLEAN_KYBER1024_AVX2_nttpack_avx(int16_t *r);
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r);
void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta);
void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta);

void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a);
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a);

void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);


/* NOTE(review): unlike the neighbouring declarations in this header, this one
 * lacks the PQCLEAN_KYBER1024_AVX2_ namespace prefix -- confirm which symbol
 * name the assembly actually exports before relying on this prototype. */
void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);


void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);


void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

#endif

+ 10
- 10
crypto_kem/kyber1024/avx2/params.h View File

@@ -1,8 +1,5 @@
#ifndef PARAMS_H
#define PARAMS_H


/* Don't change parameters below this line */
#ifndef PQCLEAN_KYBER1024_AVX2_PARAMS_H
#define PQCLEAN_KYBER1024_AVX2_PARAMS_H

#define KYBER_N 256
#define KYBER_Q 3329
@@ -12,9 +9,8 @@
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

#define KYBER_POLYBYTES 384
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_POLYBYTES 384
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_POLYCOMPRESSEDBYTES 160
@@ -23,10 +19,14 @@
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \
+ KYBER_POLYCOMPRESSEDBYTES)

#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */
/* 32 bytes of additional space to save H(pk) */
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \
+ KYBER_INDCPA_PUBLICKEYBYTES \
+ 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES

#endif

+ 231
- 229
crypto_kem/kyber1024/avx2/poly.c View File

@@ -1,132 +1,242 @@
#include "align.h"
#include "cbd.h"
#include "consts.h"
#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "symmetric.h"

#include <immintrin.h>
#include <stdint.h>

/*************************************************
* Name: poly_compress
* Name: PQCLEAN_KYBER1024_AVX2_poly_compress
*
* Description: Compression and subsequent serialization of a polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const poly *a: pointer to input polynomial
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a) {
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) {
unsigned int i = 0, j = 0;
uint8_t t[8];
size_t i, j, k = 0;

PQCLEAN_KYBER1024_AVX2_poly_csubq(a);

for (i = 0; i < KYBER_N; i += 8) {
for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = (uint8_t)(((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31);
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}

r[k] = (uint8_t)( t[0] | (t[1] << 5));
r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4));
r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3));
k += 5;
r[0] = (t[0] >> 0) | (t[1] << 5);
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
r[2] = (t[3] >> 1) | (t[4] << 4);
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
r[4] = (t[6] >> 2) | (t[7] << 3);
r += 5;
}
}

/*************************************************
* Name: poly_decompress
* Name: PQCLEAN_KYBER1024_AVX2_poly_decompress
*
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of poly_compress
* approximate inverse of PQCLEAN_KYBER1024_AVX2_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a) {
size_t i;
for (i = 0; i < KYBER_N; i += 8) {
r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5);
r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5);
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r,
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) {
unsigned int i = 0;

unsigned int j = 0;
uint8_t t[8];
for (i = 0; i < KYBER_N / 8; i++) {
t[0] = (a[0] >> 0);
t[1] = (a[0] >> 5) | (a[1] << 3);
t[2] = (a[1] >> 2);
t[3] = (a[1] >> 7) | (a[2] << 1);
t[4] = (a[2] >> 4) | (a[3] << 4);
t[5] = (a[3] >> 1);
t[6] = (a[3] >> 6) | (a[4] << 2);
t[7] = (a[4] >> 3);
a += 5;

for (j = 0; j < 8; j++) {
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5;
}
}
}

/*************************************************
* Name: poly_tobytes
* Name: PQCLEAN_KYBER1024_AVX2_poly_tobytes
*
* Description: Serialization of a polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const poly *a: pointer to input polynomial
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a) {
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs);
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128);
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

/*************************************************
* Name: poly_frombytes
* Name: PQCLEAN_KYBER1024_AVX2_poly_frombytes
*
* Description: De-serialization of a polynomial;
* inverse of poly_tobytes
* inverse of PQCLEAN_KYBER1024_AVX2_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) {
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER1024_AVX2_qdata);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_frommsg
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a) {
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a);
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192);
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r,
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
/* per-dword left-shift counts 3,2,1,0 (element 0 first), repeated in both 128-bit lanes */
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3));
/* byte shuffle applied inside each 128-bit lane: moves 16-bit words 0,2,4,6
 * to the low half and words 1,3,5,7 to the high half */
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
/* (KYBER_Q+1)/2 in every 16-bit element: the value stored for a set message bit */
const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2);

/*
 * FROMMSG64(i): takes 32-bit word i from each 128-bit lane of f (64 message
 * bits in total) and expands it into 64 coefficients, each either 0 or
 * (KYBER_Q+1)/2. The slli/srai-by-15 pairs turn one selected bit per 16-bit
 * element into an all-ones or all-zeros mask, which is then ANDed with hqs.
 * The results are written with aligned stores as four 16-coefficient vectors
 * at r->coeffs[32*i], [32*i+16], [128+32*i] and [128+32*i+16].
 */
#define FROMMSG64(i) \
g3 = _mm256_shuffle_epi32(f,0x55*(i)); \
g3 = _mm256_sllv_epi32(g3,shift); \
g3 = _mm256_shuffle_epi8(g3,idx); \
g0 = _mm256_slli_epi16(g3,12); \
g1 = _mm256_slli_epi16(g3,8); \
g2 = _mm256_slli_epi16(g3,4); \
g0 = _mm256_srai_epi16(g0,15); \
g1 = _mm256_srai_epi16(g1,15); \
g2 = _mm256_srai_epi16(g2,15); \
g3 = _mm256_srai_epi16(g3,15); \
g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \
g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \
g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \
g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \
h0 = _mm256_unpacklo_epi64(g0,g1); \
h2 = _mm256_unpackhi_epi64(g0,g1); \
h1 = _mm256_unpacklo_epi64(g2,g3); \
h3 = _mm256_unpackhi_epi64(g2,g3); \
g0 = _mm256_permute2x128_si256(h0,h1,0x20); \
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3)

/* aligned 32-byte load of the whole message: msg must be 32-byte aligned */
f = _mm256_load_si256((__m256i *)msg);
/* four expansions of 64 bits each cover all 256 coefficients */
FROMMSG64(0);
FROMMSG64(1);
FROMMSG64(2);
FROMMSG64(3);
}

/*************************************************
* Name: poly_getnoise
* Name: PQCLEAN_KYBER1024_AVX2_poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) {
unsigned int i = 0;
uint32_t small = 0;
__m256i f0, f1, g0, g1;
/* (KYBER_Q-1)/2 and (KYBER_Q-5)/4 replicated into every 16-bit element */
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);

/* each iteration turns 32 coefficients into 32 message bits (4 bytes) */
for (i = 0; i < KYBER_N / 32; i++) {
/* aligned loads: a->coeffs must be 32-byte aligned */
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]);
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]);
/* x = (KYBER_Q-1)/2 - coefficient */
f0 = _mm256_sub_epi16(hqs, f0);
f1 = _mm256_sub_epi16(hqs, f1);
/* g = sign mask of x; x ^ g keeps x when x >= 0 and yields -x-1 otherwise */
g0 = _mm256_srai_epi16(f0, 15);
g1 = _mm256_srai_epi16(f1, 15);
f0 = _mm256_xor_si256(f0, g0);
f1 = _mm256_xor_si256(f1, g1);
/* (KYBER_Q-5)/4 minus that distance: the sign bit ends up set for
 * coefficients far from (KYBER_Q-1)/2 */
f0 = _mm256_sub_epi16(hhqs, f0);
f1 = _mm256_sub_epi16(hhqs, f1);
/* saturating pack to bytes preserves the sign; movemask gathers the 32 sign bits */
f0 = _mm256_packs_epi16(f0, f1);
small = _mm256_movemask_epi8(f0);
/* invert so that a set bit marks a coefficient close to (KYBER_Q-1)/2 */
small = ~small;
/* the pack operates per 128-bit lane, so mask bits 8..15 belong to
 * coefficients 16..23 and bits 16..23 to coefficients 8..15; the two middle
 * bytes are therefore written swapped to restore coefficient order */
msg[4 * i + 0] = small;
msg[4 * i + 1] = small >> 16;
msg[4 * i + 2] = small >> 8;
msg[4 * i + 3] = small >> 24;
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) {
uint8_t buf[KYBER_ETA * KYBER_N / 4];

prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce);
PQCLEAN_KYBER1024_AVX2_cbd(r, buf);
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf;
prf(buf.arr, sizeof(buf.arr), seed, nonce);
PQCLEAN_KYBER1024_AVX2_cbd(r, buf.arr);
}

// FIXME
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
poly *r1,
poly *r2,
poly *r3,
const uint8_t *seed,
const uint8_t seed[32],
uint8_t nonce0,
uint8_t nonce1,
uint8_t nonce2,
uint8_t nonce3) {
uint8_t buf[4][SHAKE256_RATE];

PQCLEAN_KYBER1024_AVX2_shake256x4_prf(buf[0], buf[1], buf[2], buf[3], SHAKE256_RATE, seed, nonce0, nonce1, nonce2, nonce3);

PQCLEAN_KYBER1024_AVX2_cbd(r0, buf[0]);
PQCLEAN_KYBER1024_AVX2_cbd(r1, buf[1]);
PQCLEAN_KYBER1024_AVX2_cbd(r2, buf[2]);
PQCLEAN_KYBER1024_AVX2_cbd(r3, buf[3]);
ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf;
__m256i f;
keccakx4_state state;

f = _mm256_load_si256((__m256i *)seed);
_mm256_store_si256((__m256i *)buf.arr[0], f);
_mm256_store_si256((__m256i *)buf.arr[1], f);
_mm256_store_si256((__m256i *)buf.arr[2], f);
_mm256_store_si256((__m256i *)buf.arr[3], f);

buf.arr[0][32] = nonce0;
buf.arr[1][32] = nonce1;
buf.arr[2][32] = nonce2;
buf.arr[3][32] = nonce3;

PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33);
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state);

PQCLEAN_KYBER1024_AVX2_cbd(r0, buf.arr[0]);
PQCLEAN_KYBER1024_AVX2_cbd(r1, buf.arr[1]);
PQCLEAN_KYBER1024_AVX2_cbd(r2, buf.arr[2]);
PQCLEAN_KYBER1024_AVX2_cbd(r3, buf.arr[3]);
}

/*************************************************
* Name: poly_ntt
* Name: PQCLEAN_KYBER1024_AVX2_poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
@@ -135,73 +245,78 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) {
PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_exp);
PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER1024_AVX2_zetas_exp);
PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_exp + 4);
PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER1024_AVX2_zetas_exp + 200);
PQCLEAN_KYBER1024_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

/*************************************************
* Name: poly_invntt
* Name: PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place;
* inputs assumed to be in bitreversed order, output in normal order
*
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r) {
PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp);
PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp + 196);
PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp + 392);
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r) {
PQCLEAN_KYBER1024_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

// FIXME
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) {
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs);
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs + 128);
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

//XXX Add comment
void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) {
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs,
a->coeffs,
b->coeffs,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 152);
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 64,
a->coeffs + 64,
b->coeffs + 64,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 184);
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 128,
a->coeffs + 128,
b->coeffs + 128,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 348);
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 192,
a->coeffs + 192,
b->coeffs + 192,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 380);
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery
*
* Description: Multiplication of two polynomials in NTT domain
*
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) {
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

// FIXME
void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r) {
PQCLEAN_KYBER1024_AVX2_frommont_avx(r->coeffs);
PQCLEAN_KYBER1024_AVX2_frommont_avx(r->coeffs + 128);
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_tomont
*
* Description: Inplace conversion of all coefficients of a polynomial
* from normal domain to Montgomery domain
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) {
PQCLEAN_KYBER1024_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_reduce
*
* Description: Applies Barrett reduction to all coefficients of a polynomial
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) {
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs);
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs + 128);
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of a polynomial. For details of conditional subtraction
* of q see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) {
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs);
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs + 128);
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

/*************************************************
* Name: poly_add
* Name: PQCLEAN_KYBER1024_AVX2_poly_add
*
* Description: Add two polynomials
*
@@ -210,18 +325,19 @@ void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
__m256i vec0, vec1;

for (size_t i = 0; i < KYBER_N; i += 16) {
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
vec0 = _mm256_add_epi16(vec0, vec1);
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0);
unsigned int i = 0;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
f0 = _mm256_add_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
}
}

/*************************************************
* Name: poly_sub
* Name: PQCLEAN_KYBER1024_AVX2_poly_sub
*
* Description: Subtract two polynomials
*
@@ -230,127 +346,13 @@ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) {
__m256i vec0, vec1;

for (size_t i = 0; i < KYBER_N; i += 16) {
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
vec0 = _mm256_sub_epi16(vec0, vec1);
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0);
}
}

/*************************************************
* Name: poly_frommsg
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) {
__m128i tmp;
__m256i a[4], d0, d1, d2, d3;
const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
const __m256i zeros = _mm256_setzero_si256();
const __m256i ones = _mm256_set1_epi32(1);
const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2);

tmp = _mm_loadu_si128((__m128i *)msg);
for (size_t i = 0; i < 4; i++) {
a[i] = _mm256_broadcastd_epi32(tmp);
tmp = _mm_srli_si128(tmp, 4);
}

for (size_t i = 0; i < 4; i++) {
d0 = _mm256_srlv_epi32(a[i], shift);
d1 = _mm256_srli_epi32(d0, 8);
d2 = _mm256_srli_epi32(d0, 16);
d3 = _mm256_srli_epi32(d0, 24);

d0 = _mm256_and_si256(d0, ones);
d1 = _mm256_and_si256(d1, ones);
d2 = _mm256_and_si256(d2, ones);
d3 = _mm256_and_si256(d3, ones);

d0 = _mm256_sub_epi32(zeros, d0);
d1 = _mm256_sub_epi32(zeros, d1);
d2 = _mm256_sub_epi32(zeros, d2);
d3 = _mm256_sub_epi32(zeros, d3);

d0 = _mm256_and_si256(hqs, d0);
d1 = _mm256_and_si256(hqs, d1);
d2 = _mm256_and_si256(hqs, d2);
d3 = _mm256_and_si256(hqs, d3);

d0 = _mm256_packus_epi32(d0, d1);
d2 = _mm256_packus_epi32(d2, d3);
d0 = _mm256_permute4x64_epi64(d0, 0xD8);
d2 = _mm256_permute4x64_epi64(d2, 0xD8);
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0);
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2);
}

tmp = _mm_loadu_si128((__m128i *)&msg[16]);
for (size_t i = 0; i < 4; i++) {
a[i] = _mm256_broadcastd_epi32(tmp);
tmp = _mm_srli_si128(tmp, 4);
}

for (size_t i = 0; i < 4; i++) {
d0 = _mm256_srlv_epi32(a[i], shift);
d1 = _mm256_srli_epi32(d0, 8);
d2 = _mm256_srli_epi32(d0, 16);
d3 = _mm256_srli_epi32(d0, 24);

d0 = _mm256_and_si256(d0, ones);
d1 = _mm256_and_si256(d1, ones);
d2 = _mm256_and_si256(d2, ones);
d3 = _mm256_and_si256(d3, ones);

d0 = _mm256_sub_epi32(zeros, d0);
d1 = _mm256_sub_epi32(zeros, d1);
d2 = _mm256_sub_epi32(zeros, d2);
d3 = _mm256_sub_epi32(zeros, d3);

d0 = _mm256_and_si256(hqs, d0);
d1 = _mm256_and_si256(hqs, d1);
d2 = _mm256_and_si256(hqs, d2);
d3 = _mm256_and_si256(hqs, d3);

d0 = _mm256_packus_epi32(d0, d1);
d2 = _mm256_packus_epi32(d2, d3);
d0 = _mm256_permute4x64_epi64(d0, 0xD8);
d2 = _mm256_permute4x64_epi64(d2, 0xD8);
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0);
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2);
}
}

/*************************************************
* Name: poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) {
uint32_t small;
__m256i vec, tmp;
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);

for (size_t i = 0; i < KYBER_N / 16; i++) {
vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]);
vec = _mm256_sub_epi16(hqs, vec);
tmp = _mm256_srai_epi16(vec, 15);
vec = _mm256_xor_si256(vec, tmp);
vec = _mm256_sub_epi16(hhqs, vec);
small = (uint32_t)_mm256_movemask_epi8(vec);
small = _pext_u32(small, 0xAAAAAAAA);
small = ~small;
msg[2 * i + 0] = (uint8_t)small;
msg[2 * i + 1] = (uint8_t)(small >> 8);
unsigned int i = 0;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
f0 = _mm256_sub_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
}
}

+ 29
- 14
crypto_kem/kyber1024/avx2/poly.h View File

@@ -1,8 +1,7 @@
#ifndef POLY_H
#define POLY_H
#ifndef PQCLEAN_KYBER1024_AVX2_POLY_H
#define PQCLEAN_KYBER1024_AVX2_POLY_H

#include "params.h"

#include <immintrin.h>
#include <stdint.h>

@@ -11,20 +10,28 @@
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
*/
typedef union {
__m256i dummy;
int16_t coeffs[KYBER_N];
__m256i _dummy;
} poly;

void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a);
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a);

void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a);
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a);
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);

void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);


void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);

void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);


void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);

void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);


void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]);
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a);
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce);
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
poly *r1,
poly *r2,
@@ -37,15 +44,23 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,


void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r);
void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r);

void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r);

void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r);
void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r);

void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b);

void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r);


void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r);

void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r);


void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b);

void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b);

#endif

+ 107
- 75
crypto_kem/kyber1024/avx2/polyvec.c View File

@@ -1,167 +1,198 @@
#include "params.h"
#include "consts.h"
#include "ntt.h"
#include "poly.h"
#include "polyvec.h"

#include <stdint.h>

/*************************************************
* Name: polyvec_compress
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_compress
*
* Description: Compress and serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const polyvec *a: pointer to input vector of polynomials
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t *r, polyvec *a) {
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES],
polyvec *restrict a) {
unsigned int i = 0, j = 0, k = 0;

PQCLEAN_KYBER1024_AVX2_polyvec_csubq(a);

uint16_t t[8];
for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_N / 8; j++) {
for (size_t k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff;
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
{
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
}
}

r[11 * j + 0] = (uint8_t)t[0];
r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3));
r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6));
r[11 * j + 3] = (uint8_t)((t[2] >> 2));
r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1));
r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4));
r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7));
r[11 * j + 7] = (uint8_t)((t[5] >> 1));
r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2));
r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5));
r[11 * j + 10] = (uint8_t)((t[7] >> 3));
r[ 0] = (t[0] >> 0);
r[ 1] = (t[0] >> 8) | (t[1] << 3);
r[ 2] = (t[1] >> 5) | (t[2] << 6);
r[ 3] = (t[2] >> 2);
r[ 4] = (t[2] >> 10) | (t[3] << 1);
r[ 5] = (t[3] >> 7) | (t[4] << 4);
r[ 6] = (t[4] >> 4) | (t[5] << 7);
r[ 7] = (t[5] >> 1);
r[ 8] = (t[5] >> 9) | (t[6] << 2);
r[ 9] = (t[6] >> 6) | (t[7] << 5);
r[10] = (t[7] >> 3);
r += 11;
}
r += 352;
}
}

/*************************************************
* Name: polyvec_decompress
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_decompress
*
* Description: De-serialize and decompress vector of polynomials;
* approximate inverse of polyvec_compress
* approximate inverse of PQCLEAN_KYBER1024_AVX2_polyvec_compress
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - uint8_t *a: pointer to input byte array
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_K; i++) {
for (size_t j = 0; j < KYBER_N / 8; j++) {
r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11);
r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11);
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *restrict r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
unsigned int i = 0, j = 0, k = 0;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
a += 11;

for (k = 0; k < 8; k++) {
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
}
}
a += 352;
}
}

/*************************************************
* Name: polyvec_tobytes
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_tobytes
*
* Description: Serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array
* - const polyvec *a: pointer to input vector of polynomials
* (needs space for KYBER_POLYVECBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) {
for (size_t i = 0; i < KYBER_K; i++) {
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
}
}

/*************************************************
* Name: polyvec_frombytes
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_frombytes
*
* Description: De-serialize vector of polynomials;
* inverse of polyvec_tobytes
* inverse of PQCLEAN_KYBER1024_AVX2_polyvec_tobytes
*
* Arguments: - uint8_t *r: pointer to output byte array
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECBYTES)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) {
for (size_t i = 0; i < KYBER_K; i++) {
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES);
}
}

/*************************************************
* Name: polyvec_ntt
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_ntt
*
* Description: Apply forward NTT to all elements of a vector of polynomials
*
* Arguments: - polyvec *r: pointer to in/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_ntt(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_invntt
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont
*
* Description: Apply inverse NTT to all elements of a vector of polynomials
* and multiply by Montgomery factor 2^16
*
* Arguments: - polyvec *r: pointer to in/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_invntt(&r->vec[i]);
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_pointwise_acc
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery
*
* Description: Pointwise multiply elements of a and b and accumulate into r
* Description: Pointwise multiply elements of a and b, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) {
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs,
a->vec->coeffs,
b->vec->coeffs,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 152);
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 64,
a->vec->coeffs + 64,
b->vec->coeffs + 64,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 184);
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 128,
a->vec->coeffs + 128,
b->vec->coeffs + 128,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 348);
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 192,
a->vec->coeffs + 192,
b->vec->coeffs + 192,
PQCLEAN_KYBER1024_AVX2_zetas_exp + 380);
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_reduce(&r->vec[i]);
}
}

// FIXME
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of each element of a vector of polynomials
* for details of conditional subtraction of q see comments in
* reduce.c
*
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_csubq(&r->vec[i]);
}
}

/*************************************************
* Name: polyvec_add
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_add
*
* Description: Add vectors of polynomials
*
@@ -170,7 +201,8 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) {
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) {
for (size_t i = 0; i < KYBER_K; i++) {
unsigned int i = 0;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
}
}

+ 21
- 9
crypto_kem/kyber1024/avx2/polyvec.h View File

@@ -1,29 +1,41 @@
#ifndef POLYVEC_H
#define POLYVEC_H
#ifndef PQCLEAN_KYBER1024_AVX2_POLYVEC_H
#define PQCLEAN_KYBER1024_AVX2_POLYVEC_H

#include "params.h"
#include "poly.h"

#include <stdint.h>

/*
 * Vector of KYBER_K polynomials. The polyvec_* routines process it by
 * iterating over vec[0] .. vec[KYBER_K-1] and applying the matching
 * poly_* operation to each element.
 */
typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t *r, polyvec *a);
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a);

void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a);
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a);
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);

void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);


void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);

void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);


void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt(polyvec *r);

void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b);
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r);


void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);


void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r);

void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r);


void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);

#endif

+ 9
- 3
crypto_kem/kyber1024/avx2/reduce.h View File

@@ -3,8 +3,14 @@

#include <stdint.h>

int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r);
int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r);
int16_t PQCLEAN_KYBER1024_AVX2_frommont_avx(int16_t *r);
#include "consts.h"
#include "params.h"


int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

int16_t PQCLEAN_KYBER1024_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);

#endif

+ 325
- 351
crypto_kem/kyber1024/avx2/rejsample.c View File

@@ -1,386 +1,360 @@
#include "align.h"
#include "consts.h"
#include "params.h"
#include "rejsample.h"

#include <immintrin.h>
#include <stdint.h>

static const uint8_t idx[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 8, 0, 0, 0, 0, 0, 0, 0},
{ 0, 8, 0, 0, 0, 0, 0, 0},
{ 2, 8, 0, 0, 0, 0, 0, 0},
{ 0, 2, 8, 0, 0, 0, 0, 0},
{ 4, 8, 0, 0, 0, 0, 0, 0},
{ 0, 4, 8, 0, 0, 0, 0, 0},
{ 2, 4, 8, 0, 0, 0, 0, 0},
{ 0, 2, 4, 8, 0, 0, 0, 0},
{ 6, 8, 0, 0, 0, 0, 0, 0},
{ 0, 6, 8, 0, 0, 0, 0, 0},
{ 2, 6, 8, 0, 0, 0, 0, 0},
{ 0, 2, 6, 8, 0, 0, 0, 0},
{ 4, 6, 8, 0, 0, 0, 0, 0},
{ 0, 4, 6, 8, 0, 0, 0, 0},
{ 2, 4, 6, 8, 0, 0, 0, 0},
{ 0, 2, 4, 6, 8, 0, 0, 0},
{10, 0, 0, 0, 0, 0, 0, 0},
{ 0, 10, 0, 0, 0, 0, 0, 0},
{ 2, 10, 0, 0, 0, 0, 0, 0},
{ 0, 2, 10, 0, 0, 0, 0, 0},
{ 4, 10, 0, 0, 0, 0, 0, 0},
{ 0, 4, 10, 0, 0, 0, 0, 0},
{ 2, 4, 10, 0, 0, 0, 0, 0},
{ 0, 2, 4, 10, 0, 0, 0, 0},
{ 6, 10, 0, 0, 0, 0, 0, 0},
{ 0, 6, 10, 0, 0, 0, 0, 0},
{ 2, 6, 10, 0, 0, 0, 0, 0},
{ 0, 2, 6, 10, 0, 0, 0, 0},
{ 4, 6, 10, 0, 0, 0, 0, 0},
{ 0, 4, 6, 10, 0, 0, 0, 0},
{ 2, 4, 6, 10, 0, 0, 0, 0},
{ 0, 2, 4, 6, 10, 0, 0, 0},
{ 8, 10, 0, 0, 0, 0, 0, 0},
{ 0, 8, 10, 0, 0, 0, 0, 0},
{ 2, 8, 10, 0, 0, 0, 0, 0},
{ 0, 2, 8, 10, 0, 0, 0, 0},
{ 4, 8, 10, 0, 0, 0, 0, 0},
{ 0, 4, 8, 10, 0, 0, 0, 0},
{ 2, 4, 8, 10, 0, 0, 0, 0},
{ 0, 2, 4, 8, 10, 0, 0, 0},
{ 6, 8, 10, 0, 0, 0, 0, 0},
{ 0, 6, 8, 10, 0, 0, 0, 0},
{ 2, 6, 8, 10, 0, 0, 0, 0},
{ 0, 2, 6, 8, 10, 0, 0, 0},
{ 4, 6, 8, 10, 0, 0, 0, 0},
{ 0, 4, 6, 8, 10, 0, 0, 0},
{ 2, 4, 6, 8, 10, 0, 0, 0},
{ 0, 2, 4, 6, 8, 10, 0, 0},
{12, 0, 0, 0, 0, 0, 0, 0},
{ 0, 12, 0, 0, 0, 0, 0, 0},
{ 2, 12, 0, 0, 0, 0, 0, 0},
{ 0, 2, 12, 0, 0, 0, 0, 0},
{ 4, 12, 0, 0, 0, 0, 0, 0},
{ 0, 4, 12, 0, 0, 0, 0, 0},
{ 2, 4, 12, 0, 0, 0, 0, 0},
{ 0, 2, 4, 12, 0, 0, 0, 0},
{ 6, 12, 0, 0, 0, 0, 0, 0},
{ 0, 6, 12, 0, 0, 0, 0, 0},
{ 2, 6, 12, 0, 0, 0, 0, 0},
{ 0, 2, 6, 12, 0, 0, 0, 0},
{ 4, 6, 12, 0, 0, 0, 0, 0},
{ 0, 4, 6, 12, 0, 0, 0, 0},
{ 2, 4, 6, 12, 0, 0, 0, 0},
{ 0, 2, 4, 6, 12, 0, 0, 0},
{ 8, 12, 0, 0, 0, 0, 0, 0},
{ 0, 8, 12, 0, 0, 0, 0, 0},
{ 2, 8, 12, 0, 0, 0, 0, 0},
{ 0, 2, 8, 12, 0, 0, 0, 0},
{ 4, 8, 12, 0, 0, 0, 0, 0},
{ 0, 4, 8, 12, 0, 0, 0, 0},
{ 2, 4, 8, 12, 0, 0, 0, 0},
{ 0, 2, 4, 8, 12, 0, 0, 0},
{ 6, 8, 12, 0, 0, 0, 0, 0},
{ 0, 6, 8, 12, 0, 0, 0, 0},
{ 2, 6, 8, 12, 0, 0, 0, 0},
{ 0, 2, 6, 8, 12, 0, 0, 0},
{ 4, 6, 8, 12, 0, 0, 0, 0},
{ 0, 4, 6, 8, 12, 0, 0, 0},
{ 2, 4, 6, 8, 12, 0, 0, 0},
{ 0, 2, 4, 6, 8, 12, 0, 0},
{10, 12, 0, 0, 0, 0, 0, 0},
{ 0, 10, 12, 0, 0, 0, 0, 0},
{ 2, 10, 12, 0, 0, 0, 0, 0},
{ 0, 2, 10, 12, 0, 0, 0, 0},
{ 4, 10, 12, 0, 0, 0, 0, 0},
{ 0, 4, 10, 12, 0, 0, 0, 0},
{ 2, 4, 10, 12, 0, 0, 0, 0},
{ 0, 2, 4, 10, 12, 0, 0, 0},
{ 6, 10, 12, 0, 0, 0, 0, 0},
{ 0, 6, 10, 12, 0, 0, 0, 0},
{ 2, 6, 10, 12, 0, 0, 0, 0},
{ 0, 2, 6, 10, 12, 0, 0, 0},
{ 4, 6, 10, 12, 0, 0, 0, 0},
{ 0, 4, 6, 10, 12, 0, 0, 0},
{ 2, 4, 6, 10, 12, 0, 0, 0},
{ 0, 2, 4, 6, 10, 12, 0, 0},
{ 8, 10, 12, 0, 0, 0, 0, 0},
{ 0, 8, 10, 12, 0, 0, 0, 0},
{ 2, 8, 10, 12, 0, 0, 0, 0},
{ 0, 2, 8, 10, 12, 0, 0, 0},
{ 4, 8, 10, 12, 0, 0, 0, 0},
{ 0, 4, 8, 10, 12, 0, 0, 0},
{ 2, 4, 8, 10, 12, 0, 0, 0},
{ 0, 2, 4, 8, 10, 12, 0, 0},
{ 6, 8, 10, 12, 0, 0, 0, 0},
{ 0, 6, 8, 10, 12, 0, 0, 0},
{ 2, 6, 8, 10, 12, 0, 0, 0},
{ 0, 2, 6, 8, 10, 12, 0, 0},
{ 4, 6, 8, 10, 12, 0, 0, 0},
{ 0, 4, 6, 8, 10, 12, 0, 0},
{ 2, 4, 6, 8, 10, 12, 0, 0},
{ 0, 2, 4, 6, 8, 10, 12, 0},
{14, 0, 0, 0, 0, 0, 0, 0},
{ 0, 14, 0, 0, 0, 0, 0, 0},
{ 2, 14, 0, 0, 0, 0, 0, 0},
{ 0, 2, 14, 0, 0, 0, 0, 0},
{ 4, 14, 0, 0, 0, 0, 0, 0},
{ 0, 4, 14, 0, 0, 0, 0, 0},
{ 2, 4, 14, 0, 0, 0, 0, 0},
{ 0, 2, 4, 14, 0, 0, 0, 0},
{ 6, 14, 0, 0, 0, 0, 0, 0},
{ 0, 6, 14, 0, 0, 0, 0, 0},
{ 2, 6, 14, 0, 0, 0, 0, 0},
{ 0, 2, 6, 14, 0, 0, 0, 0},
{ 4, 6, 14, 0, 0, 0, 0, 0},
{ 0, 4, 6, 14, 0, 0, 0, 0},
{ 2, 4, 6, 14, 0, 0, 0, 0},
{ 0, 2, 4, 6, 14, 0, 0, 0},
{ 8, 14, 0, 0, 0, 0, 0, 0},
{ 0, 8, 14, 0, 0, 0, 0, 0},
{ 2, 8, 14, 0, 0, 0, 0, 0},
{ 0, 2, 8, 14, 0, 0, 0, 0},
{ 4, 8, 14, 0, 0, 0, 0, 0},
{ 0, 4, 8, 14, 0, 0, 0, 0},
{ 2, 4, 8, 14, 0, 0, 0, 0},
{ 0, 2, 4, 8, 14, 0, 0, 0},
{ 6, 8, 14, 0, 0, 0, 0, 0},
{ 0, 6, 8, 14, 0, 0, 0, 0},
{ 2, 6, 8, 14, 0, 0, 0, 0},
{ 0, 2, 6, 8, 14, 0, 0, 0},
{ 4, 6, 8, 14, 0, 0, 0, 0},
{ 0, 4, 6, 8, 14, 0, 0, 0},
{ 2, 4, 6, 8, 14, 0, 0, 0},
{ 0, 2, 4, 6, 8, 14, 0, 0},
{10, 14, 0, 0, 0, 0, 0, 0},
{ 0, 10, 14, 0, 0, 0, 0, 0},
{ 2, 10, 14, 0, 0, 0, 0, 0},
{ 0, 2, 10, 14, 0, 0, 0, 0},
{ 4, 10, 14, 0, 0, 0, 0, 0},
{ 0, 4, 10, 14, 0, 0, 0, 0},
{ 2, 4, 10, 14, 0, 0, 0, 0},
{ 0, 2, 4, 10, 14, 0, 0, 0},
{ 6, 10, 14, 0, 0, 0, 0, 0},
{ 0, 6, 10, 14, 0, 0, 0, 0},
{ 2, 6, 10, 14, 0, 0, 0, 0},
{ 0, 2, 6, 10, 14, 0, 0, 0},
{ 4, 6, 10, 14, 0, 0, 0, 0},
{ 0, 4, 6, 10, 14, 0, 0, 0},
{ 2, 4, 6, 10, 14, 0, 0, 0},
{ 0, 2, 4, 6, 10, 14, 0, 0},
{ 8, 10, 14, 0, 0, 0, 0, 0},
{ 0, 8, 10, 14, 0, 0, 0, 0},
{ 2, 8, 10, 14, 0, 0, 0, 0},
{ 0, 2, 8, 10, 14, 0, 0, 0},
{ 4, 8, 10, 14, 0, 0, 0, 0},
{ 0, 4, 8, 10, 14, 0, 0, 0},
{ 2, 4, 8, 10, 14, 0, 0, 0},
{ 0, 2, 4, 8, 10, 14, 0, 0},
{ 6, 8, 10, 14, 0, 0, 0, 0},
{ 0, 6, 8, 10, 14, 0, 0, 0},
{ 2, 6, 8, 10, 14, 0, 0, 0},
{ 0, 2, 6, 8, 10, 14, 0, 0},
{ 4, 6, 8, 10, 14, 0, 0, 0},
{ 0, 4, 6, 8, 10, 14, 0, 0},
{ 2, 4, 6, 8, 10, 14, 0, 0},
{ 0, 2, 4, 6, 8, 10, 14, 0},
{12, 14, 0, 0, 0, 0, 0, 0},
{ 0, 12, 14, 0, 0, 0, 0, 0},
{ 2, 12, 14, 0, 0, 0, 0, 0},
{ 0, 2, 12, 14, 0, 0, 0, 0},
{ 4, 12, 14, 0, 0, 0, 0, 0},
{ 0, 4, 12, 14, 0, 0, 0, 0},
{ 2, 4, 12, 14, 0, 0, 0, 0},
{ 0, 2, 4, 12, 14, 0, 0, 0},
{ 6, 12, 14, 0, 0, 0, 0, 0},
{ 0, 6, 12, 14, 0, 0, 0, 0},
{ 2, 6, 12, 14, 0, 0, 0, 0},
{ 0, 2, 6, 12, 14, 0, 0, 0},
{ 4, 6, 12, 14, 0, 0, 0, 0},
{ 0, 4, 6, 12, 14, 0, 0, 0},
{ 2, 4, 6, 12, 14, 0, 0, 0},
{ 0, 2, 4, 6, 12, 14, 0, 0},
{ 8, 12, 14, 0, 0, 0, 0, 0},
{ 0, 8, 12, 14, 0, 0, 0, 0},
{ 2, 8, 12, 14, 0, 0, 0, 0},
{ 0, 2, 8, 12, 14, 0, 0, 0},
{ 4, 8, 12, 14, 0, 0, 0, 0},
{ 0, 4, 8, 12, 14, 0, 0, 0},
{ 2, 4, 8, 12, 14, 0, 0, 0},
{ 0, 2, 4, 8, 12, 14, 0, 0},
{ 6, 8, 12, 14, 0, 0, 0, 0},
{ 0, 6, 8, 12, 14, 0, 0, 0},
{ 2, 6, 8, 12, 14, 0, 0, 0},
{ 0, 2, 6, 8, 12, 14, 0, 0},
{ 4, 6, 8, 12, 14, 0, 0, 0},
{ 0, 4, 6, 8, 12, 14, 0, 0},
{ 2, 4, 6, 8, 12, 14, 0, 0},
{ 0, 2, 4, 6, 8, 12, 14, 0},
{10, 12, 14, 0, 0, 0, 0, 0},
{ 0, 10, 12, 14, 0, 0, 0, 0},
{ 2, 10, 12, 14, 0, 0, 0, 0},
{ 0, 2, 10, 12, 14, 0, 0, 0},
{ 4, 10, 12, 14, 0, 0, 0, 0},
{ 0, 4, 10, 12, 14, 0, 0, 0},
{ 2, 4, 10, 12, 14, 0, 0, 0},
{ 0, 2, 4, 10, 12, 14, 0, 0},
{ 6, 10, 12, 14, 0, 0, 0, 0},
{ 0, 6, 10, 12, 14, 0, 0, 0},
{ 2, 6, 10, 12, 14, 0, 0, 0},
{ 0, 2, 6, 10, 12, 14, 0, 0},
{ 4, 6, 10, 12, 14, 0, 0, 0},
{ 0, 4, 6, 10, 12, 14, 0, 0},
{ 2, 4, 6, 10, 12, 14, 0, 0},
{ 0, 2, 4, 6, 10, 12, 14, 0},
{ 8, 10, 12, 14, 0, 0, 0, 0},
{ 0, 8, 10, 12, 14, 0, 0, 0},
{ 2, 8, 10, 12, 14, 0, 0, 0},
{ 0, 2, 8, 10, 12, 14, 0, 0},
{ 4, 8, 10, 12, 14, 0, 0, 0},
{ 0, 4, 8, 10, 12, 14, 0, 0},
{ 2, 4, 8, 10, 12, 14, 0, 0},
{ 0, 2, 4, 8, 10, 12, 14, 0},
{ 6, 8, 10, 12, 14, 0, 0, 0},
{ 0, 6, 8, 10, 12, 14, 0, 0},
{ 2, 6, 8, 10, 12, 14, 0, 0},
{ 0, 2, 6, 8, 10, 12, 14, 0},
{ 4, 6, 8, 10, 12, 14, 0, 0},
{ 0, 4, 6, 8, 10, 12, 14, 0},
{ 2, 4, 6, 8, 10, 12, 14, 0},
{ 0, 2, 4, 6, 8, 10, 12, 14}
/*
 * Shuffle-index lookup table used by the vectorized rejection sampler.
 *
 * Row m (0 <= m < 256) corresponds to the 8-bit mask m: for every set bit i
 * of m, taken in increasing order, the row holds the value 2*i; the remaining
 * slots are filled with -1 (stored as 0xFF, since the element type is
 * uint8_t). Example: row 5 (bits 0 and 2 set) is {0, 4, -1, ...}.
 *
 * The sampler loads one 8-byte row per mask byte, adds 1 to every entry and
 * interleaves the two byte vectors to form the control operand of a byte
 * shuffle.
 */
static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = {
{-1, -1, -1, -1, -1, -1, -1, -1},
{ 0, -1, -1, -1, -1, -1, -1, -1},
{ 2, -1, -1, -1, -1, -1, -1, -1},
{ 0, 2, -1, -1, -1, -1, -1, -1},
{ 4, -1, -1, -1, -1, -1, -1, -1},
{ 0, 4, -1, -1, -1, -1, -1, -1},
{ 2, 4, -1, -1, -1, -1, -1, -1},
{ 0, 2, 4, -1, -1, -1, -1, -1},
{ 6, -1, -1, -1, -1, -1, -1, -1},
{ 0, 6, -1, -1, -1, -1, -1, -1},
{ 2, 6, -1, -1, -1, -1, -1, -1},
{ 0, 2, 6, -1, -1, -1, -1, -1},
{ 4, 6, -1, -1, -1, -1, -1, -1},
{ 0, 4, 6, -1, -1, -1, -1, -1},
{ 2, 4, 6, -1, -1, -1, -1, -1},
{ 0, 2, 4, 6, -1, -1, -1, -1},
{ 8, -1, -1, -1, -1, -1, -1, -1},
{ 0, 8, -1, -1, -1, -1, -1, -1},
{ 2, 8, -1, -1, -1, -1, -1, -1},
{ 0, 2, 8, -1, -1, -1, -1, -1},
{ 4, 8, -1, -1, -1, -1, -1, -1},
{ 0, 4, 8, -1, -1, -1, -1, -1},
{ 2, 4, 8, -1, -1, -1, -1, -1},
{ 0, 2, 4, 8, -1, -1, -1, -1},
{ 6, 8, -1, -1, -1, -1, -1, -1},
{ 0, 6, 8, -1, -1, -1, -1, -1},
{ 2, 6, 8, -1, -1, -1, -1, -1},
{ 0, 2, 6, 8, -1, -1, -1, -1},
{ 4, 6, 8, -1, -1, -1, -1, -1},
{ 0, 4, 6, 8, -1, -1, -1, -1},
{ 2, 4, 6, 8, -1, -1, -1, -1},
{ 0, 2, 4, 6, 8, -1, -1, -1},
{10, -1, -1, -1, -1, -1, -1, -1},
{ 0, 10, -1, -1, -1, -1, -1, -1},
{ 2, 10, -1, -1, -1, -1, -1, -1},
{ 0, 2, 10, -1, -1, -1, -1, -1},
{ 4, 10, -1, -1, -1, -1, -1, -1},
{ 0, 4, 10, -1, -1, -1, -1, -1},
{ 2, 4, 10, -1, -1, -1, -1, -1},
{ 0, 2, 4, 10, -1, -1, -1, -1},
{ 6, 10, -1, -1, -1, -1, -1, -1},
{ 0, 6, 10, -1, -1, -1, -1, -1},
{ 2, 6, 10, -1, -1, -1, -1, -1},
{ 0, 2, 6, 10, -1, -1, -1, -1},
{ 4, 6, 10, -1, -1, -1, -1, -1},
{ 0, 4, 6, 10, -1, -1, -1, -1},
{ 2, 4, 6, 10, -1, -1, -1, -1},
{ 0, 2, 4, 6, 10, -1, -1, -1},
{ 8, 10, -1, -1, -1, -1, -1, -1},
{ 0, 8, 10, -1, -1, -1, -1, -1},
{ 2, 8, 10, -1, -1, -1, -1, -1},
{ 0, 2, 8, 10, -1, -1, -1, -1},
{ 4, 8, 10, -1, -1, -1, -1, -1},
{ 0, 4, 8, 10, -1, -1, -1, -1},
{ 2, 4, 8, 10, -1, -1, -1, -1},
{ 0, 2, 4, 8, 10, -1, -1, -1},
{ 6, 8, 10, -1, -1, -1, -1, -1},
{ 0, 6, 8, 10, -1, -1, -1, -1},
{ 2, 6, 8, 10, -1, -1, -1, -1},
{ 0, 2, 6, 8, 10, -1, -1, -1},
{ 4, 6, 8, 10, -1, -1, -1, -1},
{ 0, 4, 6, 8, 10, -1, -1, -1},
{ 2, 4, 6, 8, 10, -1, -1, -1},
{ 0, 2, 4, 6, 8, 10, -1, -1},
{12, -1, -1, -1, -1, -1, -1, -1},
{ 0, 12, -1, -1, -1, -1, -1, -1},
{ 2, 12, -1, -1, -1, -1, -1, -1},
{ 0, 2, 12, -1, -1, -1, -1, -1},
{ 4, 12, -1, -1, -1, -1, -1, -1},
{ 0, 4, 12, -1, -1, -1, -1, -1},
{ 2, 4, 12, -1, -1, -1, -1, -1},
{ 0, 2, 4, 12, -1, -1, -1, -1},
{ 6, 12, -1, -1, -1, -1, -1, -1},
{ 0, 6, 12, -1, -1, -1, -1, -1},
{ 2, 6, 12, -1, -1, -1, -1, -1},
{ 0, 2, 6, 12, -1, -1, -1, -1},
{ 4, 6, 12, -1, -1, -1, -1, -1},
{ 0, 4, 6, 12, -1, -1, -1, -1},
{ 2, 4, 6, 12, -1, -1, -1, -1},
{ 0, 2, 4, 6, 12, -1, -1, -1},
{ 8, 12, -1, -1, -1, -1, -1, -1},
{ 0, 8, 12, -1, -1, -1, -1, -1},
{ 2, 8, 12, -1, -1, -1, -1, -1},
{ 0, 2, 8, 12, -1, -1, -1, -1},
{ 4, 8, 12, -1, -1, -1, -1, -1},
{ 0, 4, 8, 12, -1, -1, -1, -1},
{ 2, 4, 8, 12, -1, -1, -1, -1},
{ 0, 2, 4, 8, 12, -1, -1, -1},
{ 6, 8, 12, -1, -1, -1, -1, -1},
{ 0, 6, 8, 12, -1, -1, -1, -1},
{ 2, 6, 8, 12, -1, -1, -1, -1},
{ 0, 2, 6, 8, 12, -1, -1, -1},
{ 4, 6, 8, 12, -1, -1, -1, -1},
{ 0, 4, 6, 8, 12, -1, -1, -1},
{ 2, 4, 6, 8, 12, -1, -1, -1},
{ 0, 2, 4, 6, 8, 12, -1, -1},
{10, 12, -1, -1, -1, -1, -1, -1},
{ 0, 10, 12, -1, -1, -1, -1, -1},
{ 2, 10, 12, -1, -1, -1, -1, -1},
{ 0, 2, 10, 12, -1, -1, -1, -1},
{ 4, 10, 12, -1, -1, -1, -1, -1},
{ 0, 4, 10, 12, -1, -1, -1, -1},
{ 2, 4, 10, 12, -1, -1, -1, -1},
{ 0, 2, 4, 10, 12, -1, -1, -1},
{ 6, 10, 12, -1, -1, -1, -1, -1},
{ 0, 6, 10, 12, -1, -1, -1, -1},
{ 2, 6, 10, 12, -1, -1, -1, -1},
{ 0, 2, 6, 10, 12, -1, -1, -1},
{ 4, 6, 10, 12, -1, -1, -1, -1},
{ 0, 4, 6, 10, 12, -1, -1, -1},
{ 2, 4, 6, 10, 12, -1, -1, -1},
{ 0, 2, 4, 6, 10, 12, -1, -1},
{ 8, 10, 12, -1, -1, -1, -1, -1},
{ 0, 8, 10, 12, -1, -1, -1, -1},
{ 2, 8, 10, 12, -1, -1, -1, -1},
{ 0, 2, 8, 10, 12, -1, -1, -1},
{ 4, 8, 10, 12, -1, -1, -1, -1},
{ 0, 4, 8, 10, 12, -1, -1, -1},
{ 2, 4, 8, 10, 12, -1, -1, -1},
{ 0, 2, 4, 8, 10, 12, -1, -1},
{ 6, 8, 10, 12, -1, -1, -1, -1},
{ 0, 6, 8, 10, 12, -1, -1, -1},
{ 2, 6, 8, 10, 12, -1, -1, -1},
{ 0, 2, 6, 8, 10, 12, -1, -1},
{ 4, 6, 8, 10, 12, -1, -1, -1},
{ 0, 4, 6, 8, 10, 12, -1, -1},
{ 2, 4, 6, 8, 10, 12, -1, -1},
{ 0, 2, 4, 6, 8, 10, 12, -1},
{14, -1, -1, -1, -1, -1, -1, -1},
{ 0, 14, -1, -1, -1, -1, -1, -1},
{ 2, 14, -1, -1, -1, -1, -1, -1},
{ 0, 2, 14, -1, -1, -1, -1, -1},
{ 4, 14, -1, -1, -1, -1, -1, -1},
{ 0, 4, 14, -1, -1, -1, -1, -1},
{ 2, 4, 14, -1, -1, -1, -1, -1},
{ 0, 2, 4, 14, -1, -1, -1, -1},
{ 6, 14, -1, -1, -1, -1, -1, -1},
{ 0, 6, 14, -1, -1, -1, -1, -1},
{ 2, 6, 14, -1, -1, -1, -1, -1},
{ 0, 2, 6, 14, -1, -1, -1, -1},
{ 4, 6, 14, -1, -1, -1, -1, -1},
{ 0, 4, 6, 14, -1, -1, -1, -1},
{ 2, 4, 6, 14, -1, -1, -1, -1},
{ 0, 2, 4, 6, 14, -1, -1, -1},
{ 8, 14, -1, -1, -1, -1, -1, -1},
{ 0, 8, 14, -1, -1, -1, -1, -1},
{ 2, 8, 14, -1, -1, -1, -1, -1},
{ 0, 2, 8, 14, -1, -1, -1, -1},
{ 4, 8, 14, -1, -1, -1, -1, -1},
{ 0, 4, 8, 14, -1, -1, -1, -1},
{ 2, 4, 8, 14, -1, -1, -1, -1},
{ 0, 2, 4, 8, 14, -1, -1, -1},
{ 6, 8, 14, -1, -1, -1, -1, -1},
{ 0, 6, 8, 14, -1, -1, -1, -1},
{ 2, 6, 8, 14, -1, -1, -1, -1},
{ 0, 2, 6, 8, 14, -1, -1, -1},
{ 4, 6, 8, 14, -1, -1, -1, -1},
{ 0, 4, 6, 8, 14, -1, -1, -1},
{ 2, 4, 6, 8, 14, -1, -1, -1},
{ 0, 2, 4, 6, 8, 14, -1, -1},
{10, 14, -1, -1, -1, -1, -1, -1},
{ 0, 10, 14, -1, -1, -1, -1, -1},
{ 2, 10, 14, -1, -1, -1, -1, -1},
{ 0, 2, 10, 14, -1, -1, -1, -1},
{ 4, 10, 14, -1, -1, -1, -1, -1},
{ 0, 4, 10, 14, -1, -1, -1, -1},
{ 2, 4, 10, 14, -1, -1, -1, -1},
{ 0, 2, 4, 10, 14, -1, -1, -1},
{ 6, 10, 14, -1, -1, -1, -1, -1},
{ 0, 6, 10, 14, -1, -1, -1, -1},
{ 2, 6, 10, 14, -1, -1, -1, -1},
{ 0, 2, 6, 10, 14, -1, -1, -1},
{ 4, 6, 10, 14, -1, -1, -1, -1},
{ 0, 4, 6, 10, 14, -1, -1, -1},
{ 2, 4, 6, 10, 14, -1, -1, -1},
{ 0, 2, 4, 6, 10, 14, -1, -1},
{ 8, 10, 14, -1, -1, -1, -1, -1},
{ 0, 8, 10, 14, -1, -1, -1, -1},
{ 2, 8, 10, 14, -1, -1, -1, -1},
{ 0, 2, 8, 10, 14, -1, -1, -1},
{ 4, 8, 10, 14, -1, -1, -1, -1},
{ 0, 4, 8, 10, 14, -1, -1, -1},
{ 2, 4, 8, 10, 14, -1, -1, -1},
{ 0, 2, 4, 8, 10, 14, -1, -1},
{ 6, 8, 10, 14, -1, -1, -1, -1},
{ 0, 6, 8, 10, 14, -1, -1, -1},
{ 2, 6, 8, 10, 14, -1, -1, -1},
{ 0, 2, 6, 8, 10, 14, -1, -1},
{ 4, 6, 8, 10, 14, -1, -1, -1},
{ 0, 4, 6, 8, 10, 14, -1, -1},
{ 2, 4, 6, 8, 10, 14, -1, -1},
{ 0, 2, 4, 6, 8, 10, 14, -1},
{12, 14, -1, -1, -1, -1, -1, -1},
{ 0, 12, 14, -1, -1, -1, -1, -1},
{ 2, 12, 14, -1, -1, -1, -1, -1},
{ 0, 2, 12, 14, -1, -1, -1, -1},
{ 4, 12, 14, -1, -1, -1, -1, -1},
{ 0, 4, 12, 14, -1, -1, -1, -1},
{ 2, 4, 12, 14, -1, -1, -1, -1},
{ 0, 2, 4, 12, 14, -1, -1, -1},
{ 6, 12, 14, -1, -1, -1, -1, -1},
{ 0, 6, 12, 14, -1, -1, -1, -1},
{ 2, 6, 12, 14, -1, -1, -1, -1},
{ 0, 2, 6, 12, 14, -1, -1, -1},
{ 4, 6, 12, 14, -1, -1, -1, -1},
{ 0, 4, 6, 12, 14, -1, -1, -1},
{ 2, 4, 6, 12, 14, -1, -1, -1},
{ 0, 2, 4, 6, 12, 14, -1, -1},
{ 8, 12, 14, -1, -1, -1, -1, -1},
{ 0, 8, 12, 14, -1, -1, -1, -1},
{ 2, 8, 12, 14, -1, -1, -1, -1},
{ 0, 2, 8, 12, 14, -1, -1, -1},
{ 4, 8, 12, 14, -1, -1, -1, -1},
{ 0, 4, 8, 12, 14, -1, -1, -1},
{ 2, 4, 8, 12, 14, -1, -1, -1},
{ 0, 2, 4, 8, 12, 14, -1, -1},
{ 6, 8, 12, 14, -1, -1, -1, -1},
{ 0, 6, 8, 12, 14, -1, -1, -1},
{ 2, 6, 8, 12, 14, -1, -1, -1},
{ 0, 2, 6, 8, 12, 14, -1, -1},
{ 4, 6, 8, 12, 14, -1, -1, -1},
{ 0, 4, 6, 8, 12, 14, -1, -1},
{ 2, 4, 6, 8, 12, 14, -1, -1},
{ 0, 2, 4, 6, 8, 12, 14, -1},
{10, 12, 14, -1, -1, -1, -1, -1},
{ 0, 10, 12, 14, -1, -1, -1, -1},
{ 2, 10, 12, 14, -1, -1, -1, -1},
{ 0, 2, 10, 12, 14, -1, -1, -1},
{ 4, 10, 12, 14, -1, -1, -1, -1},
{ 0, 4, 10, 12, 14, -1, -1, -1},
{ 2, 4, 10, 12, 14, -1, -1, -1},
{ 0, 2, 4, 10, 12, 14, -1, -1},
{ 6, 10, 12, 14, -1, -1, -1, -1},
{ 0, 6, 10, 12, 14, -1, -1, -1},
{ 2, 6, 10, 12, 14, -1, -1, -1},
{ 0, 2, 6, 10, 12, 14, -1, -1},
{ 4, 6, 10, 12, 14, -1, -1, -1},
{ 0, 4, 6, 10, 12, 14, -1, -1},
{ 2, 4, 6, 10, 12, 14, -1, -1},
{ 0, 2, 4, 6, 10, 12, 14, -1},
{ 8, 10, 12, 14, -1, -1, -1, -1},
{ 0, 8, 10, 12, 14, -1, -1, -1},
{ 2, 8, 10, 12, 14, -1, -1, -1},
{ 0, 2, 8, 10, 12, 14, -1, -1},
{ 4, 8, 10, 12, 14, -1, -1, -1},
{ 0, 4, 8, 10, 12, 14, -1, -1},
{ 2, 4, 8, 10, 12, 14, -1, -1},
{ 0, 2, 4, 8, 10, 12, 14, -1},
{ 6, 8, 10, 12, 14, -1, -1, -1},
{ 0, 6, 8, 10, 12, 14, -1, -1},
{ 2, 6, 8, 10, 12, 14, -1, -1},
{ 0, 2, 6, 8, 10, 12, 14, -1},
{ 4, 6, 8, 10, 12, 14, -1, -1},
{ 0, 4, 6, 8, 10, 12, 14, -1},
{ 2, 4, 6, 8, 10, 12, 14, -1},
{ 0, 2, 4, 6, 8, 10, 12, 14}
}
};

#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)

size_t PQCLEAN_KYBER1024_AVX2_rej_uniform(int16_t *r,
size_t len,
const uint8_t *buf,
size_t buflen) {
size_t ctr, pos;
uint16_t val;
uint32_t good0, good1, good2;
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison
#define REJ_UNIFORM_BUFLEN 672
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r,
const uint8_t *restrict buf) {
unsigned int ctr = 0, pos = 0;
uint16_t val = 0;
uint32_t good = 0;
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1));
const __m256i ones = _mm256_set1_epi8(1);
const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_16xq.as_vec);
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_16xv.as_vec);
__m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2;
__m128i d, tmp, pilo, pihi;
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XQ]);
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XV]);
__m256i f0, f1, g0, g1, g2, g3;
__m128i f, t, pilo, pihi;

ctr = pos = 0;
while (ctr + 48 <= len && pos + 96 <= buflen) {
d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]);
d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]);
d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]);
ctr = 0;
for (pos = 0; pos < 2 * KYBER_N; pos += 64) {
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]);
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]);

tmp0 = _mm256_cmpge_epu16(bound, d0);
tmp1 = _mm256_cmpge_epu16(bound, d1);
tmp2 = _mm256_cmpge_epu16(bound, d2);
good0 = (uint32_t)_mm256_movemask_epi8(tmp0);
good1 = (uint32_t)_mm256_movemask_epi8(tmp1);
good2 = (uint32_t)_mm256_movemask_epi8(tmp2);
good0 = _pext_u32(good0, 0x55555555);
good1 = _pext_u32(good1, 0x55555555);
good2 = _pext_u32(good2, 0x55555555);
g0 = _mm256_cmpge_epu16(bound, f0);
g1 = _mm256_cmpge_epu16(bound, f1);

pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]);
pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]);
pi0 = _mm256_castsi128_si256(pilo);
pi0 = _mm256_inserti128_si256(pi0, pihi, 1);
g0 = _mm256_packs_epi16(g0, g1);
good = _mm256_movemask_epi8(g0);

pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]);
pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]);
pi1 = _mm256_castsi128_si256(pilo);
pi1 = _mm256_inserti128_si256(pi1, pihi, 1);
g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF]));
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF]));
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1);
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1);

pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]);
pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]);
pi2 = _mm256_castsi128_si256(pilo);
pi2 = _mm256_inserti128_si256(pi2, pihi, 1);
//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good));
//g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8);

tmp0 = _mm256_add_epi8(pi0, ones);
tmp1 = _mm256_add_epi8(pi1, ones);
tmp2 = _mm256_add_epi8(pi2, ones);
pi0 = _mm256_unpacklo_epi8(pi0, tmp0);
pi1 = _mm256_unpacklo_epi8(pi1, tmp1);
pi2 = _mm256_unpacklo_epi8(pi2, tmp2);
/* Barrett reduction of (still unsigned) values */
g2 = _mm256_mulhi_epu16(f0, v);
g3 = _mm256_mulhi_epu16(f1, v);
g2 = _mm256_srli_epi16(g2, 10);
g3 = _mm256_srli_epi16(g3, 10);
g2 = _mm256_mullo_epi16(g2, kyberq);
g3 = _mm256_mullo_epi16(g3, kyberq);
f0 = _mm256_sub_epi16(f0, g2);
f1 = _mm256_sub_epi16(f1, g3);

d0 = _mm256_shuffle_epi8(d0, pi0);
d1 = _mm256_shuffle_epi8(d1, pi1);
d2 = _mm256_shuffle_epi8(d2, pi2);
g2 = _mm256_add_epi8(g0, ones);
g3 = _mm256_add_epi8(g1, ones);
g0 = _mm256_unpacklo_epi8(g0, g2);
g1 = _mm256_unpacklo_epi8(g1, g3);

/* Barrett reduction of (still unsigned) d values */
tmp0 = _mm256_mulhi_epu16(d0, v);
tmp1 = _mm256_mulhi_epu16(d1, v);
tmp2 = _mm256_mulhi_epu16(d2, v);
tmp0 = _mm256_srli_epi16(tmp0, 10);
tmp1 = _mm256_srli_epi16(tmp1, 10);
tmp2 = _mm256_srli_epi16(tmp2, 10);
tmp0 = _mm256_mullo_epi16(tmp0, kyberq);
tmp1 = _mm256_mullo_epi16(tmp1, kyberq);
tmp2 = _mm256_mullo_epi16(tmp2, kyberq);
d0 = _mm256_sub_epi16(d0, tmp0);
d1 = _mm256_sub_epi16(d1, tmp1);
d2 = _mm256_sub_epi16(d2, tmp2);
f0 = _mm256_shuffle_epi8(f0, g0);
f1 = _mm256_shuffle_epi8(f1, g1);

_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0));
ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1));
ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1));
ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1));
ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2));
ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1));
ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF);
pos += 96;
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
}

while (ctr + 8 <= len && pos + 16 <= buflen) {
d = _mm_loadu_si128((__m128i *)&buf[pos]);
tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d);
good0 = (uint32_t)_mm_movemask_epi8(tmp);
good0 = _pext_u32(good0, 0x55555555);
pilo = _mm_loadl_epi64((__m128i *)&idx[good0]);
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) {
f = _mm_load_si128((__m128i *)&buf[pos]);
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f);
good = _mm_movemask_epi8(t);
good = _pext_u32(good, 0x5555);
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]);
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
pilo = _mm_unpacklo_epi8(pilo, pihi);
d = _mm_shuffle_epi8(d, pilo);

/* Barrett reduction */
tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v));
tmp = _mm_srli_epi16(tmp, 10);
tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq));
d = _mm_sub_epi16(d, tmp);
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v));
t = _mm_srli_epi16(t, 10);
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq));
f = _mm_sub_epi16(f, t);

_mm_storeu_si128((__m128i *)&r[ctr], d);
ctr += (unsigned int)_mm_popcnt_u32(good0);
f = _mm_shuffle_epi8(f, pilo);
_mm_storeu_si128((__m128i *)&r[ctr], f);
ctr += _mm_popcnt_u32(good);
pos += 16;
}

while (ctr < len && pos + 2 <= buflen) {
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;

if (val < 19 * KYBER_Q) {
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q;
r[ctr++] = (int16_t)val;
r[ctr++] = val;
}
}



+ 4
- 5
crypto_kem/kyber1024/avx2/rejsample.h View File

@@ -1,12 +1,11 @@
#ifndef REJSAMPLE_H
#define REJSAMPLE_H

#include <stddef.h>
#include "params.h"
#include <stdint.h>

size_t PQCLEAN_KYBER1024_AVX2_rej_uniform(int16_t *r,
size_t len,
const uint8_t *buf,
size_t buflen);

unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r,
const unsigned char *buf);

#endif

crypto_kem/kyber512-90s/avx2/shuffle.s → crypto_kem/kyber1024/avx2/shuffle.S View File

@@ -1,12 +1,9 @@
#include "cdecl.inc"
.include "fq.inc"
.include "shuffle.inc"

.global PQCLEAN_KYBER51290S_AVX2_nttunpack_avx
PQCLEAN_KYBER51290S_AVX2_nttunpack_avx:
#consts
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xv(%rip),%ymm1

/*
nttpack_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
@@ -17,18 +14,51 @@ vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

/*
#reduce
red16 4 12
red16 5 13
red16 6 14
red16 7 15
red16 8 12
red16 9 13
red16 10 14
red16 11 15
shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11

shuffle2 3,4,10,4
shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11

shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11

shuffle8 9,10,7,10
shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11

#store
vmovdqa %ymm7,(%rdi)
vmovdqa %ymm9,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm3,96(%rdi)
vmovdqa %ymm10,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm5,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret
*/

.text
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
@@ -61,11 +91,14 @@ vmovdqa %ymm11,224(%rdi)

ret

.global PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx
PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx:
#consts
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0
.global cdecl(PQCLEAN_KYBER1024_AVX2_nttunpack_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_nttunpack_avx):
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
ret

ntttobytes128_avx:
#load
vmovdqa (%rsi),%ymm5
vmovdqa 32(%rsi),%ymm6
@@ -77,14 +110,14 @@ vmovdqa 192(%rsi),%ymm11
vmovdqa 224(%rsi),%ymm12

#csubq
csubq 5 13
csubq 6 14
csubq 7 15
csubq 8 1
csubq 9 13
csubq 10 14
csubq 11 15
csubq 12 1
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,1
csubq 9,13
csubq 10,14
csubq 11,15
csubq 12,1

#bitpack
vpsllw $12,%ymm6,%ymm4
@@ -135,11 +168,17 @@ vmovdqu %ymm9,160(%rdi)

ret

.global PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx
PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx:
.global cdecl(PQCLEAN_KYBER1024_AVX2_ntttobytes_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_ntttobytes_avx):
#consts
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xmask(%rip),%ymm0
vmovdqa _16XQ*2(%rdx),%ymm0
call ntttobytes128_avx
add $256,%rsi
add $192,%rdi
call ntttobytes128_avx
ret

nttfrombytes128_avx:
#load
vmovdqu (%rsi),%ymm4
vmovdqu 32(%rsi),%ymm5
@@ -204,3 +243,13 @@ vmovdqa %ymm15,192(%rdi)
vmovdqa %ymm1,224(%rdi)

ret

.global cdecl(PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx):
#consts
vmovdqa _16XMASK*2(%rdx),%ymm0
call nttfrombytes128_avx
add $256,%rdi
add $192,%rsi
call nttfrombytes128_avx
ret

+ 2
- 0
crypto_kem/kyber1024/avx2/shuffle.inc View File

@@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle2 r0,r1,r2,r3
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2


+ 0
- 63
crypto_kem/kyber1024/avx2/symmetric-fips202.c View File

@@ -1,63 +0,0 @@
#include "fips202.h"
#include "symmetric.h"

#include <stdlib.h>
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb
*
* Description: Builds the KYBER_SYMBYTES + 2 byte string input || x || y
* and passes it to shake128_absorb together with s.
*
* Arguments: - keccak_state *s: state handed to shake128_absorb
* - const uint8_t *input: pointer to KYBER_SYMBYTES bytes of input
* - uint8_t x: first additional byte appended to input
* - uint8_t y: second additional byte appended to input
**************************************************/
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) {
    uint8_t extseed[KYBER_SYMBYTES + 2];
    size_t k = 0;

    /* extseed = input || x || y */
    while (k < KYBER_SYMBYTES) {
        extseed[k] = input[k];
        k++;
    }
    extseed[KYBER_SYMBYTES] = x;
    extseed[KYBER_SYMBYTES + 1] = y;

    shake128_absorb(s, extseed, sizeof(extseed));
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks
*
* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each.
* Modifies the state. Can be called multiple times to keep squeezing,
* i.e., is incremental.
* Thin wrapper: forwards all three arguments unchanged to
* shake128_squeezeblocks.
*
* Arguments: - uint8_t *output: pointer to output blocks
* - size_t nblocks: number of blocks to be squeezed (written to output)
* - keccak_state *s: pointer to in/output Keccak state
**************************************************/
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) {
shake128_squeezeblocks(output, nblocks, s);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_shake256_prf
*
* Description: Uses SHAKE256 as a PRF: hashes the KYBER_SYMBYTES + 1 byte
* string key || nonce and writes outlen bytes of output.
*
* Arguments: - uint8_t *output: pointer to output
* - size_t outlen: number of requested output bytes
* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
* - uint8_t nonce: single-byte nonce (public PRF input)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) {
    uint8_t extkey[KYBER_SYMBYTES + 1];
    size_t k = 0;

    /* extkey = key || nonce */
    while (k < KYBER_SYMBYTES) {
        extkey[k] = key[k];
        k++;
    }
    extkey[KYBER_SYMBYTES] = nonce;

    shake256(output, outlen, extkey, sizeof(extkey));
}

+ 60
- 0
crypto_kem/kyber1024/avx2/symmetric-shake.c View File

@@ -0,0 +1,60 @@
#include "fips202.h"
#include "params.h"
#include "symmetric.h"
#include <stddef.h>
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb
*
* Description: Absorb step of the SHAKE128 specialized for the Kyber context:
* builds seed || x || y (KYBER_SYMBYTES + 2 bytes) and passes
* it to shake128_absorb together with state.
*
* Arguments: - xof_state *state: state handed to shake128_absorb
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input
* to be absorbed into state
* - uint8_t x: first additional byte of input
* - uint8_t y: second additional byte of input
**************************************************/
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state,
        const uint8_t seed[KYBER_SYMBYTES],
        uint8_t x,
        uint8_t y) {
    uint8_t extseed[KYBER_SYMBYTES + 2];
    size_t k = 0;

    /* extseed = seed || x || y */
    while (k < KYBER_SYMBYTES) {
        extseed[k] = seed[k];
        k++;
    }
    extseed[KYBER_SYMBYTES] = x;
    extseed[KYBER_SYMBYTES + 1] = y;

    shake128_absorb(state, extseed, KYBER_SYMBYTES + 2);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf
*
* Description: Uses SHAKE256 as a PRF: hashes the KYBER_SYMBYTES + 1 byte
* string key || nonce and writes outlen bytes of output.
*
* Arguments: - uint8_t *out: pointer to output
* - size_t outlen: number of requested output bytes
* - const uint8_t *key: pointer to the key
* (of length KYBER_SYMBYTES)
* - uint8_t nonce: single-byte nonce (public PRF input)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out,
        size_t outlen,
        const uint8_t key[KYBER_SYMBYTES],
        uint8_t nonce) {
    uint8_t extkey[KYBER_SYMBYTES + 1];
    size_t k = 0;

    /* extkey = key || nonce */
    while (k < KYBER_SYMBYTES) {
        extkey[k] = key[k];
        k++;
    }
    extkey[KYBER_SYMBYTES] = nonce;

    shake256(out, outlen, extkey, KYBER_SYMBYTES + 1);
}

+ 19
- 11
crypto_kem/kyber1024/avx2/symmetric.h View File

@@ -2,28 +2,36 @@
#define SYMMETRIC_H

#include "params.h"
#include <stddef.h>
#include <stdint.h>


#include "fips202.h"
#include "fips202x4.h"

typedef shake128ctx keccak_state;
typedef shake128ctx xof_state;

void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y);
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s);
void PQCLEAN_KYBER1024_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce);
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(shake128ctx *s,
const uint8_t seed[KYBER_SYMBYTES],
uint8_t x,
uint8_t y);

void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out,
size_t outlen,
const uint8_t key[KYBER_SYMBYTES],
uint8_t nonce);

#define XOF_BLOCKBYTES SHAKE128_RATE

#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, IN, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE) shake128_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_AVX2_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
#define prf(OUT, OUTBYTES, KEY, NONCE) \
PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES)

#define XOF_BLOCKBYTES SHAKE128_RATE

typedef keccak_state xof_state;


#endif /* SYMMETRIC_H */

+ 20
- 21
crypto_kem/kyber1024/avx2/verify.c View File

@@ -1,23 +1,22 @@
#include "verify.h"

#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>

/*************************************************
* Name: verify
* Name: PQCLEAN_KYBER1024_AVX2_verify
*
* Description: Compare two arrays for equality in constant time.
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
* Arguments: const unsigned char *a: pointer to first byte array
* const unsigned char *b: pointer to second byte array
* size_t len: length of the byte arrays
*
* Returns 0 if the byte arrays are equal, 1 otherwise
**************************************************/
uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t pos;
uint64_t r;
int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t pos = 0;
uint64_t r = 0;
__m256i avec, bvec, cvec;

cvec = _mm256_setzero_si256();
@@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t
avec = _mm256_xor_si256(avec, bvec);
cvec = _mm256_or_si256(cvec, avec);
}
r = !_mm256_testz_si256(cvec, cvec);

cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256());
r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1);

while (pos < len) {
r |= a[pos] ^ b[pos];
pos += 1;
if (pos < len) {
avec = _mm256_loadu_si256((__m256i *)&a[pos]);
bvec = _mm256_loadu_si256((__m256i *)&b[pos]);
cvec = _mm256_cmpeq_epi8(avec, bvec);
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len));
}

r = (-r) >> 63;
return (uint8_t)r;
return r;
}

/*************************************************
* Name: cmov
* Name: PQCLEAN_KYBER1024_AVX2_cmov
*
* Description: Copy len bytes from x to r if b is 1;
* don't modify r if b is 0. Requires b to be in {0,1};
* assumes two's complement representation of negative integers.
* Runs in constant time.
*
* Arguments: uint8_t *r: pointer to output byte array
* const uint8_t *x: pointer to input byte array
* Arguments: unsigned char *r: pointer to output byte array
* const unsigned char *x: pointer to input byte array
* size_t len: Amount of bytes to be copied
* uint8_t b: Condition bit; has to be in {0,1}
* unsigned char b: Condition bit; has to be in {0,1}
**************************************************/
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
size_t pos;
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) {
size_t pos = 0;
__m256i xvec, rvec, bvec;

b = -b;
bvec = _mm256_set1_epi8((char)b);
bvec = _mm256_set1_epi8(b);

for (pos = 0; pos + 32 <= len; pos += 32) {
rvec = _mm256_loadu_si256((__m256i *)&r[pos]);


+ 6
- 3
crypto_kem/kyber1024/avx2/verify.h View File

@@ -1,10 +1,13 @@
#ifndef VERIFY_H
#define VERIFY_H
#ifndef PQCLEAN_KYBER1024_AVX2_VERIFY_H
#define PQCLEAN_KYBER1024_AVX2_VERIFY_H

#include "params.h"
#include <stddef.h>
#include <stdint.h>

uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);

int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);


void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);



+ 1
- 11
crypto_kem/kyber1024/clean/LICENSE View File

@@ -1,14 +1,4 @@
kyber-20170627
Public Domain
Authors: Joppe Bos,
Léo Ducas,
Eike Kiltz ,
Tancrède Lepoint,
Vadim Lyubashevsky,
John Schanck,
Peter Schwabe,
Gregor Seiler,
Damien Stehlé
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and AES we are using public-domain
code from sources and by authors listed in


+ 2
- 2
crypto_kem/kyber1024/clean/Makefile View File

@@ -1,8 +1,8 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libkyber1024_clean.a
HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-fips202.o
HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-shake.o

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)



+ 1
- 1
crypto_kem/kyber1024/clean/Makefile.Microsoft_nmake View File

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libkyber1024_clean.lib
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-fips202.obj
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-shake.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as


+ 13
- 15
crypto_kem/kyber1024/clean/cbd.c View File

@@ -1,7 +1,5 @@
#include "cbd.h"
#include "params.h"

#include <stddef.h>
#include "cbd.h"
#include <stdint.h>

/*************************************************
@@ -14,8 +12,8 @@
*
* Returns 32-bit unsigned integer loaded from x
**************************************************/
static uint32_t load32_littleendian(const uint8_t *x) {
uint32_t r;
static uint32_t load32_littleendian(const uint8_t x[4]) {
uint32_t r = 0;
r = (uint32_t)x[0];
r |= (uint32_t)x[1] << 8;
r |= (uint32_t)x[2] << 16;
@@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) {
}

/*************************************************
* Name: cbd
* Name: PQCLEAN_KYBER1024_CLEAN_cbd
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* specialized for KYBER_ETA=2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t *buf) {
uint32_t d, t;
int16_t a, b;
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) {
unsigned int i = 0, j = 0;
uint32_t t = 0, d = 0;
int16_t a = 0, b = 0;

for (size_t i = 0; i < KYBER_N / 8; i++) {
t = load32_littleendian(buf + 4 * i);
for (i = 0; i < KYBER_N / 8; i++) {
t = load32_littleendian(buf + 4 * i);
d = t & 0x55555555;
d += (t >> 1) & 0x55555555;

for (size_t j = 0; j < 8; j++) {
a = (d >> 4 * j) & 0x3;
for (j = 0; j < 8; j++) {
a = (d >> (4 * j + 0)) & 0x3;
b = (d >> (4 * j + 2)) & 0x3;
r->coeffs[8 * i + j] = a - b;
}


+ 6
- 3
crypto_kem/kyber1024/clean/cbd.h View File

@@ -1,8 +1,11 @@
#ifndef CBD_H
#define CBD_H
#ifndef PQCLEAN_KYBER1024_CLEAN_CBD_H
#define PQCLEAN_KYBER1024_CLEAN_CBD_H

#include "params.h"
#include "poly.h"
#include <stdint.h>

void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t *buf);

void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);

#endif

+ 118
- 83
crypto_kem/kyber1024/clean/indcpa.c View File

@@ -5,7 +5,7 @@
#include "polyvec.h"
#include "randombytes.h"
#include "symmetric.h"
#include <stddef.h>
#include <stdint.h>

/*************************************************
@@ -16,12 +16,15 @@
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* const poly *pk: pointer to the input public-key polynomial
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
polyvec *pk,
const uint8_t seed[KYBER_SYMBYTES]) {
size_t i = 0;
PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(r, pk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
r[i + KYBER_POLYVECBYTES] = seed[i];
}
}
@@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) {
* Description: De-serialize public key from a byte array;
* approximate inverse of pack_pk
*
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials
* - uint8_t *seed: pointer to output seed to generate matrix A
* Arguments: - polyvec *pk: pointer to output public-key
* polynomial vector
* - uint8_t *seed: pointer to output seed to generate
* matrix A
* - const uint8_t *packedpk: pointer to input serialized public key
**************************************************/
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
static void unpack_pk(polyvec *pk,
uint8_t seed[KYBER_SYMBYTES],
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) {
size_t i = 0;
PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(pk, packedpk);
for (size_t i = 0; i < KYBER_SYMBYTES; i++) {
for (i = 0; i < KYBER_SYMBYTES; i++) {
seed[i] = packedpk[i + KYBER_POLYVECBYTES];
}
}
@@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) {
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - const polyvec *sk: pointer to input vector of polynomials (secret key)
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t *r, polyvec *sk) {
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(r, sk);
}

@@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) {
* Description: De-serialize the secret key;
* inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* Arguments: - polyvec *sk: pointer to output vector of
* polynomials (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(sk, packedsk);
}

@@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) {
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* const poly *pk: pointer to the input vector of polynomials b
* const uint8_t *seed: pointer to the input polynomial v
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* polyvec *b: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
PQCLEAN_KYBER1024_CLEAN_polyvec_compress(r, b);
PQCLEAN_KYBER1024_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) {
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) {
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(b, c);
PQCLEAN_KYBER1024_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}
@@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) {
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - size_t len: requested number of 16-bit integers (uniform mod q)
* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes)
* - size_t buflen: length of input buffer in bytes
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
**************************************************/
static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) {
size_t ctr, pos;
uint16_t val;
static unsigned int rej_uniform(int16_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr = 0, pos = 0;
uint16_t val = 0;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8));
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;

if (val < 19 * KYBER_Q) {
val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction
val -= (val >> 12) * KYBER_Q; // Barrett reduction
r[ctr++] = (int16_t)val;
}
}
@@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf
return ctr;
}

#define gen_a(A,B) gen_matrix(A,B,0)
#define gen_at(A,B) gen_matrix(A,B,1)
#define gen_a(A,B) PQCLEAN_KYBER1024_CLEAN_gen_matrix(A,B,0)
#define gen_at(A,B) PQCLEAN_KYBER1024_CLEAN_gen_matrix(A,B,1)

/*************************************************
* Name: gen_matrix
* Name: PQCLEAN_KYBER1024_CLEAN_gen_matrix
*
* Description: Deterministically generate matrix A (or the transpose of A)
* from a seed. Entries of the matrix are polynomials that look
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - polyvec *a: pointer to output matrix A
* Arguments: - polyvec *a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
* - int transposed: boolean deciding whether A or A^T
* is generated
**************************************************/
#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
size_t ctr;
uint8_t i, j;
uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1];
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
// Not static for benchmarking
void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) {
unsigned int ctr = 0, i = 0, j = 0;
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES];
xof_state state;

for (i = 0; i < KYBER_K; i++) {
@@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
xof_absorb(&state, seed, j, i);
}

xof_squeezeblocks(buf, MAXNBLOCKS, &state);
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES);
xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf));

while (ctr < KYBER_N) {
xof_squeezeblocks(buf, 1, &state);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf,
XOF_BLOCKBYTES);
}
xof_ctx_release(&state);
}
@@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) {
}

/*************************************************
* Name: indcpa_keypair
* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_keypair
*
* Description: Generates public and private key for the CPA-secure
* public-key encryption scheme underlying Kyber
*
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
* Arguments: - uint8_t *pk: pointer to output public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
polyvec a[KYBER_K], e, pkpv, skpv;
void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
unsigned int i = 0;
uint8_t buf[2 * KYBER_SYMBYTES];
uint8_t *publicseed = buf;
uint8_t *noiseseed = buf + KYBER_SYMBYTES;
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
uint8_t nonce = 0;
polyvec a[KYBER_K], e, pkpv, skpv;

randombytes(buf, KYBER_SYMBYTES);
hash_g(buf, buf, KYBER_SYMBYTES);

gen_a(a, publicseed);

for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++);
}
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++);
}

PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&skpv);
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&e);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_CLEAN_poly_frommont(&pkpv.vec[i]);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_CLEAN_poly_tomont(&pkpv.vec[i]);
}

PQCLEAN_KYBER1024_CLEAN_polyvec_add(&pkpv, &pkpv, &e);
@@ -217,34 +243,40 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) {
}

/*************************************************
* Name: indcpa_enc
* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_enc
*
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
* to deterministically generate all randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c,
const uint8_t *m,
const uint8_t *pk,
const uint8_t *coins) {
polyvec sp, pkpv, ep, at[KYBER_K], bp;
poly v, k, epp;
void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[KYBER_SYMBYTES]) {
unsigned int i = 0;
uint8_t seed[KYBER_SYMBYTES];
uint8_t nonce = 0;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
poly v, k, epp;

unpack_pk(&pkpv, seed, pk);
PQCLEAN_KYBER1024_CLEAN_poly_frommsg(&k, m);
gen_at(at, seed);

for (size_t i = 0; i < KYBER_K; i++) {
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++);
}
for (size_t i = 0; i < KYBER_K; i++) {
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++);
}
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&epp, coins, nonce++);
@@ -252,14 +284,14 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c,
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&sp);

// matrix-vector multiplication
for (size_t i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
}

PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp);
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER1024_CLEAN_polyvec_invntt(&bp);
PQCLEAN_KYBER1024_CLEAN_poly_invntt(&v);
PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&v);

PQCLEAN_KYBER1024_CLEAN_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &epp);
@@ -271,18 +303,21 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c,
}

/*************************************************
* Name: indcpa_dec
* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_dec
*
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t *m,
const uint8_t *c,
const uint8_t *sk) {
void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
poly v, mp;

@@ -290,8 +325,8 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t *m,
unpack_sk(&skpv, sk);

PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&bp);
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp);
PQCLEAN_KYBER1024_CLEAN_poly_invntt(&mp);
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&mp);

PQCLEAN_KYBER1024_CLEAN_poly_sub(&mp, &v, &mp);
PQCLEAN_KYBER1024_CLEAN_poly_reduce(&mp);


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save