@@ -28,6 +28,7 @@ implementations: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- aes | |||
- avx2 | |||
@@ -1,14 +1,4 @@ | |||
kyber-20170627 | |||
Public Domain | |||
Authors: Joppe Bos, | |||
Léo Ducas, | |||
Eike Kiltz , | |||
Tancrède Lepoint, | |||
Vadim Lyubashevsky, | |||
John Schanck, | |||
Peter Schwabe, | |||
Gregor Seiler, | |||
Damien Stehlé | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
@@ -1,9 +1,40 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libkyber1024-90s_avx2.a | |||
HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h aes256ctr.h | |||
OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ | |||
verify.o indcpa.o rejsample.o aes256ctr.o | |||
HEADERS= \ | |||
aes256ctr.h \ | |||
align.h \ | |||
api.h \ | |||
cbd.h \ | |||
cdecl.inc \ | |||
consts.h \ | |||
fq.inc \ | |||
indcpa.h \ | |||
kem.h \ | |||
ntt.h \ | |||
params.h \ | |||
poly.h \ | |||
polyvec.h \ | |||
reduce.h \ | |||
rejsample.h \ | |||
shuffle.inc \ | |||
symmetric.h \ | |||
verify.h | |||
OBJECTS= \ | |||
aes256ctr.o \ | |||
basemul.o \ | |||
cbd.o \ | |||
consts.o \ | |||
fq.o \ | |||
indcpa.o \ | |||
invntt.o \ | |||
kem.o \ | |||
ntt.o \ | |||
poly.o \ | |||
polyvec.o \ | |||
rejsample.o \ | |||
shuffle.o \ | |||
verify.o | |||
CFLAGS=-mavx2 -maes -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ | |||
-Wmissing-prototypes -Wredundant-decls -std=c99 \ | |||
@@ -14,11 +45,8 @@ all: $(LIB) | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
%.o: %.s $(HEADERS) | |||
$(AS) -o $@ $< | |||
%.o: %.S $(HEADERS) | |||
$(AS) -c -o $@ $< | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
$(LIB): $(OBJECTS) | |||
$(AR) -r $@ $(OBJECTS) | |||
@@ -1,94 +1,68 @@ | |||
/* | |||
crypto_stream_aes256ctr | |||
based heavily on public-domain code by Romain Dolbeau | |||
Based heavily on public-domain code by Romain Dolbeau | |||
Different handling of nonce+counter than original version | |||
using separated 96-bit nonce and internal 32-bit counter, starting from zero | |||
using separated 64-bit nonce and internal 64-bit counter, starting from zero | |||
Public Domain | |||
*/ | |||
#include "aes256ctr.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
static inline void aesni_encrypt8(uint8_t *out, | |||
static inline void aesni_encrypt4(uint8_t out[64], | |||
__m128i *n, | |||
const __m128i rkeys[16]) { | |||
__m128i nv0; | |||
__m128i nv1; | |||
__m128i nv2; | |||
__m128i nv3; | |||
__m128i nv4; | |||
__m128i nv5; | |||
__m128i nv6; | |||
__m128i nv7; | |||
__m128i f, f0, f1, f2, f3, t; | |||
/* Load current counter value */ | |||
__m128i nv0i = _mm_load_si128(n); | |||
/* Increase counter in 8 consecutive blocks */ | |||
nv0 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(0, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv1 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(1, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv2 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(2, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv3 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(3, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv4 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(4, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv5 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(5, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv6 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(6, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
nv7 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(7, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); | |||
/* Write counter for next iteration, increased by 8 */ | |||
_mm_store_si128(n, _mm_add_epi32(nv0i, _mm_set_epi64x(8, 0))); | |||
/* Actual AES encryption, 8x interleaved */ | |||
__m128i temp0 = _mm_xor_si128(nv0, rkeys[0]); | |||
__m128i temp1 = _mm_xor_si128(nv1, rkeys[0]); | |||
__m128i temp2 = _mm_xor_si128(nv2, rkeys[0]); | |||
__m128i temp3 = _mm_xor_si128(nv3, rkeys[0]); | |||
__m128i temp4 = _mm_xor_si128(nv4, rkeys[0]); | |||
__m128i temp5 = _mm_xor_si128(nv5, rkeys[0]); | |||
__m128i temp6 = _mm_xor_si128(nv6, rkeys[0]); | |||
__m128i temp7 = _mm_xor_si128(nv7, rkeys[0]); | |||
for (uint8_t i = 1; i < 14; i++) { | |||
temp0 = _mm_aesenc_si128(temp0, rkeys[i]); | |||
temp1 = _mm_aesenc_si128(temp1, rkeys[i]); | |||
temp2 = _mm_aesenc_si128(temp2, rkeys[i]); | |||
temp3 = _mm_aesenc_si128(temp3, rkeys[i]); | |||
temp4 = _mm_aesenc_si128(temp4, rkeys[i]); | |||
temp5 = _mm_aesenc_si128(temp5, rkeys[i]); | |||
temp6 = _mm_aesenc_si128(temp6, rkeys[i]); | |||
temp7 = _mm_aesenc_si128(temp7, rkeys[i]); | |||
f = _mm_load_si128(n); | |||
/* Increase counter in 4 consecutive blocks */ | |||
t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); | |||
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); | |||
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); | |||
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); | |||
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); | |||
/* Write counter for next iteration, increased by 4 */ | |||
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); | |||
/* Actual AES encryption, 4x interleaved */ | |||
t = _mm_load_si128(&rkeys[0]); | |||
f0 = _mm_xor_si128(f0, t); | |||
f1 = _mm_xor_si128(f1, t); | |||
f2 = _mm_xor_si128(f2, t); | |||
f3 = _mm_xor_si128(f3, t); | |||
for (int i = 1; i < 14; i++) { | |||
t = _mm_load_si128(&rkeys[i]); | |||
f0 = _mm_aesenc_si128(f0, t); | |||
f1 = _mm_aesenc_si128(f1, t); | |||
f2 = _mm_aesenc_si128(f2, t); | |||
f3 = _mm_aesenc_si128(f3, t); | |||
} | |||
temp0 = _mm_aesenclast_si128(temp0, rkeys[14]); | |||
temp1 = _mm_aesenclast_si128(temp1, rkeys[14]); | |||
temp2 = _mm_aesenclast_si128(temp2, rkeys[14]); | |||
temp3 = _mm_aesenclast_si128(temp3, rkeys[14]); | |||
temp4 = _mm_aesenclast_si128(temp4, rkeys[14]); | |||
temp5 = _mm_aesenclast_si128(temp5, rkeys[14]); | |||
temp6 = _mm_aesenclast_si128(temp6, rkeys[14]); | |||
temp7 = _mm_aesenclast_si128(temp7, rkeys[14]); | |||
t = _mm_load_si128(&rkeys[14]); | |||
f0 = _mm_aesenclast_si128(f0, t); | |||
f1 = _mm_aesenclast_si128(f1, t); | |||
f2 = _mm_aesenclast_si128(f2, t); | |||
f3 = _mm_aesenclast_si128(f3, t); | |||
/* Write results */ | |||
_mm_storeu_si128((__m128i *)(out + 0), temp0); | |||
_mm_storeu_si128((__m128i *)(out + 16), temp1); | |||
_mm_storeu_si128((__m128i *)(out + 32), temp2); | |||
_mm_storeu_si128((__m128i *)(out + 48), temp3); | |||
_mm_storeu_si128((__m128i *)(out + 64), temp4); | |||
_mm_storeu_si128((__m128i *)(out + 80), temp5); | |||
_mm_storeu_si128((__m128i *)(out + 96), temp6); | |||
_mm_storeu_si128((__m128i *)(out + 112), temp7); | |||
_mm_storeu_si128((__m128i *)(out + 0), f0); | |||
_mm_storeu_si128((__m128i *)(out + 16), f1); | |||
_mm_storeu_si128((__m128i *)(out + 32), f2); | |||
_mm_storeu_si128((__m128i *)(out + 48), f3); | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, | |||
const uint8_t *key, | |||
uint16_t nonce) { | |||
__m128i key0 = _mm_loadu_si128((__m128i *)(key + 0)); | |||
__m128i key1 = _mm_loadu_si128((__m128i *)(key + 16)); | |||
__m128i temp0, temp1, temp2, temp4; | |||
size_t idx = 0; | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { | |||
__m128i key0, key1, temp0, temp1, temp2, temp4; | |||
int idx = 0; | |||
state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); | |||
key0 = _mm_loadu_si128((__m128i *)(key + 0)); | |||
key1 = _mm_loadu_si128((__m128i *)(key + 16)); | |||
state->n = _mm_loadl_epi64((__m128i *)&nonce); | |||
state->rkeys[idx++] = key0; | |||
temp0 = key0; | |||
@@ -137,38 +111,33 @@ void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, | |||
state->rkeys[idx++] = temp0; | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce) { | |||
state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, | |||
size_t nblocks, | |||
aes256ctr_ctx *state) { | |||
size_t i; | |||
size_t i = 0; | |||
for (i = 0; i < nblocks; i++) { | |||
aesni_encrypt8(out, &state->n, state->rkeys); | |||
out += 128; | |||
aesni_encrypt4(out, &state->n, state->rkeys); | |||
out += 64; | |||
} | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t *seed, | |||
uint8_t nonce) { | |||
size_t i; | |||
uint8_t buf[128]; | |||
const uint8_t seed[32], | |||
uint64_t nonce) { | |||
unsigned int i = 0; | |||
uint8_t buf[64]; | |||
aes256ctr_ctx state; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, (uint16_t)nonce << 8); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, nonce); | |||
while (outlen >= 128) { | |||
aesni_encrypt8(out, &state.n, state.rkeys); | |||
outlen -= 128; | |||
while (outlen >= 64) { | |||
aesni_encrypt4(out, &state.n, state.rkeys); | |||
outlen -= 64; | |||
} | |||
if (outlen) { | |||
aesni_encrypt8(buf, &state.n, state.rkeys); | |||
aesni_encrypt4(buf, &state.n, state.rkeys); | |||
for (i = 0; i < outlen; i++) { | |||
out[i] = buf[i]; | |||
} | |||
@@ -5,22 +5,17 @@ | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#define AES256CTR_NAMESPACE(s) pqcrystals_aes256ctr_avx2##s | |||
#define AES256CTR_BLOCKBYTES 64 | |||
/* State carried between AES-256-CTR calls: the expanded key schedule plus
 * the current counter block. */
typedef struct { | |||
/* Round keys filled in by the init routine; the block-encryption helper
 * reads entries 0 through 14. */
__m128i rkeys[16]; | |||
/* Counter block: loaded, incremented and written back by the
 * block-encryption helper on every call. */
__m128i n; | |||
} aes256ctr_ctx; | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, | |||
const uint8_t *key, | |||
uint16_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, | |||
size_t nblocks, | |||
aes256ctr_ctx *state); | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t *seed, | |||
uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state); | |||
void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t seed[32], uint64_t nonce); | |||
#endif |
@@ -0,0 +1,22 @@ | |||
/* Helper macros that expand to anonymous union types pairing a plain C
 * object with a SIMD vector member. Because a union is aligned as strictly
 * as its most-aligned member, the wrapped object picks up the alignment of
 * the vector type (__m128i or __m256i). */
#ifndef PQCLEAN_KYBER102490S_AVX2_ALIGN_H | |||
#define PQCLEAN_KYBER102490S_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
/* ALIGN16_TYPE(t): union of a single object of type t (member `orig`)
 * with a __m128i (member `vec`). */
#define ALIGN16_TYPE(t) \ | |||
union { \ | |||
__m128i vec; \ | |||
t orig; \ | |||
} | |||
/* ALIGN32_ARRAY(t, s): union of an s-element array of t (member `arr`)
 * with a __m256i (member `vec`). */
#define ALIGN32_ARRAY(t, s) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(s)]; \ | |||
} | |||
/* ALIGN32_ARRAY_2D(t, n, m): as ALIGN32_ARRAY, but `arr` is an n-by-m
 * two-dimensional array of t. */
#define ALIGN32_ARRAY_2D(t, n, m) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(n)][(m)]; \ | |||
} | |||
#endif |
@@ -1,4 +1,5 @@ | |||
#include "params.h" | |||
#include "cdecl.inc" | |||
.macro schoolbook off,sign | |||
#load | |||
@@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 | |||
vpaddd %ymm7,%ymm8,%ymm7 # y1 | |||
.endm | |||
.macro red a0,a1,b0,b1 x,y,z | |||
.macro red a0,a1,b0,b1,x,y,z | |||
#pack | |||
vpxor %ymm\x,%ymm\x,%ymm\x | |||
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y | |||
@@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 | |||
vpsubw %ymm\y,%ymm\b0,%ymm\b0 | |||
.endm | |||
.global PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xqinv(%rip),%ymm1 | |||
vmovdqu (%rcx),%ymm2 | |||
.text | |||
basemul64_acc_avx: | |||
poly0.0: | |||
schoolbook 0,0 | |||
@@ -117,7 +113,7 @@ vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6 7,8,9 | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
@@ -160,7 +156,7 @@ vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6 7,8,9 | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,64(%rdi) | |||
@@ -168,17 +164,40 @@ vmovdqa %ymm5,96(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER102490S_AVX2_basemul_avx | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx: | |||
.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx): | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xqinv(%rip),%ymm1 | |||
vmovdqu (%rcx),%ymm2 | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
ret | |||
basemul64_avx: | |||
schoolbook 0,0 | |||
#reduce | |||
red 14,9,12,7 8,10,11 | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,(%rdi) | |||
@@ -187,10 +206,39 @@ vmovdqa %ymm12,32(%rdi) | |||
schoolbook 64,1 | |||
#reduce | |||
red 14,9,12,7 8,10,11 | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,64(%rdi) | |||
vmovdqa %ymm12,96(%rdi) | |||
ret | |||
/* Exported entry point: runs basemul64_avx four times over consecutive
 * 128-byte chunks addressed by %rdi, %rsi and %rdx.
 * %rcx is used as the base of a constants table indexed with the
 * _16XQ / _16XQINV / _ZETAS_EXP offsets (scaled by 2 to bytes).
 * NOTE(review): presumably %rdi is the output and %rsi/%rdx the two inputs,
 * with %rcx pointing at PQCLEAN_KYBER102490S_AVX2_qdata - confirm against
 * the C prototype and callers. */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx): | |||
#consts | |||
/* %ymm0 / %ymm1 are loaded once from the _16XQ and _16XQINV rows and are
 * not reloaded between the four calls below. */
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
/* The zeta loads use vmovdqu: their byte offsets from the table base
 * ((_ZETAS_EXP+152)*2 etc.) are not multiples of 32. */
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_avx | |||
/* Next chunk: new zeta vector in %ymm2, all three pointers advance by
 * 128 bytes. The same pattern repeats for the remaining two chunks. */
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
ret |
@@ -1,27 +1,27 @@ | |||
#include "cbd.h" | |||
#include "params.h" | |||
#include "cbd.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: cbd | |||
* Name: PQCLEAN_KYBER102490S_AVX2_cbd | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
* - const unsigned char *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t *buf) { | |||
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { | |||
unsigned int i = 0; | |||
__m256i vec0, vec1, vec2, vec3, tmp; | |||
const __m256i mask55 = _mm256_set1_epi32(0x55555555); | |||
const __m256i mask33 = _mm256_set1_epi32(0x33333333); | |||
const __m256i mask03 = _mm256_set1_epi32(0x03030303); | |||
for (size_t i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); | |||
for (i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); | |||
vec1 = _mm256_srli_epi32(vec0, 1); | |||
vec0 = _mm256_and_si256(mask55, vec0); | |||
@@ -1,8 +1,11 @@ | |||
#ifndef CBD_H | |||
#define CBD_H | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_CBD_H | |||
#define PQCLEAN_KYBER102490S_AVX2_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t *buf); | |||
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
#endif |
@@ -0,0 +1,30 @@ | |||
/* Shared between C and assembly sources: offsets into the constants table
 * and the cdecl() symbol-name wrapper.
 *
 * NOTE(review): the guard macro below is spelled PQCLEAN_DILITHIUM2_AVX2_CDECL
 * although the other include guards in this implementation use the
 * PQCLEAN_KYBER102490S_AVX2_ prefix. If another header using this same guard
 * were included first in one translation unit, everything here would be
 * skipped silently - consider renaming the guard. */
#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL | |||
#define PQCLEAN_DILITHIUM2_AVX2_CDECL | |||
/* Offsets are counted in 16-bit elements; the assembly multiplies them by 2
 * to form byte displacements (e.g. `_16XQ*2(%rcx)`). The first eight rows
 * are 16 elements apart; _ZETAS_EXP and _ZETAS_INV_EXP are 400 elements
 * apart. */
#define _16XQ 0 | |||
#define _16XQINV 16 | |||
#define _16XV 32 | |||
#define _16XFLO 48 | |||
#define _16XFHI 64 | |||
#define _16XMONTSQLO 80 | |||
#define _16XMONTSQHI 96 | |||
#define _16XMASK 112 | |||
#define _ZETAS_EXP 128 | |||
#define _ZETAS_INV_EXP 528 | |||
/* The C ABI on MacOS exports all symbols with a leading | |||
* underscore. This means that any symbols we refer to from | |||
* C files (functions) can't be found, and all symbols we | |||
* refer to from ASM also can't be found (nttconsts.c). | |||
* | |||
* This define helps us get around this | |||
*/ | |||
/* cdecl(s) prepends an underscore to s when __WIN32__ or __APPLE__ is
 * defined and leaves s unchanged otherwise. */
#if defined(__WIN32__) || defined(__APPLE__) | |||
#define cdecl(s) _##s | |||
#else | |||
#define cdecl(s) s | |||
#endif | |||
#endif |
@@ -1,34 +1,155 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 
14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; | |||
const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 
48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; | |||
#include "consts.h" | |||
#include <stdint.h> | |||
#define Q KYBER_Q | |||
#define MONT ((1U << 16) % KYBER_Q) | |||
#define MONT ((1U << 16) % Q) | |||
#define QINV 62209 // q^-1 mod 2^16 | |||
#define V ((1U << 26)/KYBER_Q + 1) | |||
#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) | |||
#define FLO (FHI * QINV % 65536) | |||
#define MONTSQHI (MONT * MONT % KYBER_Q) | |||
#define MONTSQLO (MONTSQHI * QINV % 65536) | |||
#define V (((1U << 26) + Q/2)/Q) | |||
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) | |||
#define FLO (FHI*QINV % 65536) | |||
#define MONTSQHI (MONT*MONT % Q) | |||
#define MONTSQLO (MONTSQHI*QINV % 65536) | |||
#define MASK 4095 | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; | |||
const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; | |||
#undef Q | |||
#undef QINV | |||
#undef MONT | |||
#undef V | |||
#undef FLO | |||
#undef FHI | |||
#undef MONTSQLO | |||
#undef MONTSQHI | |||
#undef MASK | |||
const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.as_arr = { | |||
#define _16XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, | |||
#define _16XQINV 16 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
#define _16XV 32 | |||
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, | |||
#define _16XFLO 48 | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
#define _16XFHI 64 | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
#define _16XMONTSQLO 80 | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
#define _16XMONTSQHI 96 | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
#define _16XMASK 112 | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
#define _ZETAS_EXP 128 | |||
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, | |||
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, | |||
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, | |||
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, | |||
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, | |||
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, | |||
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, | |||
3158, 3158, 3158, 3158, 622, 622, 622, 622, | |||
1577, 1577, 1577, 1577, 182, 182, 182, 182, | |||
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, | |||
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, | |||
573, 573, 2004, 2004, 264, 264, 383, 383, | |||
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, | |||
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, | |||
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, | |||
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, | |||
516, 3321, 3009, 2663, 1711, 2167, 126, 1469, | |||
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, | |||
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, | |||
2226, 555, 2078, 1550, 422, 177, 3038, 1574, | |||
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, | |||
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, | |||
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, | |||
430, 843, 871, 105, 587, 3094, 2869, 1653, | |||
778, 3182, 1483, 1119, 644, 349, 329, 3254, | |||
788, 788, 1812, 1812, 28191, 28191, 28191, 28191, | |||
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, | |||
48842, 48842, 48842, 48842, 287, 287, 287, 287, | |||
287, 287, 287, 287, 202, 202, 202, 202, | |||
202, 202, 202, 202, 10690, 10690, 10690, 10690, | |||
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, | |||
31164, 31164, 31164, 31164, 962, 962, 962, 962, | |||
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, | |||
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, | |||
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, | |||
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, | |||
732, 732, 608, 608, 1787, 1787, 411, 411, | |||
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, | |||
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, | |||
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, | |||
448, 2264, 677, 2054, 34353, 25435, 58154, 24392, | |||
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, | |||
31637, 28644, 23998, 48114, 817, 603, 1322, 1864, | |||
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, | |||
3221, 996, 958, 1522, 20297, 2146, 15356, 33152, | |||
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, | |||
41677, 45279, 7757, 23132, 1097, 610, 2044, 384, | |||
3193, 1994, 220, 1670, 1799, 794, 2475, 478, | |||
3021, 991, 1869, 1628, 0, 0, 0, 0, | |||
#define _ZETAS_INV_EXP 528 | |||
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, | |||
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, | |||
1701, 1460, 2338, 308, 2851, 854, 2535, 1530, | |||
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, | |||
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, | |||
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, | |||
1807, 2371, 2333, 108, 870, 1510, 1278, 1185, | |||
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, | |||
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, | |||
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, | |||
1275, 2652, 1065, 2881, 725, 1508, 2368, 398, | |||
951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, | |||
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, | |||
1571, 1571, 205, 205, 2918, 2918, 1542, 1542, | |||
2721, 2721, 2597, 2597, 2312, 2312, 681, 681, | |||
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, | |||
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, | |||
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, | |||
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, | |||
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, | |||
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, | |||
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, | |||
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, | |||
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, | |||
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, | |||
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, | |||
2210, 1846, 147, 2551, 1676, 460, 235, 2742, | |||
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, | |||
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, | |||
45043, 32227, 11478, 335, 156, 2911, 872, 1590, | |||
602, 777, 2170, 246, 1755, 291, 3152, 2907, | |||
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, | |||
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, | |||
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, | |||
666, 320, 8, 2813, 1544, 282, 1838, 1293, | |||
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, | |||
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, | |||
48173, 48173, 5828, 5828, 130, 130, 1602, 1602, | |||
1871, 1871, 829, 829, 2946, 2946, 3065, 3065, | |||
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, | |||
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, | |||
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, | |||
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, | |||
171, 171, 171, 171, 12403, 12403, 12403, 12403, | |||
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, | |||
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, | |||
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, | |||
1836, 1836, 1836, 1836, 50791, 50791, 359, 359, | |||
60300, 60300, 1932, 1932, 0, 0, 0, 0 | |||
} | |||
}; |
@@ -1,24 +1,20 @@ | |||
#ifndef CONSTS_H | |||
#define CONSTS_H | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_CONSTS_H | |||
#define PQCLEAN_KYBER102490S_AVX2_CONSTS_H | |||
#include "cdecl.inc" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
typedef union { | |||
uint16_t as_arr[16]; | |||
__m256i as_vec; | |||
} aligned_uint16_t; | |||
/*
 * ALIGNED_UINT16_T(N): expands to an anonymous union that overlays an array
 * of N uint16_t (as_arr) with a single __m256i (as_vec). The as_vec member
 * gives the whole union the alignment of __m256i, so as_arr can be the
 * target of aligned vector loads. Only the first sizeof(__m256i) bytes of
 * as_arr are aliased by as_vec when N is larger than one vector.
 */
#define ALIGNED_UINT16_T(N) \ | |||
union { \ | |||
__m256i as_vec; \ | |||
uint16_t as_arr[(N)]; \ | |||
} | |||
extern const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_exp[396]; | |||
extern const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp[396]; | |||
typedef ALIGNED_UINT16_T(928) qdata_t; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xq; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xqinv; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xv; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xflo; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xfhi; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqlo; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqhi; | |||
extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmask; | |||
extern const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata; | |||
#endif |
@@ -0,0 +1,129 @@ | |||
#include "cdecl.inc" | |||
.include "fq.inc" | |||
.text | |||
# Local helper: applies the red16 macro (fq.inc) in place to 128 16-bit
# words, i.e. the 256 bytes starting at (%rdi).
# Inputs: %rdi = data pointer. The vmovdqa accesses are the aligned form,
# so the pointer has to be aligned to a full ymm vector.
# red16 reads %ymm0 and %ymm1 as constants; the exported wrapper that
# follows loads them from the _16XQ and _16XV table entries.
# Clobbers: %ymm2-%ymm15. %rdi itself is not modified here.
reduce128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm2 | |||
vmovdqa 32(%rdi),%ymm3 | |||
vmovdqa 64(%rdi),%ymm4 | |||
vmovdqa 96(%rdi),%ymm5 | |||
vmovdqa 128(%rdi),%ymm6 | |||
vmovdqa 160(%rdi),%ymm7 | |||
vmovdqa 192(%rdi),%ymm8 | |||
vmovdqa 224(%rdi),%ymm9 | |||
# First macro argument is the data register (updated in place), second is
# a scratch register. %ymm10 and %ymm11 are reused for the last two calls.
red16 2,10 | |||
red16 3,11 | |||
red16 4,12 | |||
red16 5,13 | |||
red16 6,14 | |||
red16 7,15 | |||
red16 8,10 | |||
red16 9,11 | |||
#store | |||
vmovdqa %ymm2,(%rdi) | |||
vmovdqa %ymm3,32(%rdi) | |||
vmovdqa %ymm4,64(%rdi) | |||
vmovdqa %ymm5,96(%rdi) | |||
vmovdqa %ymm6,128(%rdi) | |||
vmovdqa %ymm7,160(%rdi) | |||
vmovdqa %ymm8,192(%rdi) | |||
vmovdqa %ymm9,224(%rdi) | |||
ret | |||
# Exported entry point: runs reduce128_avx over two consecutive 256-byte
# halves, i.e. 512 bytes (256 16-bit words) reduced in place.
# Inputs: %rdi = data pointer, %rsi = base address of the constants table.
# The table indices _16XQ and _16XV are scaled by 2 in the addressing,
# presumably because they count 16-bit entries - confirm in consts.h.
# Note that %rdi is left advanced by 256 when this routine returns.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
vmovdqa _16XV*2(%rsi),%ymm1 | |||
# Lower half first, then step %rdi forward and handle the upper half.
call reduce128_avx | |||
add $256,%rdi | |||
call reduce128_avx | |||
ret | |||
# Local helper: applies the csubq macro (fq.inc) in place to 128 16-bit
# words, i.e. the 256 bytes starting at (%rdi).
# csubq subtracts %ymm0 from every word and adds %ymm0 back in the lanes
# whose result became negative, so it is a conditional subtraction of the
# value held in %ymm0.
# Inputs: %rdi = data pointer (aligned vmovdqa accesses), %ymm0 = the
# constant loaded by the exported wrapper that follows.
# Clobbers: %ymm1-%ymm15. %rdi itself is not modified here.
csubq128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm1 | |||
vmovdqa 32(%rdi),%ymm2 | |||
vmovdqa 64(%rdi),%ymm3 | |||
vmovdqa 96(%rdi),%ymm4 | |||
vmovdqa 128(%rdi),%ymm5 | |||
vmovdqa 160(%rdi),%ymm6 | |||
vmovdqa 192(%rdi),%ymm7 | |||
vmovdqa 224(%rdi),%ymm8 | |||
# First macro argument is the data register (updated in place), second is
# a scratch register. %ymm9 is reused for the last call.
csubq 1,9 | |||
csubq 2,10 | |||
csubq 3,11 | |||
csubq 4,12 | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,9 | |||
#store | |||
vmovdqa %ymm1,(%rdi) | |||
vmovdqa %ymm2,32(%rdi) | |||
vmovdqa %ymm3,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm5,128(%rdi) | |||
vmovdqa %ymm6,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm8,224(%rdi) | |||
ret | |||
# Exported entry point: runs csubq128_avx over two consecutive 256-byte
# halves, i.e. 512 bytes (256 16-bit words) processed in place.
# Inputs: %rdi = data pointer, %rsi = base address of the constants table
# from which the _16XQ entry is loaded into %ymm0.
# Note that %rdi is left advanced by 256 when this routine returns.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
# Lower half first, then step %rdi forward and handle the upper half.
call csubq128_avx | |||
add $256,%rdi | |||
call csubq128_avx | |||
ret | |||
# Local helper: applies the fqmulprecomp macro (fq.inc) in place to 128
# 16-bit words, i.e. the 256 bytes starting at (%rdi).
# Every data vector is combined with the same constant pair in %ymm1 (al
# argument) and %ymm2 (ah argument); the macro also reads %ymm0. The
# result is left in the data register, which is what gets stored back.
# Inputs: %rdi = data pointer (aligned vmovdqa accesses); %ymm0-%ymm2 are
# loaded by the exported wrapper that follows.
# Clobbers: %ymm3-%ymm15. %rdi itself is not modified here.
tomont128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm3 | |||
vmovdqa 32(%rdi),%ymm4 | |||
vmovdqa 64(%rdi),%ymm5 | |||
vmovdqa 96(%rdi),%ymm6 | |||
vmovdqa 128(%rdi),%ymm7 | |||
vmovdqa 160(%rdi),%ymm8 | |||
vmovdqa 192(%rdi),%ymm9 | |||
vmovdqa 224(%rdi),%ymm10 | |||
# Arguments are al, ah, data register, scratch register. Scratch
# registers %ymm11-%ymm13 are reused for the last three calls.
fqmulprecomp 1,2,3,11 | |||
fqmulprecomp 1,2,4,12 | |||
fqmulprecomp 1,2,5,13 | |||
fqmulprecomp 1,2,6,14 | |||
fqmulprecomp 1,2,7,15 | |||
fqmulprecomp 1,2,8,11 | |||
fqmulprecomp 1,2,9,12 | |||
fqmulprecomp 1,2,10,13 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
vmovdqa %ymm4,32(%rdi) | |||
vmovdqa %ymm5,64(%rdi) | |||
vmovdqa %ymm6,96(%rdi) | |||
vmovdqa %ymm7,128(%rdi) | |||
vmovdqa %ymm8,160(%rdi) | |||
vmovdqa %ymm9,192(%rdi) | |||
vmovdqa %ymm10,224(%rdi) | |||
ret | |||
# Exported entry point: runs tomont128_avx over two consecutive 256-byte
# halves, i.e. 512 bytes (256 16-bit words) processed in place.
# Inputs: %rdi = data pointer, %rsi = base address of the constants table.
# Loads the _16XQ entry into %ymm0 and the _16XMONTSQLO / _16XMONTSQHI
# entries into %ymm1 / %ymm2. NOTE(review): going by the names these look
# like the low and high precomputed parts of the Montgomery-squared
# factor - confirm against the table definition in consts.c.
# Note that %rdi is left advanced by 256 when this routine returns.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 | |||
vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 | |||
# Lower half first, then step %rdi forward and handle the upper half.
call tomont128_avx | |||
add $256,%rdi | |||
call tomont128_avx | |||
ret |
@@ -1,24 +1,27 @@ | |||
.macro red16 r x=12 | |||
.macro red16 r,x=12 | |||
vpmulhw %ymm1,%ymm\r,%ymm\x | |||
vpsraw $10,%ymm\x,%ymm\x | |||
vpmullw %ymm0,%ymm\x,%ymm\x | |||
vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro csubq r x=12 | |||
.macro csubq r,x=12 | |||
vpsubw %ymm0,%ymm\r,%ymm\r | |||
vpsraw $15,%ymm\r,%ymm\x | |||
vpand %ymm0,%ymm\x,%ymm\x | |||
vpaddw %ymm\x,%ymm\r,%ymm\r | |||
#vpcmpgtw %ymm0,%ymm\r,%ymm\x | |||
#vpand %ymm0,%ymm\x,%ymm\x | |||
#vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro caddq r x=12 | |||
.macro caddq r,x=12 | |||
vpsraw $15,%ymm\r,%ymm\x | |||
vpand %ymm0,%ymm\x,%ymm\x | |||
vpaddw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro fqmulprecomp al,ah,b x=12 | |||
.macro fqmulprecomp al,ah,b,x=12 | |||
vpmullw %ymm\al,%ymm\b,%ymm\x | |||
vpmulhw %ymm\ah,%ymm\b,%ymm\b | |||
vpmulhw %ymm0,%ymm\x,%ymm\x | |||
@@ -1,26 +1,33 @@ | |||
#include "align.h" | |||
#include "cbd.h" | |||
#include "indcpa.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: pack_pk | |||
* | |||
* Description: Serialize the public key as concatenation of the | |||
* compressed and serialized vector of polynomials pk | |||
* serialized vector of polynomials pk | |||
* and the public seed used to generate the matrix A. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* const poly *pk: pointer to the input public-key polynomial | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
polyvec *pk, | |||
const uint8_t seed[KYBER_SYMBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(r, pk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
r[i + KYBER_POLYVECBYTES] = seed[i]; | |||
} | |||
} | |||
@@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
/************************************************* | |||
* Name: unpack_pk | |||
* | |||
* Description: De-serialize and decompress public key from a byte array; | |||
* Description: De-serialize public key from a byte array; | |||
* approximate inverse of pack_pk | |||
* | |||
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* - const uint8_t *packedpk: pointer to input serialized public key | |||
**************************************************/ | |||
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
static void unpack_pk(polyvec *pk, | |||
uint8_t seed[KYBER_SYMBYTES], | |||
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(pk, packedpk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
seed[i] = packedpk[i + KYBER_POLYVECBYTES]; | |||
} | |||
} | |||
@@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
* Description: Serialize the secret key | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - const polyvec *sk: pointer to input vector of polynomials (secret key) | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t *r, polyvec *sk) { | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(r, sk); | |||
} | |||
@@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials | |||
* (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
* compressed and serialized vector of polynomials b | |||
* and the compressed and serialized polynomial v | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* const poly *pk: pointer to the input vector of polynomials b | |||
* const uint8_t *seed: pointer to the input polynomial v | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* polyvec *b: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_compress(r, b); | |||
PQCLEAN_KYBER102490S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER102490S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
/************************************************* | |||
* Name: rej_uniform | |||
* | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
**************************************************/ | |||
static unsigned int rej_uniform(int16_t *r, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr = 0, pos = 0; | |||
uint16_t val = 0; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
if (val < 19 * KYBER_Q) { | |||
@@ -116,46 +150,47 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t | |||
return ctr; | |||
} | |||
#define gen_a(A,B) gen_matrix(A,B,0) | |||
#define gen_at(A,B) gen_matrix(A,B,1) | |||
#define gen_a(A,B) PQCLEAN_KYBER102490S_AVX2_gen_matrix(A,B,0) | |||
#define gen_at(A,B) PQCLEAN_KYBER102490S_AVX2_gen_matrix(A,B,1) | |||
/************************************************* | |||
* Name: gen_matrix | |||
* Name: PQCLEAN_KYBER102490S_AVX2_gen_matrix | |||
* | |||
* Description: Deterministically generate matrix A (or the transpose of A) | |||
* from a seed. Entries of the matrix are polynomials that look | |||
* uniformly random. Performs rejection sampling on output of | |||
* a XOF | |||
* | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
**************************************************/ | |||
#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ | |||
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
size_t ctr; | |||
union { | |||
uint8_t x[XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; | |||
__m256i _dummy; | |||
} buf; | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { | |||
unsigned int ctr = 0, i = 0, j = 0; | |||
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; | |||
ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; | |||
aes256ctr_ctx state; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, 0); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_K; j++) { | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_K; j++) { | |||
if (transposed) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (i << 8) + j); | |||
nonce.orig = (j << 8) | i; | |||
} else { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (j << 8) + i); | |||
nonce.orig = (i << 8) | j; | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.x, GEN_MATRIX_MAXNBLOCKS, &state); | |||
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf.x, GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); | |||
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); | |||
while (ctr < KYBER_N) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.x, 1, &state); | |||
ctr += rej_uniform_ref(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.x, XOF_BLOCKBYTES); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, | |||
XOF_BLOCKBYTES); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(&a[i].vec[j]); | |||
@@ -164,47 +199,53 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_keypair | |||
* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_keypair | |||
* | |||
* Description: Generates public and private key for the CPA-secure | |||
* public-key encryption scheme underlying Kyber | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
* Arguments: - uint8_t *pk: pointer to output public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[KYBER_K], skpv, e, pkpv; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
const uint8_t *publicseed = buf; | |||
const uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
uint8_t nonce = 0; | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
unsigned int i = 0; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
const uint8_t *publicseed = buf.arr; | |||
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_g(buf, buf, KYBER_SYMBYTES); | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
gen_a(a, publicseed); | |||
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; | |||
aes256ctr_ctx state; | |||
uint8_t coins[128]; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, 0); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(skpv.vec + i, coins); | |||
ALIGN32_ARRAY(uint8_t, 128) coins; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&skpv.vec[i], coins.arr); | |||
} | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(e.vec + i, coins); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&e.vec[i], coins.arr); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&skpv); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&e); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); | |||
PQCLEAN_KYBER102490S_AVX2_poly_frommont(pkpv.vec + i); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_AVX2_poly_tomont(&pkpv.vec[i]); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_add(&pkpv, &pkpv, &e); | |||
@@ -215,58 +256,67 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_enc | |||
* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_enc | |||
* | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) | |||
* to deterministically generate all randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins) { | |||
polyvec at[KYBER_K], pkpv, sp, ep, bp; | |||
poly k, v, epp; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
uint8_t nonce = 0; | |||
unpack_pk(&pkpv, seed, pk); | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
const uint8_t coins[KYBER_SYMBYTES]) { | |||
unsigned int i = 0; | |||
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed.arr, pk); | |||
PQCLEAN_KYBER102490S_AVX2_poly_frommsg(&k, m); | |||
gen_at(at, seed); | |||
gen_at(at, seed.arr); | |||
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; | |||
aes256ctr_ctx state; | |||
uint8_t buf[128]; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, 0); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(sp.vec + i, buf); | |||
ALIGN32_ARRAY(uint8_t, 128) buf; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&sp.vec[i], buf.arr); | |||
} | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(ep.vec + i, buf); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&ep.vec[i], buf.arr); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf.arr); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(&bp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt(&v); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &epp); | |||
@@ -278,18 +328,21 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t *c, | |||
} | |||
/************************************************* | |||
* Name: indcpa_dec | |||
* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_dec | |||
* | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk) { | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
poly v, mp; | |||
@@ -297,8 +350,8 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t *m, | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt(&mp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_sub(&mp, &v, &mp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_reduce(&mp); | |||
@@ -1,21 +1,16 @@ | |||
#ifndef INDCPA_H | |||
#define INDCPA_H | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_INDCPA_H | |||
#define PQCLEAN_KYBER102490S_AVX2_INDCPA_H | |||
#include "params.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair( | |||
uint8_t *pk, | |||
uint8_t *sk); | |||
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc( | |||
uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins); | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec( | |||
uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk); | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); | |||
#endif |
@@ -1,7 +1,8 @@ | |||
#include "cdecl.inc" | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 | |||
#update & mul | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm12 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm13 | |||
@@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2 | |||
vpsubw %ymm\rh3,%ymm15,%ymm\rh3 | |||
.endm | |||
.global PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx | |||
.p2align 5 | |||
PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 | |||
.text | |||
invntt_levels0t5_avx: | |||
level0: | |||
#zetas | |||
vmovdqu (%rsi),%ymm15 | |||
@@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly 4,5,8,9,6,7,10,11 15,3,1,2 | |||
butterfly 4,5,8,9,6,7,10,11,15,3,1,2 | |||
level1: | |||
#zetas | |||
vmovdqu 128(%rsi),%ymm3 | |||
vmovdqu 160(%rsi),%ymm2 | |||
butterfly 4,5,6,7,8,9,10,11 3,3,2,2 | |||
butterfly 4,5,6,7,8,9,10,11,3,3,2,2 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
@@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10 | |||
vmovdqu 224(%rsi),%ymm2 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xv(%rip),%ymm1 | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
butterfly 3,4,6,8,5,7,9,11 10,10,2,2 | |||
butterfly 3,4,6,8,5,7,9,11,10,10,2,2 | |||
red16 3 | |||
@@ -95,7 +92,7 @@ level3: | |||
vmovdqu 256(%rsi),%ymm9 | |||
vmovdqu 288(%rsi),%ymm2 | |||
butterfly 10,3,6,5,4,8,7,11 9,9,2,2 | |||
butterfly 10,3,6,5,4,8,7,11,9,9,2,2 | |||
red16 10 | |||
@@ -109,7 +106,7 @@ level4: | |||
vmovdqu 320(%rsi),%ymm7 | |||
vmovdqu 352(%rsi),%ymm2 | |||
butterfly 9,10,6,4,3,5,8,11 7,7,2,2 | |||
butterfly 9,10,6,4,3,5,8,11,7,7,2,2 | |||
red16 9 | |||
@@ -123,7 +120,7 @@ level5: | |||
vpbroadcastd 384(%rsi),%ymm8 | |||
vpbroadcastd 388(%rsi),%ymm2 | |||
butterfly 7,9,6,3,10,4,5,11 8,8,2,2 | |||
butterfly 7,9,6,3,10,4,5,11,8,8,2,2 | |||
red16 7 | |||
@@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER768_AVX2_invntt_level6_avx | |||
PQCLEAN_KYBER768_AVX2_invntt_level6_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 | |||
invntt_level6_avx: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm1 | |||
vpbroadcastd 4(%rsi),%ymm2 | |||
@@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xflo(%rip),%ymm12 | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xfhi(%rip),%ymm13 | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,256(%rdi) | |||
@@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
fqmulprecomp 12,13,4 8 | |||
fqmulprecomp 12,13,5 9 | |||
fqmulprecomp 12,13,6 10 | |||
fqmulprecomp 12,13,7 11 | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
@@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xflo(%rip),%ymm12 | |||
vmovdqa PQCLEAN_KYBER768_AVX2_16xfhi(%rip),%ymm13 | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,384(%rdi) | |||
@@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi) | |||
vmovdqa %ymm10,448(%rdi) | |||
vmovdqa %ymm11,480(%rdi) | |||
fqmulprecomp 12,13,4 8 | |||
fqmulprecomp 12,13,5 9 | |||
fqmulprecomp 12,13,6 10 | |||
fqmulprecomp 12,13,7 11 | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,128(%rdi) | |||
@@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi) | |||
vmovdqa %ymm7,224(%rdi) | |||
ret | |||
# Exported entry point that drives the inverse NTT over 512 bytes of data.
# Inputs: %rdi = data pointer, %rsi = base address of the constants table.
# Register roles set up here for the local routines:
#   %ymm0 = the _16XQ table entry
#   %rdx  = unchanged table base (the level routines load further
#           constants such as _16XV, _16XFLO and _16XFHI relative to it)
#   %rsi  = cursor into the table, starting at the _ZETAS_INV_EXP entry
.global cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
mov %rsi,%rdx | |||
add $_ZETAS_INV_EXP*2,%rsi | |||
# Levels 0 to 5 on the lower 256 bytes.
call invntt_levels0t5_avx | |||
# Levels 0 to 5 on the upper 256 bytes. The cursor moves on by 392 bytes,
# which matches the span read by one levels0t5 call (its last load is a
# 4-byte broadcast at offset 388).
add $256,%rdi | |||
add $392,%rsi | |||
call invntt_levels0t5_avx | |||
# Rewind %rdi to the start of the data and run level 6 over all of it with
# the next block of table entries.
sub $256,%rdi | |||
add $392,%rsi | |||
call invntt_level6_avx | |||
ret |
@@ -1,217 +0,0 @@ | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
# butterfly: four vectorised butterflies on the register pairs
# (rl0,rh0) .. (rl3,rh3), all on 16-bit lanes. For each pair:
#   rl <- rl + rh
#   rh <- mulhi(zh, rl - rh) - mulhi(%ymm0, mullo(zl, rl - rh))
# using the values rl and rh had on entry. Pairs 0 and 1 use zl0/zh0,
# pairs 2 and 3 use zl1/zh1 (defaults: %ymm1 for zl, %ymm2 for zh).
# %ymm0 is read as a constant (the callers load the 16xq value into it).
# NOTE(review): this has the shape of a Montgomery multiplication by a
# twiddle factor with zl a precomputed companion of zh - confirm against
# the zetas table.
# Clobbers %ymm12-%ymm15 as temporaries.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 | |||
#update & mul | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm12 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm13 | |||
vpsubw %ymm\rh2,%ymm\rl2,%ymm14 | |||
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 | |||
vpmullw %ymm\zl0,%ymm12,%ymm\rh0 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 | |||
vpmullw %ymm\zl0,%ymm13,%ymm\rh1 | |||
# Both uses of zl0 are above this point, so zl0 may be %ymm15 (level 0
# passes it that way) even though %ymm15 is overwritten by the next line.
vpsubw %ymm\rh3,%ymm\rl3,%ymm15 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 | |||
vpmullw %ymm\zl1,%ymm14,%ymm\rh2 | |||
vpmullw %ymm\zl1,%ymm15,%ymm\rh3 | |||
vpmulhw %ymm\zh0,%ymm12,%ymm12 | |||
vpmulhw %ymm\zh0,%ymm13,%ymm13 | |||
vpmulhw %ymm\zh1,%ymm14,%ymm14 | |||
vpmulhw %ymm\zh1,%ymm15,%ymm15 | |||
#reduce | |||
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 | |||
vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 | |||
vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 | |||
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 | |||
vpsubw %ymm\rh0,%ymm12,%ymm\rh0 | |||
vpsubw %ymm\rh1,%ymm13,%ymm\rh1 | |||
vpsubw %ymm\rh2,%ymm14,%ymm\rh2 | |||
vpsubw %ymm\rh3,%ymm15,%ymm\rh3 | |||
.endm | |||
# Runs six butterfly levels (labels level0 .. level5) in place over the 256
# bytes at (%rdi), i.e. 128 16-bit words held in %ymm4-%ymm11 on load.
# Inputs: %rdi = data pointer (aligned vmovdqa accesses), %rsi = pointer
# to the twiddle values for this block. 392 bytes are read from %rsi: the
# last load is a 4-byte broadcast at offset 388.
# Constants come from the global symbols 16xq (into %ymm0) and 16xv (into
# %ymm1 from level 2 on, for red16) via rip-relative addressing.
# Between levels the shuffle1/2/4/8 macros from shuffle.inc regroup the
# registers; the exact lane permutation is defined there, not here.
# Clobbers %ymm0-%ymm15. %rdi and %rsi are not modified.
.global PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx | |||
.p2align 5 | |||
PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 | |||
level0: | |||
#zetas | |||
# Four twiddle vectors: %ymm15 and %ymm3 are passed as zl0 and zl1, %ymm1
# and %ymm2 as zh0 and zh1 of the butterfly call that follows.
vmovdqu (%rsi),%ymm15 | |||
vmovdqu 64(%rsi),%ymm3 | |||
vmovdqu 32(%rsi),%ymm1 | |||
vmovdqu 96(%rsi),%ymm2 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly 4,5,8,9,6,7,10,11 15,3,1,2 | |||
level1: | |||
#zetas | |||
# From here on each level loads one zl vector and one zh vector and uses
# them for all four register pairs.
vmovdqu 128(%rsi),%ymm3 | |||
vmovdqu 160(%rsi),%ymm2 | |||
butterfly 4,5,6,7,8,9,10,11 3,3,2,2 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
shuffle1 8,9,6,9 | |||
shuffle1 10,11,8,11 | |||
level2: | |||
#zetas | |||
vmovdqu 192(%rsi),%ymm10 | |||
vmovdqu 224(%rsi),%ymm2 | |||
#consts | |||
# %ymm1 is free again after level 0 and now holds the constant that red16
# multiplies by.
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1 | |||
butterfly 3,4,6,8,5,7,9,11 10,10,2,2 | |||
# Only the first sum register is passed through red16 at each of the
# levels 2 to 5 (default scratch register %ymm12).
red16 3 | |||
shuffle2 3,4,10,4 | |||
shuffle2 6,8,3,8 | |||
shuffle2 5,7,6,7 | |||
shuffle2 9,11,5,11 | |||
level3: | |||
#zetas | |||
vmovdqu 256(%rsi),%ymm9 | |||
vmovdqu 288(%rsi),%ymm2 | |||
butterfly 10,3,6,5,4,8,7,11 9,9,2,2 | |||
red16 10 | |||
shuffle4 10,3,9,3 | |||
shuffle4 6,5,10,5 | |||
shuffle4 4,8,6,8 | |||
shuffle4 7,11,4,11 | |||
level4: | |||
#zetas | |||
vmovdqu 320(%rsi),%ymm7 | |||
vmovdqu 352(%rsi),%ymm2 | |||
butterfly 9,10,6,4,3,5,8,11 7,7,2,2 | |||
red16 9 | |||
shuffle8 9,10,7,10 | |||
shuffle8 6,4,9,4 | |||
shuffle8 3,5,6,5 | |||
shuffle8 8,11,3,11 | |||
level5: | |||
#zetas | |||
# Level 5 uses a single 32-bit twiddle value per vector, broadcast to all
# lanes.
vpbroadcastd 384(%rsi),%ymm8 | |||
vpbroadcastd 388(%rsi),%ymm2 | |||
butterfly 7,9,6,3,10,4,5,11 8,8,2,2 | |||
red16 7 | |||
#store | |||
# The register order below follows the operand order of the level 5
# butterfly: its four rl registers first, then its four rh registers.
vmovdqa %ymm7,(%rdi) | |||
vmovdqa %ymm9,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm3,96(%rdi) | |||
vmovdqa %ymm10,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm5,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
# Old inverse NTT, level 6 (last level), over the whole 512-byte polynomial.
# rdi: coefficients, processed in two groups: bytes 0-127 paired with
#      256-383, then bytes 128-255 paired with 384-511.
# rsi: two 32-bit zeta words, broadcast into ymm1 and ymm2 (the default
#      zl*/zh* registers of the butterfly macro).
# After each butterfly the rh outputs (ymm8-ymm11) are stored first; the rl
# outputs (ymm4-ymm7) then go through fqmulprecomp with the 16xflo/16xfhi
# constants, and ymm8-ymm11 are passed as its last argument (presumably as
# scratch; confirm in fq.inc).
.global PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx | |||
PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm1 | |||
vpbroadcastd 4(%rsi),%ymm2 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 256(%rdi),%ymm8 | |||
vmovdqa 288(%rdi),%ymm9 | |||
vmovdqa 320(%rdi),%ymm10 | |||
vmovdqa 352(%rdi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
# ymm12/ymm13 were butterfly scratch; reload them with the fqmulprecomp constants
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xflo(%rip),%ymm12 | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xfhi(%rip),%ymm13 | |||
#store | |||
vmovdqa %ymm8,256(%rdi) | |||
vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
fqmulprecomp 12,13,4 8 | |||
fqmulprecomp 12,13,5 9 | |||
fqmulprecomp 12,13,6 10 | |||
fqmulprecomp 12,13,7 11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
# second group: same sequence, 128 bytes further into each half
#load | |||
vmovdqa 128(%rdi),%ymm4 | |||
vmovdqa 160(%rdi),%ymm5 | |||
vmovdqa 192(%rdi),%ymm6 | |||
vmovdqa 224(%rdi),%ymm7 | |||
vmovdqa 384(%rdi),%ymm8 | |||
vmovdqa 416(%rdi),%ymm9 | |||
vmovdqa 448(%rdi),%ymm10 | |||
vmovdqa 480(%rdi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xflo(%rip),%ymm12 | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xfhi(%rip),%ymm13 | |||
#store | |||
vmovdqa %ymm8,384(%rdi) | |||
vmovdqa %ymm9,416(%rdi) | |||
vmovdqa %ymm10,448(%rdi) | |||
vmovdqa %ymm11,480(%rdi) | |||
fqmulprecomp 12,13,4 8 | |||
fqmulprecomp 12,13,5 9 | |||
fqmulprecomp 12,13,6 10 | |||
fqmulprecomp 12,13,7 11 | |||
#store | |||
vmovdqa %ymm4,128(%rdi) | |||
vmovdqa %ymm5,160(%rdi) | |||
vmovdqa %ymm6,192(%rdi) | |||
vmovdqa %ymm7,224(%rdi) | |||
ret |
@@ -1,103 +1,127 @@ | |||
#include "api.h" | |||
#include "align.h" | |||
#include "indcpa.h" | |||
#include "kem.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "symmetric.h" | |||
#include "verify.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
/************************************************* | |||
* Name: crypto_kem_keypair | |||
* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair | |||
* | |||
* Description: Generates public and private key | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
size_t i; | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; | |||
} | |||
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ | |||
/* Value z for pseudo-random output on reject */ | |||
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: crypto_kem_enc | |||
* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc | |||
* | |||
* Description: Generates cipher text and shared | |||
* secret for given public key | |||
* | |||
* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { | |||
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
/* Don't release system RNG output */ | |||
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: crypto_kem_dec | |||
* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec | |||
* | |||
* Description: Generates shared secret for given | |||
* cipher text and private key | |||
* | |||
* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { | |||
size_t i; | |||
uint8_t fail; | |||
union { | |||
uint8_t x[KYBER_CIPHERTEXTBYTES]; | |||
__m256i __dummy; | |||
} _cmp; | |||
uint8_t *cmp = _cmp.x; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
size_t i = 0; | |||
int fail = 0; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
uint8_t cmp[KYBER_CIPHERTEXTBYTES]; | |||
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf, ct, sk); | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf.arr, ct, sk); | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ | |||
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
} | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
PQCLEAN_KYBER102490S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ | |||
/* Overwrite pre-k with z on re-encryption failure */ | |||
PQCLEAN_KYBER102490S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_KEM_H | |||
#define PQCLEAN_KYBER102490S_AVX2_KEM_H | |||
#include "params.h" | |||
/* Generates a public/private key pair for the CCA-secure KEM.
 * pk and sk are caller-allocated output buffers of CRYPTO_PUBLICKEYBYTES and
 * CRYPTO_SECRETKEYBYTES bytes. Returns 0. */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
/* Encapsulation: writes the cipher text to ct (CRYPTO_CIPHERTEXTBYTES bytes)
 * and the shared secret to ss (CRYPTO_BYTES bytes) for public key pk.
 * Returns 0. */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk); | |||
/* Decapsulation: writes the shared secret for cipher text ct and private key
 * sk to ss. Always returns 0; on failure ss holds a pseudo-random value. */
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk); | |||
#endif |
@@ -0,0 +1,220 @@ | |||
#include "cdecl.inc" | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
# butterfly: forward-NTT butterfly on four register pairs (rlN, rhN).
# For each pair: t = rh times zeta (low half via vpmullw with zl*, high half
# via vpmulhw with zh*, then the vpmulhw-by-ymm0 of the low half is
# subtracted from the high half), followed by rh <- rl - t and rl <- rl + t.
# NOTE(review): presumably zl* = zeta*q^-1, zh* = zeta and ymm0 = 16 copies
# of q (Montgomery multiplication); confirm against the qdata table.
# zl0/zh0 serve pairs 0 and 1, zl1/zh1 serve pairs 2 and 3.
# Clobbers ymm12-ymm15. Not invoked in the visible part of this file, which
# uses butterfly2 throughout.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
vpmullw %ymm\zl1,%ymm\rh2,%ymm14 | |||
vpmullw %ymm\zl1,%ymm\rh3,%ymm15 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 | |||
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 | |||
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 | |||
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 | |||
#reduce | |||
vpmulhw %ymm0,%ymm12,%ymm12 | |||
vpmulhw %ymm0,%ymm13,%ymm13 | |||
vpmulhw %ymm0,%ymm14,%ymm14 | |||
vpmulhw %ymm0,%ymm15,%ymm15 | |||
vpsubw %ymm12,%ymm\rh0,%ymm12 | |||
vpsubw %ymm13,%ymm\rh1,%ymm13 | |||
vpsubw %ymm14,%ymm\rh2,%ymm14 | |||
vpsubw %ymm15,%ymm\rh3,%ymm15 | |||
#update | |||
vpsubw %ymm12,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm13,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm14,%ymm\rl2,%ymm\rh2 | |||
vpaddw %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rh3 | |||
vpaddw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
# We break the dependency chains with the cost of slightly more additions. | |||
# But they can be run in parallel to the multiplications on execution port 5 | |||
# (multiplications only go to ports 0 and 1) | |||
#
# butterfly2 yields the same rh <- rl - t, rl <- rl + t update as butterfly,
# but the high half of each product is added to / subtracted from rl first
# and the reduced low half is applied afterwards as a correction.
# x and y name two extra scratch registers; rh0 and rh2 are also reused as
# temporaries for the high halves of pairs 1 and 3. Clobbers ymm12-ymm15.
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 | |||
vpmullw %ymm\zl1,%ymm\rh2,%ymm14 | |||
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y | |||
vpmullw %ymm\zl1,%ymm\rh3,%ymm15 | |||
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 | |||
#reduce | |||
vpmulhw %ymm0,%ymm12,%ymm12 | |||
vpmulhw %ymm0,%ymm13,%ymm13 | |||
vpmulhw %ymm0,%ymm14,%ymm14 | |||
vpmulhw %ymm0,%ymm15,%ymm15 | |||
vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 | |||
vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 | |||
vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 | |||
vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 | |||
#update | |||
vpaddw %ymm12,%ymm\rh0,%ymm\rh0 | |||
vpsubw %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpaddw %ymm13,%ymm\rh1,%ymm\rh1 | |||
vpsubw %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpaddw %ymm14,%ymm\rh2,%ymm\rh2 | |||
vpsubw %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpaddw %ymm15,%ymm\rh3,%ymm\rh3 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
.text | |||
# ntt_level0_avx: first forward-NTT level on half of the coefficients.
# Pairs the 128 bytes at rdi with the 128 bytes at rdi+256, using the two
# 32-bit zeta words at (rsi) and 4(rsi) broadcast to every lane.
# Expects ymm0 to be preloaded by the caller (ntt_avx loads it before the
# calls). File-local label: there is no .global directive for it.
ntt_level0_avx: | |||
level0: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
vpbroadcastd 4(%rsi),%ymm1 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 256(%rdi),%ymm8 | |||
vmovdqa 288(%rdi),%ymm9 | |||
vmovdqa 320(%rdi),%ymm10 | |||
vmovdqa 352(%rdi),%ymm11 | |||
# default zeta registers (ymm15, ymm1) and default scratch (ymm3, ymm2)
butterfly2 4,5,6,7,8,9,10,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
vmovdqa %ymm8,256(%rdi) | |||
vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
ret | |||
# ntt_levels1t6_avx: forward-NTT levels 1 to 6 on one 256-byte block at rdi.
# rsi: zeta constants for this block; byte offsets 0 to 391 are read.
# rdx: base of the constants table (used for the _16XV load before red16).
# Expects ymm0 preloaded by the caller. Results are stored back in place.
# The ninth butterfly2 argument names the register that is free to serve as
# scratch at that level; the shuffleN macros (presumably from shuffle.inc)
# regroup the working registers before each of levels 2 to 5.
ntt_levels1t6_avx: | |||
level1: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
vpbroadcastd 4(%rsi),%ymm1 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly2 4,5,6,7,8,9,10,11,3 | |||
level2: | |||
#zetas | |||
vmovdqu 8(%rsi),%ymm15 | |||
vmovdqu 40(%rsi),%ymm1 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly2 3,8,4,9,5,10,6,11,7 | |||
level3: | |||
#zetas | |||
vmovdqu 72(%rsi),%ymm15 | |||
vmovdqu 104(%rsi),%ymm1 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly2 7,5,3,10,8,6,4,11,9 | |||
level4: | |||
#zetas | |||
vmovdqu 136(%rsi),%ymm15 | |||
vmovdqu 168(%rsi),%ymm1 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
butterfly2 9,8,7,6,5,4,3,11,10 | |||
level5: | |||
#zetas | |||
vmovdqu 200(%rsi),%ymm15 | |||
vmovdqu 232(%rsi),%ymm1 | |||
shuffle1 9,5,10,5 | |||
shuffle1 8,4,9,4 | |||
shuffle1 7,3,8,3 | |||
shuffle1 6,11,7,11 | |||
butterfly2 10,5,9,4,8,3,7,11,6 | |||
level6: | |||
#zetas | |||
# four zeta vectors here: ymm14/ymm1 for pairs 0-1, ymm15/ymm2 for pairs 2-3
vmovdqu 264(%rsi),%ymm14 | |||
vmovdqu 328(%rsi),%ymm15 | |||
vmovdqu 296(%rsi),%ymm1 | |||
vmovdqu 360(%rsi),%ymm2 | |||
butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 | |||
# NOTE(review): ymm1 presumably is the constant register red16 expects; confirm in fq.inc
vmovdqa _16XV*2(%rdx),%ymm1 | |||
red16 10,12 | |||
red16 5,13 | |||
red16 9,14 | |||
red16 4,15 | |||
red16 8,2 | |||
red16 3,6 | |||
red16 7,12 | |||
red16 11,13 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm9,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm8,128(%rdi) | |||
vmovdqa %ymm3,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
# Forward NTT entry point; C prototype in ntt.h:
#   void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *qdata)
# Under the System V AMD64 convention rdi = r and rsi = qdata.
# Level 0 runs twice (rdi and rdi+128) so that all 512 bytes of r are
# covered; levels 1 to 6 then run separately on each 256-byte half.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
# keep the base of the constants table in rdx; rsi becomes the zeta cursor
mov %rsi,%rdx | |||
add $_ZETAS_EXP*2,%rsi | |||
call ntt_level0_avx | |||
add $128,%rdi | |||
call ntt_level0_avx | |||
sub $128,%rdi | |||
# skip the two 32-bit zeta words consumed by level 0
add $8,%rsi | |||
call ntt_levels1t6_avx | |||
# second half of r, next 392 bytes of zetas
add $256,%rdi | |||
add $392,%rsi | |||
call ntt_levels1t6_avx | |||
ret |
@@ -2,19 +2,27 @@ | |||
#define NTT_H | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(int16_t *r); | |||
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); | |||
void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); | |||
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); | |||
/* Assembly routines. Each takes a pointer to the shared constants table as
 * its last argument; r is transformed in place by ntt_avx / invntt_avx. */
void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
/* NOTE(review): this is the only declaration here without the
 * PQCLEAN_KYBER102490S_AVX2_ prefix -- confirm against the symbol actually
 * exported by the assembly before relying on it. */
void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
#endif |
@@ -1,8 +1,5 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
/* Don't change parameters below this line */ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_PARAMS_H | |||
#define PQCLEAN_KYBER102490S_AVX2_PARAMS_H | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
@@ -12,9 +9,8 @@ | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
#define KYBER_POLYBYTES 384 | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_POLYBYTES 384 | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
@@ -23,10 +19,14 @@ | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ | |||
+ KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ | |||
/* 32 bytes of additional space to save H(pk) */ | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ | |||
+ KYBER_INDCPA_PUBLICKEYBYTES \ | |||
+ 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES | |||
#endif |
@@ -1,113 +1,210 @@ | |||
#include "align.h" | |||
#include "cbd.h" | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "reduce.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: poly_compress | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_compress | |||
* | |||
* Description: Compression and subsequent serialization of a polynomial | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const poly *a: pointer to input polynomial | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t *r, poly *a) { | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { | |||
unsigned int i = 0, j = 0; | |||
uint8_t t[8]; | |||
size_t i, j, k = 0; | |||
PQCLEAN_KYBER102490S_AVX2_poly_csubq(a); | |||
for (i = 0; i < KYBER_N; i += 8) { | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = (uint8_t)(((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31); | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
r[k] = (uint8_t)( t[0] | (t[1] << 5)); | |||
r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); | |||
r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); | |||
r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); | |||
r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3)); | |||
k += 5; | |||
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); | |||
r[2] = (t[3] >> 1) | (t[4] << 4); | |||
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); | |||
r[4] = (t[6] >> 2) | (t[7] << 3); | |||
r += 5; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_decompress | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_decompress | |||
* | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of poly_compress | |||
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t *a) { | |||
size_t i; | |||
for (i = 0; i < KYBER_N; i += 8) { | |||
r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, | |||
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { | |||
unsigned int i = 0; | |||
unsigned int j = 0; | |||
uint8_t t[8]; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t[0] = (a[0] >> 0); | |||
t[1] = (a[0] >> 5) | (a[1] << 3); | |||
t[2] = (a[1] >> 2); | |||
t[3] = (a[1] >> 7) | (a[2] << 1); | |||
t[4] = (a[2] >> 4) | (a[3] << 4); | |||
t[5] = (a[3] >> 1); | |||
t[6] = (a[3] >> 6) | (a[4] << 2); | |||
t[7] = (a[4] >> 3); | |||
a += 5; | |||
for (j = 0; j < 8; j++) { | |||
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_tobytes | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tobytes | |||
* | |||
* Description: Serialization of a polynomial | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const poly *a: pointer to input polynomial | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t *r, poly *a) { | |||
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs); | |||
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: poly_frombytes | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_frombytes | |||
* | |||
* Description: De-serialization of a polynomial; | |||
* inverse of poly_tobytes | |||
* inverse of PQCLEAN_KYBER102490S_AVX2_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { | |||
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_frommsg | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, | |||
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3; | |||
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); | |||
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); | |||
#define FROMMSG64(i) \ | |||
g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ | |||
g3 = _mm256_sllv_epi32(g3,shift); \ | |||
g3 = _mm256_shuffle_epi8(g3,idx); \ | |||
g0 = _mm256_slli_epi16(g3,12); \ | |||
g1 = _mm256_slli_epi16(g3,8); \ | |||
g2 = _mm256_slli_epi16(g3,4); \ | |||
g0 = _mm256_srai_epi16(g0,15); \ | |||
g1 = _mm256_srai_epi16(g1,15); \ | |||
g2 = _mm256_srai_epi16(g2,15); \ | |||
g3 = _mm256_srai_epi16(g3,15); \ | |||
g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ | |||
g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ | |||
g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ | |||
g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ | |||
h0 = _mm256_unpacklo_epi64(g0,g1); \ | |||
h2 = _mm256_unpackhi_epi64(g0,g1); \ | |||
h1 = _mm256_unpacklo_epi64(g2,g3); \ | |||
h3 = _mm256_unpackhi_epi64(g2,g3); \ | |||
g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ | |||
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ | |||
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ | |||
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) | |||
f = _mm256_load_si256((__m256i *)msg); | |||
FROMMSG64(0); | |||
FROMMSG64(1); | |||
FROMMSG64(2); | |||
FROMMSG64(3); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t *a) { | |||
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a); | |||
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); | |||
/* Converts polynomial a to a message of KYBER_N / 8 bytes, producing 4
 * message bytes (32 bits) per loop iteration from 32 coefficients.
 * a->coeffs is read with aligned 32-byte loads. */
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { | |||
unsigned int i = 0; | |||
uint32_t small = 0; | |||
__m256i f0, f1, g0, g1; | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); | |||
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); | |||
/* d = hqs - coeff; g = sign mask of d; d ^ g folds negative d onto the
 * non-negative range (-d - 1); finally hhqs - (d ^ g), whose sign bit
 * is what the movemask below collects. */
f0 = _mm256_sub_epi16(hqs, f0); | |||
f1 = _mm256_sub_epi16(hqs, f1); | |||
g0 = _mm256_srai_epi16(f0, 15); | |||
g1 = _mm256_srai_epi16(f1, 15); | |||
f0 = _mm256_xor_si256(f0, g0); | |||
f1 = _mm256_xor_si256(f1, g1); | |||
f0 = _mm256_sub_epi16(hhqs, f0); | |||
f1 = _mm256_sub_epi16(hhqs, f1); | |||
/* Narrow to bytes with signed saturation, then take one sign bit per
 * byte and invert it to get the message bits. */
f0 = _mm256_packs_epi16(f0, f1); | |||
small = _mm256_movemask_epi8(f0); | |||
small = ~small; | |||
/* packs_epi16 works per 128-bit lane (see the Intel intrinsics guide),
 * so mask bits 8-15 come from f1 and bits 16-23 from f0; the two middle
 * bytes are therefore written swapped to restore coefficient order. */
msg[4 * i + 0] = small; | |||
msg[4 * i + 1] = small >> 16; | |||
msg[4 * i + 2] = small >> 8; | |||
msg[4 * i + 3] = small >> 24; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_getnoise | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA * KYBER_N / 4]; | |||
prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(r, buf); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; | |||
prf(buf.arr, sizeof(buf.arr), seed, nonce); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(r, buf.arr); | |||
} | |||
/************************************************* | |||
* Name: poly_ntt | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_ntt | |||
* | |||
* Description: Computes negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
@@ -116,73 +213,78 @@ void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8 | |||
* Arguments: - poly *r: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_exp); | |||
PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER102490S_AVX2_zetas_exp); | |||
PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_exp + 4); | |||
PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER102490S_AVX2_zetas_exp + 200); | |||
PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: poly_invntt | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont | |||
* | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) | |||
* of a polynomial in place; | |||
* inputs assumed to be in bitreversed order, output in normal order | |||
* | |||
* Arguments: - poly *r: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_invntt(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp); | |||
PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp + 196); | |||
PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp + 392); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
// FIXME | |||
void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs); | |||
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs + 128); | |||
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
//XXX Add comment | |||
void PQCLEAN_KYBER102490S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, | |||
a->coeffs, | |||
b->coeffs, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 152); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 64, | |||
a->coeffs + 64, | |||
b->coeffs + 64, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 184); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 128, | |||
a->coeffs + 128, | |||
b->coeffs + 128, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 348); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 192, | |||
a->coeffs + 192, | |||
b->coeffs + 192, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 380); | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
// FIXME | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommont(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_frommont_avx(r->coeffs); | |||
PQCLEAN_KYBER102490S_AVX2_frommont_avx(r->coeffs + 128); | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomont | |||
* | |||
* Description: Inplace conversion of all coefficients of a polynomial | |||
* from normal domain to Montgomery domain | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_reduce | |||
* | |||
* Description: Applies Barrett reduction to all coefficients of a polynomial | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs); | |||
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs + 128); | |||
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs); | |||
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs + 128); | |||
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: poly_add | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* | |||
@@ -191,18 +293,19 @@ void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) { | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { | |||
__m256i vec0, vec1; | |||
for (size_t i = 0; i < KYBER_N; i += 16) { | |||
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
vec0 = _mm256_add_epi16(vec0, vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0); | |||
unsigned int i = 0; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
f0 = _mm256_add_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_sub | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* | |||
@@ -211,127 +314,13 @@ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { | |||
__m256i vec0, vec1; | |||
for (size_t i = 0; i < KYBER_N; i += 16) { | |||
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
vec0 = _mm256_sub_epi16(vec0, vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_frommsg | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { | |||
__m128i tmp; | |||
__m256i a[4], d0, d1, d2, d3; | |||
const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); | |||
const __m256i zeros = _mm256_setzero_si256(); | |||
const __m256i ones = _mm256_set1_epi32(1); | |||
const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); | |||
tmp = _mm_loadu_si128((__m128i *)msg); | |||
for (size_t i = 0; i < 4; i++) { | |||
a[i] = _mm256_broadcastd_epi32(tmp); | |||
tmp = _mm_srli_si128(tmp, 4); | |||
} | |||
for (size_t i = 0; i < 4; i++) { | |||
d0 = _mm256_srlv_epi32(a[i], shift); | |||
d1 = _mm256_srli_epi32(d0, 8); | |||
d2 = _mm256_srli_epi32(d0, 16); | |||
d3 = _mm256_srli_epi32(d0, 24); | |||
d0 = _mm256_and_si256(d0, ones); | |||
d1 = _mm256_and_si256(d1, ones); | |||
d2 = _mm256_and_si256(d2, ones); | |||
d3 = _mm256_and_si256(d3, ones); | |||
d0 = _mm256_sub_epi32(zeros, d0); | |||
d1 = _mm256_sub_epi32(zeros, d1); | |||
d2 = _mm256_sub_epi32(zeros, d2); | |||
d3 = _mm256_sub_epi32(zeros, d3); | |||
d0 = _mm256_and_si256(hqs, d0); | |||
d1 = _mm256_and_si256(hqs, d1); | |||
d2 = _mm256_and_si256(hqs, d2); | |||
d3 = _mm256_and_si256(hqs, d3); | |||
d0 = _mm256_packus_epi32(d0, d1); | |||
d2 = _mm256_packus_epi32(d2, d3); | |||
d0 = _mm256_permute4x64_epi64(d0, 0xD8); | |||
d2 = _mm256_permute4x64_epi64(d2, 0xD8); | |||
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); | |||
} | |||
tmp = _mm_loadu_si128((__m128i *)&msg[16]); | |||
for (size_t i = 0; i < 4; i++) { | |||
a[i] = _mm256_broadcastd_epi32(tmp); | |||
tmp = _mm_srli_si128(tmp, 4); | |||
} | |||
for (size_t i = 0; i < 4; i++) { | |||
d0 = _mm256_srlv_epi32(a[i], shift); | |||
d1 = _mm256_srli_epi32(d0, 8); | |||
d2 = _mm256_srli_epi32(d0, 16); | |||
d3 = _mm256_srli_epi32(d0, 24); | |||
d0 = _mm256_and_si256(d0, ones); | |||
d1 = _mm256_and_si256(d1, ones); | |||
d2 = _mm256_and_si256(d2, ones); | |||
d3 = _mm256_and_si256(d3, ones); | |||
d0 = _mm256_sub_epi32(zeros, d0); | |||
d1 = _mm256_sub_epi32(zeros, d1); | |||
d2 = _mm256_sub_epi32(zeros, d2); | |||
d3 = _mm256_sub_epi32(zeros, d3); | |||
d0 = _mm256_and_si256(hqs, d0); | |||
d1 = _mm256_and_si256(hqs, d1); | |||
d2 = _mm256_and_si256(hqs, d2); | |||
d3 = _mm256_and_si256(hqs, d3); | |||
d0 = _mm256_packus_epi32(d0, d1); | |||
d2 = _mm256_packus_epi32(d2, d3); | |||
d0 = _mm256_permute4x64_epi64(d0, 0xD8); | |||
d2 = _mm256_permute4x64_epi64(d2, 0xD8); | |||
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { | |||
uint32_t small; | |||
__m256i vec, tmp; | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); | |||
for (size_t i = 0; i < KYBER_N / 16; i++) { | |||
vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); | |||
vec = _mm256_sub_epi16(hqs, vec); | |||
tmp = _mm256_srai_epi16(vec, 15); | |||
vec = _mm256_xor_si256(vec, tmp); | |||
vec = _mm256_sub_epi16(hhqs, vec); | |||
small = (uint32_t)_mm256_movemask_epi8(vec); | |||
small = _pext_u32(small, 0xAAAAAAAA); | |||
small = ~small; | |||
msg[2 * i + 0] = (uint8_t)small; | |||
msg[2 * i + 1] = (uint8_t)(small >> 8); | |||
unsigned int i = 0; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
f0 = _mm256_sub_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
} | |||
} |
@@ -1,8 +1,7 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_POLY_H | |||
#define PQCLEAN_KYBER102490S_AVX2_POLY_H | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
@@ -11,32 +10,47 @@ | |||
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1] | |||
*/ | |||
typedef union { | |||
__m256i dummy; | |||
int16_t coeffs[KYBER_N]; | |||
__m256i _dummy; | |||
} poly; | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t *r, poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t *r, poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_invntt(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommont(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); | |||
#endif |
@@ -1,167 +1,198 @@ | |||
#include "params.h" | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: polyvec_compress | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_compress | |||
* | |||
* Description: Compress and serialize vector of polynomials | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], | |||
polyvec *restrict a) { | |||
unsigned int i = 0, j = 0, k = 0; | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(a); | |||
uint16_t t[8]; | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_N / 8; j++) { | |||
for (size_t k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
{ | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
} | |||
} | |||
r[11 * j + 0] = (uint8_t)t[0]; | |||
r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); | |||
r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); | |||
r[11 * j + 3] = (uint8_t)((t[2] >> 2)); | |||
r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); | |||
r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); | |||
r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7)); | |||
r[11 * j + 7] = (uint8_t)((t[5] >> 1)); | |||
r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); | |||
r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); | |||
r[11 * j + 10] = (uint8_t)((t[7] >> 3)); | |||
r[ 0] = (t[0] >> 0); | |||
r[ 1] = (t[0] >> 8) | (t[1] << 3); | |||
r[ 2] = (t[1] >> 5) | (t[2] << 6); | |||
r[ 3] = (t[2] >> 2); | |||
r[ 4] = (t[2] >> 10) | (t[3] << 1); | |||
r[ 5] = (t[3] >> 7) | (t[4] << 4); | |||
r[ 6] = (t[4] >> 4) | (t[5] << 7); | |||
r[ 7] = (t[5] >> 1); | |||
r[ 8] = (t[5] >> 9) | (t[6] << 2); | |||
r[ 9] = (t[6] >> 6) | (t[7] << 5); | |||
r[10] = (t[7] >> 3); | |||
r += 11; | |||
} | |||
r += 352; | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_decompress | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_decompress | |||
* | |||
* Description: De-serialize and decompress vector of polynomials; | |||
* approximate inverse of polyvec_compress | |||
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_compress | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - uint8_t *a: pointer to input byte array | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_N / 8; j++) { | |||
r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *restrict r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
unsigned int i = 0, j = 0, k = 0; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); | |||
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); | |||
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); | |||
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); | |||
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); | |||
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); | |||
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); | |||
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); | |||
a += 11; | |||
for (k = 0; k < 8; k++) { | |||
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; | |||
} | |||
} | |||
a += 352; | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_tobytes | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes | |||
* | |||
* Description: Serialize vector of polynomials | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* (needs space for KYBER_POLYVECBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_frombytes | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes | |||
* | |||
* Description: De-serialize vector of polynomials; | |||
* inverse of polyvec_tobytes | |||
* inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_ntt | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_ntt | |||
* | |||
* Description: Apply forward NTT to all elements of a vector of polynomials | |||
* | |||
* Arguments: - polyvec *r: pointer to in/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_ntt(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_invntt | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont | |||
* | |||
* Description: Apply inverse NTT to all elements of a vector of polynomials | |||
* and multiply by Montgomery factor 2^16 | |||
* | |||
* Arguments: - polyvec *r: pointer to in/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt(&r->vec[i]); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_pointwise_acc | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b and accumulate into r | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, | |||
a->vec->coeffs, | |||
b->vec->coeffs, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 152); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 64, | |||
a->vec->coeffs + 64, | |||
b->vec->coeffs + 64, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 184); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 128, | |||
a->vec->coeffs + 128, | |||
b->vec->coeffs + 128, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 348); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 192, | |||
a->vec->coeffs + 192, | |||
b->vec->coeffs + 192, | |||
PQCLEAN_KYBER102490S_AVX2_zetas_exp + 380); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_reduce(&r->vec[i]); | |||
} | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_add | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_add | |||
* | |||
* Description: Add vectors of polynomials | |||
* | |||
@@ -170,7 +201,8 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) { | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); | |||
} | |||
} |
@@ -1,29 +1,41 @@ | |||
#ifndef POLYVEC_H | |||
#define POLYVEC_H | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_POLYVEC_H | |||
#define PQCLEAN_KYBER102490S_AVX2_POLYVEC_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t *r, polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
#endif |
@@ -3,8 +3,14 @@ | |||
#include <stdint.h> | |||
int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r); | |||
int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r); | |||
int16_t PQCLEAN_KYBER102490S_AVX2_frommont_avx(int16_t *r); | |||
#include "consts.h" | |||
#include "params.h" | |||
int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER102490S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
#endif |
@@ -1,386 +1,360 @@ | |||
#include "align.h" | |||
#include "consts.h" | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
static const uint8_t idx[256][8] = { | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 0, 0, 0, 0, 0, 0}, | |||
{ 4, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 0, 0, 0, 0, 0}, | |||
{ 6, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 0, 0, 0, 0, 0}, | |||
{ 4, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 0, 0, 0, 0}, | |||
{ 8, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 0, 0, 0, 0, 0}, | |||
{ 4, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 0, 0, 0, 0}, | |||
{ 6, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 0, 0, 0, 0}, | |||
{ 4, 6, 8, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 0, 0, 0}, | |||
{10, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 0, 0, 0, 0, 0}, | |||
{ 4, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 0, 0, 0, 0}, | |||
{ 6, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 0, 0, 0, 0}, | |||
{ 4, 6, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 10, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 0, 0, 0}, | |||
{ 8, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 0, 0, 0, 0}, | |||
{ 4, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 10, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 0, 0, 0}, | |||
{ 6, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 10, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 0, 0, 0}, | |||
{ 4, 6, 8, 10, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 10, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 0, 0}, | |||
{12, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 12, 0, 0, 0, 0, 0}, | |||
{ 4, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 12, 0, 0, 0, 0}, | |||
{ 6, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 12, 0, 0, 0, 0}, | |||
{ 4, 6, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 12, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 12, 0, 0, 0}, | |||
{ 8, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 12, 0, 0, 0, 0}, | |||
{ 4, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 12, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 12, 0, 0, 0}, | |||
{ 6, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 12, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 12, 0, 0, 0}, | |||
{ 4, 6, 8, 12, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 12, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 12, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 12, 0, 0}, | |||
{10, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 12, 0, 0, 0, 0}, | |||
{ 4, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 12, 0, 0, 0, 0}, | |||
{ 2, 4, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 12, 0, 0, 0}, | |||
{ 6, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 12, 0, 0, 0, 0}, | |||
{ 2, 6, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 12, 0, 0, 0}, | |||
{ 4, 6, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 12, 0, 0, 0}, | |||
{ 2, 4, 6, 10, 12, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 12, 0, 0}, | |||
{ 8, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 2, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 12, 0, 0, 0}, | |||
{ 4, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 12, 0, 0, 0}, | |||
{ 2, 4, 8, 10, 12, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 12, 0, 0}, | |||
{ 6, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 12, 0, 0, 0}, | |||
{ 2, 6, 8, 10, 12, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 12, 0, 0}, | |||
{ 4, 6, 8, 10, 12, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 12, 0, 0}, | |||
{ 2, 4, 6, 8, 10, 12, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 0}, | |||
{14, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 14, 0, 0, 0, 0, 0}, | |||
{ 4, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 14, 0, 0, 0, 0}, | |||
{ 6, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 14, 0, 0, 0, 0}, | |||
{ 4, 6, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 14, 0, 0, 0}, | |||
{ 8, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 14, 0, 0, 0, 0}, | |||
{ 4, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 14, 0, 0, 0}, | |||
{ 6, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 14, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 14, 0, 0, 0}, | |||
{ 4, 6, 8, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 14, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 14, 0, 0}, | |||
{10, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 14, 0, 0, 0, 0}, | |||
{ 4, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 14, 0, 0, 0}, | |||
{ 6, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 14, 0, 0, 0, 0}, | |||
{ 2, 6, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 14, 0, 0, 0}, | |||
{ 4, 6, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 14, 0, 0, 0}, | |||
{ 2, 4, 6, 10, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 14, 0, 0}, | |||
{ 8, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 2, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 14, 0, 0, 0}, | |||
{ 4, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 14, 0, 0, 0}, | |||
{ 2, 4, 8, 10, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 14, 0, 0}, | |||
{ 6, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 14, 0, 0, 0}, | |||
{ 2, 6, 8, 10, 14, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 14, 0, 0}, | |||
{ 4, 6, 8, 10, 14, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 14, 0, 0}, | |||
{ 2, 4, 6, 8, 10, 14, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 14, 0}, | |||
{12, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 12, 14, 0, 0, 0, 0}, | |||
{ 4, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 12, 14, 0, 0, 0}, | |||
{ 6, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 6, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 12, 14, 0, 0, 0}, | |||
{ 4, 6, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 12, 14, 0, 0, 0}, | |||
{ 2, 4, 6, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 12, 14, 0, 0}, | |||
{ 8, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 12, 14, 0, 0, 0}, | |||
{ 4, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 12, 14, 0, 0, 0}, | |||
{ 2, 4, 8, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 12, 14, 0, 0}, | |||
{ 6, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 12, 14, 0, 0, 0}, | |||
{ 2, 6, 8, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 12, 14, 0, 0}, | |||
{ 4, 6, 8, 12, 14, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 12, 14, 0, 0}, | |||
{ 2, 4, 6, 8, 12, 14, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 12, 14, 0}, | |||
{10, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 12, 14, 0, 0, 0}, | |||
{ 4, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 12, 14, 0, 0, 0}, | |||
{ 2, 4, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 12, 14, 0, 0}, | |||
{ 6, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 12, 14, 0, 0, 0}, | |||
{ 2, 6, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 12, 14, 0, 0}, | |||
{ 4, 6, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 12, 14, 0, 0}, | |||
{ 2, 4, 6, 10, 12, 14, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 12, 14, 0}, | |||
{ 8, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 2, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 12, 14, 0, 0}, | |||
{ 4, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 12, 14, 0, 0}, | |||
{ 2, 4, 8, 10, 12, 14, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 12, 14, 0}, | |||
{ 6, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 12, 14, 0, 0}, | |||
{ 2, 6, 8, 10, 12, 14, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 12, 14, 0}, | |||
{ 4, 6, 8, 10, 12, 14, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 12, 14, 0}, | |||
{ 2, 4, 6, 8, 10, 12, 14, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 14} | |||
static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { | |||
{-1, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 2, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, -1, -1, -1, -1, -1, -1}, | |||
{ 4, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, -1, -1, -1, -1, -1}, | |||
{ 6, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, -1, -1, -1, -1, -1}, | |||
{ 4, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, -1, -1, -1, -1}, | |||
{ 8, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, -1, -1, -1, -1, -1}, | |||
{ 4, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, -1, -1, -1, -1}, | |||
{ 6, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, -1, -1, -1, -1}, | |||
{ 4, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, -1, -1, -1}, | |||
{10, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, -1, -1, -1, -1, -1}, | |||
{ 4, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, -1, -1, -1, -1}, | |||
{ 6, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, -1, -1, -1, -1}, | |||
{ 4, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, -1, -1, -1}, | |||
{ 8, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, -1, -1, -1, -1}, | |||
{ 4, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, -1, -1, -1}, | |||
{ 6, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, -1, -1, -1}, | |||
{ 4, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, -1, -1}, | |||
{12, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, -1, -1, -1, -1, -1}, | |||
{ 4, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, -1, -1, -1, -1}, | |||
{ 6, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, -1, -1, -1, -1}, | |||
{ 4, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, -1, -1, -1}, | |||
{ 8, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, -1, -1, -1, -1}, | |||
{ 4, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, -1, -1, -1}, | |||
{ 6, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, -1, -1, -1}, | |||
{ 4, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, -1, -1}, | |||
{10, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, -1, -1, -1, -1}, | |||
{ 4, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, -1, -1, -1}, | |||
{ 6, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, -1, -1, -1}, | |||
{ 4, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, -1, -1}, | |||
{ 8, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, -1, -1, -1}, | |||
{ 4, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, -1, -1}, | |||
{ 6, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, -1, -1}, | |||
{ 4, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, -1}, | |||
{14, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 14, -1, -1, -1, -1, -1}, | |||
{ 4, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 14, -1, -1, -1, -1}, | |||
{ 6, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 14, -1, -1, -1, -1}, | |||
{ 4, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 14, -1, -1, -1}, | |||
{ 8, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 14, -1, -1, -1, -1}, | |||
{ 4, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 14, -1, -1, -1}, | |||
{ 6, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 14, -1, -1, -1}, | |||
{ 4, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 14, -1, -1}, | |||
{10, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 14, -1, -1, -1, -1}, | |||
{ 4, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 14, -1, -1, -1}, | |||
{ 6, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 14, -1, -1, -1}, | |||
{ 4, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 14, -1, -1}, | |||
{ 8, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 14, -1, -1, -1}, | |||
{ 4, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 14, -1, -1}, | |||
{ 6, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 14, -1, -1}, | |||
{ 4, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 14, -1}, | |||
{12, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, 14, -1, -1, -1, -1}, | |||
{ 4, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, 14, -1, -1, -1}, | |||
{ 6, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, 14, -1, -1, -1}, | |||
{ 4, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, 14, -1, -1}, | |||
{ 8, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, 14, -1, -1, -1}, | |||
{ 4, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, 14, -1, -1}, | |||
{ 6, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, 14, -1, -1}, | |||
{ 4, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, 14, -1}, | |||
{10, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, 14, -1, -1, -1}, | |||
{ 4, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, 14, -1, -1}, | |||
{ 6, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, 14, -1, -1}, | |||
{ 4, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, 14, -1}, | |||
{ 8, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, 14, -1, -1}, | |||
{ 4, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, 14, -1}, | |||
{ 6, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, 14, -1}, | |||
{ 4, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 2, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 14} | |||
} | |||
}; | |||
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) | |||
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) | |||
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) | |||
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) | |||
size_t PQCLEAN_KYBER102490S_AVX2_rej_uniform(int16_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen) { | |||
size_t ctr, pos; | |||
uint16_t val; | |||
uint32_t good0, good1, good2; | |||
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison | |||
#define REJ_UNIFORM_BUFLEN 576 | |||
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, | |||
const uint8_t *restrict buf) { | |||
unsigned int ctr = 0, pos = 0; | |||
uint16_t val = 0; | |||
uint32_t good = 0; | |||
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); | |||
const __m256i ones = _mm256_set1_epi8(1); | |||
const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_16xq.as_vec); | |||
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_16xv.as_vec); | |||
__m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; | |||
__m128i d, tmp, pilo, pihi; | |||
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XQ]); | |||
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XV]); | |||
__m256i f0, f1, g0, g1, g2, g3; | |||
__m128i f, t, pilo, pihi; | |||
ctr = pos = 0; | |||
while (ctr + 48 <= len && pos + 96 <= buflen) { | |||
d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); | |||
d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); | |||
d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); | |||
ctr = 0; | |||
for (pos = 0; pos < 2 * KYBER_N; pos += 64) { | |||
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); | |||
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); | |||
tmp0 = _mm256_cmpge_epu16(bound, d0); | |||
tmp1 = _mm256_cmpge_epu16(bound, d1); | |||
tmp2 = _mm256_cmpge_epu16(bound, d2); | |||
good0 = (uint32_t)_mm256_movemask_epi8(tmp0); | |||
good1 = (uint32_t)_mm256_movemask_epi8(tmp1); | |||
good2 = (uint32_t)_mm256_movemask_epi8(tmp2); | |||
good0 = _pext_u32(good0, 0x55555555); | |||
good1 = _pext_u32(good1, 0x55555555); | |||
good2 = _pext_u32(good2, 0x55555555); | |||
g0 = _mm256_cmpge_epu16(bound, f0); | |||
g1 = _mm256_cmpge_epu16(bound, f1); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); | |||
pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); | |||
pi0 = _mm256_castsi128_si256(pilo); | |||
pi0 = _mm256_inserti128_si256(pi0, pihi, 1); | |||
g0 = _mm256_packs_epi16(g0, g1); | |||
good = _mm256_movemask_epi8(g0); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); | |||
pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); | |||
pi1 = _mm256_castsi128_si256(pilo); | |||
pi1 = _mm256_inserti128_si256(pi1, pihi, 1); | |||
g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); | |||
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); | |||
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); | |||
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); | |||
pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); | |||
pi2 = _mm256_castsi128_si256(pilo); | |||
pi2 = _mm256_inserti128_si256(pi2, pihi, 1); | |||
//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); | |||
//g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); | |||
tmp0 = _mm256_add_epi8(pi0, ones); | |||
tmp1 = _mm256_add_epi8(pi1, ones); | |||
tmp2 = _mm256_add_epi8(pi2, ones); | |||
pi0 = _mm256_unpacklo_epi8(pi0, tmp0); | |||
pi1 = _mm256_unpacklo_epi8(pi1, tmp1); | |||
pi2 = _mm256_unpacklo_epi8(pi2, tmp2); | |||
/* Barrett reduction of (still unsigned) values */ | |||
g2 = _mm256_mulhi_epu16(f0, v); | |||
g3 = _mm256_mulhi_epu16(f1, v); | |||
g2 = _mm256_srli_epi16(g2, 10); | |||
g3 = _mm256_srli_epi16(g3, 10); | |||
g2 = _mm256_mullo_epi16(g2, kyberq); | |||
g3 = _mm256_mullo_epi16(g3, kyberq); | |||
f0 = _mm256_sub_epi16(f0, g2); | |||
f1 = _mm256_sub_epi16(f1, g3); | |||
d0 = _mm256_shuffle_epi8(d0, pi0); | |||
d1 = _mm256_shuffle_epi8(d1, pi1); | |||
d2 = _mm256_shuffle_epi8(d2, pi2); | |||
g2 = _mm256_add_epi8(g0, ones); | |||
g3 = _mm256_add_epi8(g1, ones); | |||
g0 = _mm256_unpacklo_epi8(g0, g2); | |||
g1 = _mm256_unpacklo_epi8(g1, g3); | |||
/* Barrett reduction of (still unsigned) d values */ | |||
tmp0 = _mm256_mulhi_epu16(d0, v); | |||
tmp1 = _mm256_mulhi_epu16(d1, v); | |||
tmp2 = _mm256_mulhi_epu16(d2, v); | |||
tmp0 = _mm256_srli_epi16(tmp0, 10); | |||
tmp1 = _mm256_srli_epi16(tmp1, 10); | |||
tmp2 = _mm256_srli_epi16(tmp2, 10); | |||
tmp0 = _mm256_mullo_epi16(tmp0, kyberq); | |||
tmp1 = _mm256_mullo_epi16(tmp1, kyberq); | |||
tmp2 = _mm256_mullo_epi16(tmp2, kyberq); | |||
d0 = _mm256_sub_epi16(d0, tmp0); | |||
d1 = _mm256_sub_epi16(d1, tmp1); | |||
d2 = _mm256_sub_epi16(d2, tmp2); | |||
f0 = _mm256_shuffle_epi8(f0, g0); | |||
f1 = _mm256_shuffle_epi8(f1, g1); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); | |||
ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); | |||
ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); | |||
ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); | |||
ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); | |||
ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); | |||
ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); | |||
pos += 96; | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); | |||
ctr += _mm_popcnt_u32((good >> 0) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); | |||
ctr += _mm_popcnt_u32((good >> 16) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); | |||
ctr += _mm_popcnt_u32((good >> 8) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); | |||
ctr += _mm_popcnt_u32((good >> 24) & 0xFF); | |||
} | |||
while (ctr + 8 <= len && pos + 16 <= buflen) { | |||
d = _mm_loadu_si128((__m128i *)&buf[pos]); | |||
tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); | |||
good0 = (uint32_t)_mm_movemask_epi8(tmp); | |||
good0 = _pext_u32(good0, 0x55555555); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); | |||
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { | |||
f = _mm_load_si128((__m128i *)&buf[pos]); | |||
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); | |||
good = _mm_movemask_epi8(t); | |||
good = _pext_u32(good, 0x5555); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); | |||
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); | |||
pilo = _mm_unpacklo_epi8(pilo, pihi); | |||
d = _mm_shuffle_epi8(d, pilo); | |||
/* Barrett reduction */ | |||
tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); | |||
tmp = _mm_srli_epi16(tmp, 10); | |||
tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); | |||
d = _mm_sub_epi16(d, tmp); | |||
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); | |||
t = _mm_srli_epi16(t, 10); | |||
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); | |||
f = _mm_sub_epi16(f, t); | |||
_mm_storeu_si128((__m128i *)&r[ctr], d); | |||
ctr += (unsigned int)_mm_popcnt_u32(good0); | |||
f = _mm_shuffle_epi8(f, pilo); | |||
_mm_storeu_si128((__m128i *)&r[ctr], f); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 16; | |||
} | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); | |||
while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
if (val < 19 * KYBER_Q) { | |||
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; | |||
r[ctr++] = (int16_t)val; | |||
r[ctr++] = val; | |||
} | |||
} | |||
@@ -1,12 +1,11 @@ | |||
#ifndef REJSAMPLE_H | |||
#define REJSAMPLE_H | |||
#include <stddef.h> | |||
#include "params.h" | |||
#include <stdint.h> | |||
size_t PQCLEAN_KYBER102490S_AVX2_rej_uniform(int16_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen); | |||
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, | |||
const unsigned char *buf); | |||
#endif |
@@ -0,0 +1,255 @@ | |||
#include "cdecl.inc" | |||
.include "fq.inc" | |||
.include "shuffle.inc" | |||
/* | |||
nttpack_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
shuffle1 8,9,6,9 | |||
shuffle1 10,11,8,11 | |||
shuffle2 3,4,10,4 | |||
shuffle2 6,8,3,8 | |||
shuffle2 5,7,6,7 | |||
shuffle2 9,11,5,11 | |||
shuffle4 10,3,9,3 | |||
shuffle4 6,5,10,5 | |||
shuffle4 4,8,6,8 | |||
shuffle4 7,11,4,11 | |||
shuffle8 9,10,7,10 | |||
shuffle8 6,4,9,4 | |||
shuffle8 3,5,6,5 | |||
shuffle8 8,11,3,11 | |||
#store | |||
vmovdqa %ymm7,(%rdi) | |||
vmovdqa %ymm9,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm3,96(%rdi) | |||
vmovdqa %ymm10,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm5,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
*/ | |||
.text | |||
/*
 * nttunpack128_avx -- file-local helper (no .global directive).
 *
 * Reorders one 256-byte block in place: eight 32-byte vectors are loaded
 * from (%rdi), passed through the shuffle8, shuffle4, shuffle2 and shuffle1
 * macro stages (macros come from shuffle.inc), and the eight results are
 * written back to the same eight slots.
 *
 * In:       %rdi = address of the block. All accesses use vmovdqa, so the
 *           address must be 32-byte aligned.
 * Clobbers: %ymm3-%ymm11 explicitly; the shuffle2 macro additionally
 *           overwrites %ymm12 and %ymm13 as scratch.
 * %rdi itself is not modified.
 */
nttunpack128_avx: | |||
#load | |||
/* Eight aligned 32-byte loads covering bytes 0..255 of the block. */
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
/*
 * Four shuffle stages, coarsest first (8, 4, 2, 1).
 * NOTE(review): macro arguments look like in0,in1,out0,out1 judging by the
 * shuffle2 definition; confirm for the other macros in shuffle.inc.
 */
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
shuffle1 9,5,10,5 | |||
shuffle1 8,4,9,4 | |||
shuffle1 7,3,8,3 | |||
shuffle1 6,11,7,11 | |||
#store | |||
/* Results live in ymm10,5,9,4,8,3,7,11; write them back in that order. */
vmovdqa %ymm10,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm9,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm8,128(%rdi) | |||
vmovdqa %ymm3,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
/*
 * PQCLEAN_KYBER102490S_AVX2_nttunpack_avx -- exported entry point
 * (symbol name is wrapped in cdecl(), defined in cdecl.inc).
 *
 * Runs nttunpack128_avx on two consecutive 256-byte blocks starting at
 * %rdi, i.e. 512 bytes are reordered in place.
 *
 * In:  %rdi = address of the data; must be 32-byte aligned because the
 *      helper uses aligned vector loads/stores.
 * On return %rdi has been advanced by 256.
 */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_nttunpack_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_nttunpack_avx): | |||
call nttunpack128_avx | |||
/* Step to the second 256-byte half. */
add $256,%rdi | |||
call nttunpack128_avx | |||
ret | |||
/*
 * ntttobytes128_avx -- file-local helper (no .global directive).
 *
 * Reads 256 bytes (eight 32-byte vectors of 16-bit lanes) from (%rsi),
 * applies the csubq macro to each vector, bit-packs the eight vectors into
 * six, shuffles them, and writes 192 bytes to (%rdi).
 *
 * In:       %rsi = source; read with vmovdqa, so it must be 32-byte aligned.
 *           %rdi = destination; written with vmovdqu, so no alignment is
 *                  required.
 *           %ymm0 = constant set up by the caller (the exported wrapper
 *                  loads the _16XQ vector there).
 * Clobbers: %ymm1, %ymm3-%ymm15 (ymm12/ymm13 are also scratch for shuffle2).
 * %rsi and %rdi are not modified.
 */
ntttobytes128_avx: | |||
#load | |||
vmovdqa (%rsi),%ymm5 | |||
vmovdqa 32(%rsi),%ymm6 | |||
vmovdqa 64(%rsi),%ymm7 | |||
vmovdqa 96(%rsi),%ymm8 | |||
vmovdqa 128(%rsi),%ymm9 | |||
vmovdqa 160(%rsi),%ymm10 | |||
vmovdqa 192(%rsi),%ymm11 | |||
vmovdqa 224(%rsi),%ymm12 | |||
#csubq | |||
/*
 * csubq is defined in fq.inc; the second argument appears to be a scratch
 * register. NOTE(review): presumably a conditional subtraction of q using
 * the constant in %ymm0 -- confirm against fq.inc.
 */
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,1 | |||
csubq 9,13 | |||
csubq 10,14 | |||
csubq 11,15 | |||
csubq 12,1 | |||
#bitpack | |||
/*
 * Per 16-bit lane, four input vectors a,b,c,d become three outputs:
 *   a | (b << 12),  (b >> 4) | (c << 8),  (c >> 8) | (d << 4)
 * First for (ymm5,6,7,8) -> (ymm4,5,6), then for (ymm9,10,11,12) ->
 * (ymm7,8,9). This packs 12 bits per lane contiguously, assuming every lane
 * value fits in 12 bits at this point (TODO confirm via csubq's output range).
 */
vpsllw $12,%ymm6,%ymm4 | |||
vpor %ymm4,%ymm5,%ymm4 | |||
vpsrlw $4,%ymm6,%ymm5 | |||
vpsllw $8,%ymm7,%ymm6 | |||
vpor %ymm5,%ymm6,%ymm5 | |||
vpsrlw $8,%ymm7,%ymm6 | |||
vpsllw $4,%ymm8,%ymm7 | |||
vpor %ymm6,%ymm7,%ymm6 | |||
vpsllw $12,%ymm10,%ymm7 | |||
vpor %ymm7,%ymm9,%ymm7 | |||
vpsrlw $4,%ymm10,%ymm8 | |||
vpsllw $8,%ymm11,%ymm9 | |||
vpor %ymm8,%ymm9,%ymm8 | |||
vpsrlw $8,%ymm11,%ymm9 | |||
vpsllw $4,%ymm12,%ymm10 | |||
vpor %ymm9,%ymm10,%ymm9 | |||
/* Shuffle stages 1, 2, 4, 8 over the six packed vectors (three pairs each). */
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
shuffle1 8,9,6,9 | |||
shuffle2 3,4,8,4 | |||
shuffle2 6,5,3,5 | |||
shuffle2 7,9,6,9 | |||
shuffle4 8,3,7,3 | |||
shuffle4 6,4,8,4 | |||
shuffle4 5,9,6,9 | |||
shuffle8 7,8,5,8 | |||
shuffle8 6,3,7,3 | |||
shuffle8 4,9,6,9 | |||
#store | |||
/* Six unaligned 32-byte stores: 192 output bytes. */
vmovdqu %ymm5,(%rdi) | |||
vmovdqu %ymm7,32(%rdi) | |||
vmovdqu %ymm6,64(%rdi) | |||
vmovdqu %ymm8,96(%rdi) | |||
vmovdqu %ymm3,128(%rdi) | |||
vmovdqu %ymm9,160(%rdi) | |||
ret | |||
/*
 * PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx -- exported entry point
 * (symbol name is wrapped in cdecl(), defined in cdecl.inc).
 *
 * Converts 512 input bytes at %rsi into 384 output bytes at %rdi by calling
 * ntttobytes128_avx twice (256 bytes in, 192 bytes out per call).
 *
 * In:  %rdi = destination (unaligned stores are used by the helper).
 *      %rsi = source; must be 32-byte aligned (helper uses vmovdqa).
 *      %rdx = base address of a constants table; the vector at byte offset
 *             _16XQ*2 is loaded with vmovdqa, so that address must be
 *             32-byte aligned. NOTE(review): the *2 suggests _16XQ is an
 *             index in 16-bit units -- confirm in consts.h.
 * On return %rsi has been advanced by 256 and %rdi by 192.
 */
.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx): | |||
#consts | |||
/* %ymm0 stays live across both helper calls. */
vmovdqa _16XQ*2(%rdx),%ymm0 | |||
call ntttobytes128_avx | |||
/* Advance to the second half: 256 bytes of input, 192 bytes of output. */
add $256,%rsi | |||
add $192,%rdi | |||
call ntttobytes128_avx | |||
ret | |||
/*
 * nttfrombytes128_avx -- file-local helper (no .global directive).
 *
 * Reads 192 bytes (six 32-byte vectors) from (%rsi), shuffles them, expands
 * them into eight vectors of masked 16-bit lanes, and writes 256 bytes to
 * (%rdi).
 *
 * In:       %rsi = source; read with vmovdqu, so no alignment is required.
 *           %rdi = destination; written with vmovdqa, so it must be 32-byte
 *                  aligned.
 *           %ymm0 = per-lane AND mask set up by the caller (the exported
 *                  wrapper loads the _16XMASK vector there).
 * Clobbers: %ymm1, %ymm3-%ymm15 (ymm12/ymm13 are also scratch for shuffle2).
 * %rsi and %rdi are not modified.
 */
nttfrombytes128_avx: | |||
#load | |||
vmovdqu (%rsi),%ymm4 | |||
vmovdqu 32(%rsi),%ymm5 | |||
vmovdqu 64(%rsi),%ymm6 | |||
vmovdqu 96(%rsi),%ymm7 | |||
vmovdqu 128(%rsi),%ymm8 | |||
vmovdqu 160(%rsi),%ymm9 | |||
/* Shuffle stages 8, 4, 2, 1 over the six loaded vectors (three pairs each). */
shuffle8 4,7,3,7 | |||
shuffle8 5,8,4,8 | |||
shuffle8 6,9,5,9 | |||
shuffle4 3,8,6,8 | |||
shuffle4 7,5,3,5 | |||
shuffle4 4,9,7,9 | |||
shuffle2 6,5,4,5 | |||
shuffle2 8,7,6,7 | |||
shuffle2 3,9,8,9 | |||
shuffle1 4,7,10,7 | |||
shuffle1 5,8,4,8 | |||
shuffle1 6,9,5,9 | |||
#bitunpack | |||
/*
 * Per 16-bit lane, three packed vectors p,q,r become four outputs, each
 * ANDed with the mask in %ymm0:
 *   p,  (p >> 12) | (q << 4),  (q >> 8) | (r << 8),  r >> 4
 * First for (ymm10,7,4) -> (ymm10,11,12,13), then for (ymm8,5,9) ->
 * (ymm8,14,15,1).
 * NOTE(review): the mask is presumably 0x0FFF per lane (12-bit fields) --
 * confirm against the _16XMASK constant.
 */
vpsrlw $12,%ymm10,%ymm11 | |||
vpsllw $4,%ymm7,%ymm12 | |||
vpor %ymm11,%ymm12,%ymm11 | |||
vpand %ymm0,%ymm10,%ymm10 | |||
vpand %ymm0,%ymm11,%ymm11 | |||
vpsrlw $8,%ymm7,%ymm12 | |||
vpsllw $8,%ymm4,%ymm13 | |||
vpor %ymm12,%ymm13,%ymm12 | |||
vpand %ymm0,%ymm12,%ymm12 | |||
vpsrlw $4,%ymm4,%ymm13 | |||
vpand %ymm0,%ymm13,%ymm13 | |||
vpsrlw $12,%ymm8,%ymm14 | |||
vpsllw $4,%ymm5,%ymm15 | |||
vpor %ymm14,%ymm15,%ymm14 | |||
vpand %ymm0,%ymm8,%ymm8 | |||
vpand %ymm0,%ymm14,%ymm14 | |||
vpsrlw $8,%ymm5,%ymm15 | |||
vpsllw $8,%ymm9,%ymm1 | |||
vpor %ymm15,%ymm1,%ymm15 | |||
vpand %ymm0,%ymm15,%ymm15 | |||
vpsrlw $4,%ymm9,%ymm1 | |||
vpand %ymm0,%ymm1,%ymm1 | |||
#store | |||
/* Eight aligned 32-byte stores: 256 output bytes. */
vmovdqa %ymm10,(%rdi) | |||
vmovdqa %ymm11,32(%rdi) | |||
vmovdqa %ymm12,64(%rdi) | |||
vmovdqa %ymm13,96(%rdi) | |||
vmovdqa %ymm8,128(%rdi) | |||
vmovdqa %ymm14,160(%rdi) | |||
vmovdqa %ymm15,192(%rdi) | |||
vmovdqa %ymm1,224(%rdi) | |||
ret | |||
# Exported routine. Loads the 32-byte mask from byte offset _16XMASK*2 of the
# table addressed by %rdx into %ymm0, then runs nttfrombytes128_avx twice: first
# on the pointers as received, then after stepping %rdi (destination) forward
# by 256 bytes and %rsi (source) forward by 192 bytes.
.global cdecl(PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx): | |||
#consts | |||
vmovdqa _16XMASK*2(%rdx),%ymm0 | |||
call nttfrombytes128_avx | |||
# Second pass: next 256 destination bytes, next 192 source bytes.
add $256,%rdi | |||
add $192,%rsi | |||
call nttfrombytes128_avx | |||
ret |
@@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle2 r0,r1,r2,r3 | |||
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 | |||
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 | |||
vpsllq $32,%ymm\r1,%ymm12 | |||
vpsrlq $32,%ymm\r0,%ymm13 | |||
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
@@ -2,22 +2,26 @@ | |||
#define SYMMETRIC_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "aes256ctr.h" | |||
#include "sha2.h" | |||
#define hash_h(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) | |||
#define hash_g(OUT, IN, INBYTES) sha512((OUT), (IN), (INBYTES)) | |||
#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER102490S_AVX2_aes256ctr_init((STATE), (IN), (Y) + ((uint16_t)(X) << 8)) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks((OUT), (OUTBLOCKS), (STATE)) | |||
#define xof_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf((OUT), (OUTBYTES), (KEY), (NONCE)) | |||
#define kdf(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) | |||
#define XOF_BLOCKBYTES 128 | |||
typedef aes256ctr_ctx xof_state; | |||
#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES | |||
#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, SEED, X, Y) \ | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) \ | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
#endif /* SYMMETRIC_H */ |
@@ -1,23 +1,22 @@ | |||
#include "verify.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
/************************************************* | |||
* Name: verify | |||
* Name: PQCLEAN_KYBER102490S_AVX2_verify | |||
* | |||
* Description: Compare two arrays for equality in constant time. | |||
* | |||
* Arguments: const uint8_t *a: pointer to first byte array | |||
* const uint8_t *b: pointer to second byte array | |||
* Arguments: const unsigned char *a: pointer to first byte array | |||
* const unsigned char *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* | |||
* Returns 0 if the byte arrays are equal, 1 otherwise | |||
**************************************************/ | |||
uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t pos; | |||
uint64_t r; | |||
int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t pos = 0; | |||
uint64_t r = 0; | |||
__m256i avec, bvec, cvec; | |||
cvec = _mm256_setzero_si256(); | |||
@@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, siz | |||
avec = _mm256_xor_si256(avec, bvec); | |||
cvec = _mm256_or_si256(cvec, avec); | |||
} | |||
r = !_mm256_testz_si256(cvec, cvec); | |||
cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); | |||
r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); | |||
while (pos < len) { | |||
r |= a[pos] ^ b[pos]; | |||
pos += 1; | |||
if (pos < len) { | |||
avec = _mm256_loadu_si256((__m256i *)&a[pos]); | |||
bvec = _mm256_loadu_si256((__m256i *)&b[pos]); | |||
cvec = _mm256_cmpeq_epi8(avec, bvec); | |||
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); | |||
} | |||
r = (-r) >> 63; | |||
return (uint8_t)r; | |||
return r; | |||
} | |||
/************************************************* | |||
* Name: cmov | |||
* Name: PQCLEAN_KYBER102490S_AVX2_cmov | |||
* | |||
* Description: Copy len bytes from x to r if b is 1; | |||
* don't modify r if b is 0. Requires b to be in {0,1}; | |||
* assumes two's complement representation of negative integers. | |||
* Runs in constant time. | |||
* | |||
* Arguments: uint8_t *r: pointer to output byte array | |||
* const uint8_t *x: pointer to input byte array | |||
* Arguments: unsigned char *r: pointer to output byte array | |||
* const unsigned char *x: pointer to input byte array | |||
* size_t len: Amount of bytes to be copied | |||
* uint8_t b: Condition bit; has to be in {0,1} | |||
* unsigned char b: Condition bit; has to be in {0,1} | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { | |||
size_t pos; | |||
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { | |||
size_t pos = 0; | |||
__m256i xvec, rvec, bvec; | |||
b = -b; | |||
bvec = _mm256_set1_epi8((char)b); | |||
bvec = _mm256_set1_epi8(b); | |||
for (pos = 0; pos + 32 <= len; pos += 32) { | |||
rvec = _mm256_loadu_si256((__m256i *)&r[pos]); | |||
@@ -1,10 +1,13 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_VERIFY_H | |||
#define PQCLEAN_KYBER102490S_AVX2_VERIFY_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); | |||
int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); | |||
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); | |||
@@ -1,14 +1,4 @@ | |||
kyber-20170627 | |||
Public Domain | |||
Authors: Joppe Bos, | |||
Léo Ducas, | |||
Eike Kiltz , | |||
Tancrède Lepoint, | |||
Vadim Lyubashevsky, | |||
John Schanck, | |||
Peter Schwabe, | |||
Gregor Seiler, | |||
Damien Stehlé | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
@@ -1,8 +1,29 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libkyber1024-90s_clean.a | |||
HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h aes256ctr.h | |||
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o aes256ctr.o | |||
HEADERS= \ | |||
api.h \ | |||
cbd.h \ | |||
indcpa.h \ | |||
kem.h \ | |||
ntt.h \ | |||
params.h \ | |||
poly.h \ | |||
polyvec.h \ | |||
reduce.h \ | |||
symmetric-aes.h \ | |||
symmetric.h \ | |||
verify.h | |||
OBJECTS= \ | |||
cbd.o \ | |||
indcpa.o \ | |||
kem.o \ | |||
ntt.o \ | |||
poly.o \ | |||
polyvec.o \ | |||
reduce.o \ | |||
verify.o \ | |||
symmetric-aes.o | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -2,7 +2,7 @@ | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libkyber1024-90s_clean.lib | |||
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj aes256ctr.obj | |||
# Object list for nmake: every entry uses the .obj suffix that nmake's C rules produce.
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-aes.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
@@ -1,7 +1,5 @@ | |||
#include "cbd.h" | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include "cbd.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
@@ -14,8 +12,8 @@ | |||
* | |||
* Returns 32-bit unsigned integer loaded from x | |||
**************************************************/ | |||
static uint32_t load32_littleendian(const uint8_t *x) { | |||
uint32_t r; | |||
static uint32_t load32_littleendian(const uint8_t x[4]) { | |||
uint32_t r = 0; | |||
r = (uint32_t)x[0]; | |||
r |= (uint32_t)x[1] << 8; | |||
r |= (uint32_t)x[2] << 16; | |||
@@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { | |||
} | |||
/************************************************* | |||
* Name: cbd | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_cbd | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* specialized for KYBER_ETA=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t *buf) { | |||
uint32_t d, t; | |||
int16_t a, b; | |||
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { | |||
unsigned int i = 0, j = 0; | |||
uint32_t t = 0, d = 0; | |||
int16_t a = 0, b = 0; | |||
for (size_t i = 0; i < KYBER_N / 8; i++) { | |||
t = load32_littleendian(buf + 4 * i); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t = load32_littleendian(buf + 4 * i); | |||
d = t & 0x55555555; | |||
d += (t >> 1) & 0x55555555; | |||
for (size_t j = 0; j < 8; j++) { | |||
a = (d >> 4 * j) & 0x3; | |||
for (j = 0; j < 8; j++) { | |||
a = (d >> (4 * j + 0)) & 0x3; | |||
b = (d >> (4 * j + 2)) & 0x3; | |||
r->coeffs[8 * i + j] = a - b; | |||
} | |||
@@ -1,8 +1,11 @@ | |||
#ifndef CBD_H | |||
#define CBD_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_CBD_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t *buf); | |||
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
#endif |
@@ -5,7 +5,7 @@ | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "symmetric.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
@@ -16,12 +16,15 @@ | |||
* and the public seed used to generate the matrix A. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* const poly *pk: pointer to the input public-key polynomial | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
polyvec *pk, | |||
const uint8_t seed[KYBER_SYMBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(r, pk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
r[i + KYBER_POLYVECBYTES] = seed[i]; | |||
} | |||
} | |||
@@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
* Description: De-serialize public key from a byte array; | |||
* approximate inverse of pack_pk | |||
* | |||
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* Arguments: - polyvec *pk: pointer to output public-key | |||
* polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate | |||
* matrix A | |||
* - const uint8_t *packedpk: pointer to input serialized public key | |||
**************************************************/ | |||
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
static void unpack_pk(polyvec *pk, | |||
uint8_t seed[KYBER_SYMBYTES], | |||
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(pk, packedpk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
seed[i] = packedpk[i + KYBER_POLYVECBYTES]; | |||
} | |||
} | |||
@@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
* Description: Serialize the secret key | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - const polyvec *sk: pointer to input vector of polynomials (secret key) | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t *r, polyvec *sk) { | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(r, sk); | |||
} | |||
@@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of | |||
* polynomials (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
* compressed and serialized vector of polynomials b | |||
* and the compressed and serialized polynomial v | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* const poly *pk: pointer to the input vector of polynomials b | |||
* const uint8_t *seed: pointer to the input polynomial v | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* polyvec *b: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(r, b); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
@@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - size_t len: requested number of 16-bit integers (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) | |||
* - size_t buflen: length of input buffer in bytes | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
**************************************************/ | |||
static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { | |||
size_t ctr, pos; | |||
uint16_t val; | |||
static unsigned int rej_uniform(int16_t *r, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr = 0, pos = 0; | |||
uint16_t val = 0; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
if (val < 19 * KYBER_Q) { | |||
val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction | |||
val -= (val >> 12) * KYBER_Q; // Barrett reduction | |||
r[ctr++] = (int16_t)val; | |||
} | |||
} | |||
@@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf | |||
return ctr; | |||
} | |||
#define gen_a(A,B) gen_matrix(A,B,0) | |||
#define gen_at(A,B) gen_matrix(A,B,1) | |||
#define gen_a(A,B) PQCLEAN_KYBER102490S_CLEAN_gen_matrix(A,B,0) | |||
#define gen_at(A,B) PQCLEAN_KYBER102490S_CLEAN_gen_matrix(A,B,1) | |||
/************************************************* | |||
* Name: gen_matrix | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_gen_matrix | |||
* | |||
* Description: Deterministically generate matrix A (or the transpose of A) | |||
* from a seed. Entries of the matrix are polynomials that look | |||
* uniformly random. Performs rejection sampling on output of | |||
* a XOF | |||
* | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
* - int transposed: boolean deciding whether A or A^T | |||
* is generated | |||
**************************************************/ | |||
#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ | |||
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
size_t ctr; | |||
uint8_t i, j; | |||
uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
// Not static for benchmarking | |||
void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { | |||
unsigned int ctr = 0, i = 0, j = 0; | |||
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; | |||
xof_state state; | |||
for (i = 0; i < KYBER_K; i++) { | |||
@@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
xof_absorb(&state, seed, j, i); | |||
} | |||
xof_squeezeblocks(buf, MAXNBLOCKS, &state); | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); | |||
xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); | |||
while (ctr < KYBER_N) { | |||
xof_squeezeblocks(buf, 1, &state); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, | |||
XOF_BLOCKBYTES); | |||
} | |||
xof_ctx_release(&state); | |||
} | |||
@@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_keypair | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair | |||
* | |||
* Description: Generates public and private key for the CPA-secure | |||
* public-key encryption scheme underlying Kyber | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
* Arguments: - uint8_t *pk: pointer to output public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
unsigned int i = 0; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
uint8_t *publicseed = buf; | |||
uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
const uint8_t *publicseed = buf; | |||
const uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
uint8_t nonce = 0; | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_g(buf, buf, KYBER_SYMBYTES); | |||
gen_a(a, publicseed); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); | |||
} | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&skpv); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&e); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_frommont(&pkpv.vec[i]); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_tomont(&pkpv.vec[i]); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&pkpv, &pkpv, &e); | |||
@@ -217,34 +243,40 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_enc | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_enc | |||
* | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) | |||
* to deterministically generate all randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins) { | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
poly v, k, epp; | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
const uint8_t coins[KYBER_SYMBYTES]) { | |||
unsigned int i = 0; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
uint8_t nonce = 0; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed, pk); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(&k, m); | |||
gen_at(at, seed); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); | |||
} | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&epp, coins, nonce++); | |||
@@ -252,14 +284,14 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c, | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(&bp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&v); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &epp); | |||
@@ -271,18 +303,21 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c, | |||
} | |||
/************************************************* | |||
* Name: indcpa_dec | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_dec | |||
* | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk) { | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
poly v, mp; | |||
@@ -290,8 +325,8 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t *m, | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&mp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_sub(&mp, &v, &mp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&mp); | |||
@@ -1,21 +1,16 @@ | |||
#ifndef INDCPA_H | |||
#define INDCPA_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_INDCPA_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_INDCPA_H | |||
#include "params.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair( | |||
uint8_t *pk, | |||
uint8_t *sk); | |||
void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc( | |||
uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins); | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec( | |||
uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk); | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); | |||
#endif |
@@ -1,99 +1,125 @@ | |||
#include "api.h" | |||
#include "indcpa.h" | |||
#include "kem.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "symmetric.h" | |||
#include "verify.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
/************************************************* | |||
* Name: crypto_kem_keypair | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair | |||
* | |||
* Description: Generates public and private key | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
size_t i; | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; | |||
} | |||
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ | |||
/* Value z for pseudo-random output on reject */ | |||
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: crypto_kem_enc | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc | |||
* | |||
* Description: Generates cipher text and shared | |||
* secret for given public key | |||
* | |||
* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { | |||
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ | |||
/* Don't release system RNG output */ | |||
hash_h(buf, buf, KYBER_SYMBYTES); | |||
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: crypto_kem_dec | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec | |||
* | |||
* Description: Generates shared secret for given | |||
* cipher text and private key | |||
* | |||
* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { | |||
size_t i; | |||
uint8_t fail; | |||
uint8_t cmp[KYBER_CIPHERTEXTBYTES]; | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
size_t i = 0; | |||
int fail = 0; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ | |||
/* Will contain key, coins */ | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
uint8_t cmp[KYBER_CIPHERTEXTBYTES]; | |||
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(buf, ct, sk); | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ | |||
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
} | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); | |||
fail = PQCLEAN_KYBER102490S_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
PQCLEAN_KYBER102490S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ | |||
/* Overwrite pre-k with z on re-encryption failure */ | |||
PQCLEAN_KYBER102490S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_KEM_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_KEM_H | |||
#include "params.h" | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk); | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk); | |||
#endif |
@@ -1,11 +1,9 @@ | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "ntt.h" | |||
#include "reduce.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* Code to generate zetas and zetas_inv used in the number-theoretic transform: | |||
/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and PQCLEAN_KYBER102490S_CLEAN_zetas_inv used in the number-theoretic transform: | |||
#define KYBER_ROOT_OF_UNITY 17 | |||
@@ -17,12 +15,8 @@ static const uint16_t tree[128] = { | |||
1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, | |||
5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, | |||
3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, | |||
7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; | |||
static int16_t fqmul(int16_t a, int16_t b) { | |||
return montgomery_reduce((int32_t)a*b); | |||
} | |||
7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 | |||
}; | |||
void init_ntt() { | |||
unsigned int i, j, k; | |||
@@ -33,40 +27,44 @@ void init_ntt() { | |||
tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); | |||
for(i = 0; i < 128; ++i) | |||
zetas[i] = tmp[tree[i]]; | |||
PQCLEAN_KYBER102490S_CLEAN_zetas[i] = tmp[tree[i]]; | |||
k = 0; | |||
for(i = 64; i >= 1; i >>= 1) | |||
for(j = i; j < 2*i; ++j) | |||
zetas_inv[k++] = -tmp[128 - tree[j]]; | |||
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; | |||
zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; | |||
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; | |||
} | |||
*/ | |||
const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128] = { | |||
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, | |||
573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, | |||
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, | |||
2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, | |||
2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, | |||
3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, | |||
817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, | |||
2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 | |||
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, | |||
2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, | |||
732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, | |||
1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, | |||
430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, | |||
1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, | |||
418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, | |||
1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, | |||
478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 | |||
}; | |||
const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128] = { | |||
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, | |||
1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, | |||
75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, | |||
1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, | |||
1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, | |||
1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, | |||
1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 | |||
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, | |||
1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, | |||
1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, | |||
1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, | |||
3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, | |||
1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, | |||
2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, | |||
829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, | |||
3127, 3042, 1907, 1836, 1517, 359, 758, 1441 | |||
}; | |||
/************************************************* | |||
* Name: fqmul | |||
* | |||
@@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { | |||
} | |||
/************************************************* | |||
* Name: ntt | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_ntt | |||
* | |||
* Description: Inplace number-theoretic transform (NTT) in Rq | |||
* input is in standard order, output is in bitreversed order | |||
* | |||
* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements | |||
* of Zq | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t poly[256]) { | |||
size_t j, k = 1; | |||
int16_t t, zeta; | |||
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { | |||
unsigned int len = 0, start = 0, j = 0, k = 0; | |||
int16_t t = 0, zeta = 0; | |||
for (size_t len = 128; len >= 2; len >>= 1) { | |||
for (size_t start = 0; start < 256; start = j + len) { | |||
k = 1; | |||
for (len = 128; len >= 2; len >>= 1) { | |||
for (start = 0; start < 256; start = j + len) { | |||
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k++]; | |||
for (j = start; j < start + len; ++j) { | |||
t = fqmul(zeta, poly[j + len]); | |||
poly[j + len] = poly[j] - t; | |||
poly[j] = poly[j] + t; | |||
t = fqmul(zeta, r[j + len]); | |||
r[j + len] = r[j] - t; | |||
r[j] = r[j] + t; | |||
} | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: invntt | |||
* Name: invntt_tomont | |||
* | |||
* Description: Inplace inverse number-theoretic transform in Rq | |||
* input is in bitreversed order, output is in standard order | |||
* Description: Inplace inverse number-theoretic transform in Rq and | |||
* multiplication by Montgomery factor 2^16. | |||
* Input is in bitreversed order, output is in standard order | |||
* | |||
* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements | |||
* of Zq | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t poly[256]) { | |||
size_t j, k = 0; | |||
int16_t t, zeta; | |||
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) { | |||
unsigned int start = 0, len = 0, j = 0, k = 0; | |||
int16_t t = 0, zeta = 0; | |||
for (size_t len = 2; len <= 128; len <<= 1) { | |||
for (size_t start = 0; start < 256; start = j + len) { | |||
k = 0; | |||
for (len = 2; len <= 128; len <<= 1) { | |||
for (start = 0; start < 256; start = j + len) { | |||
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++]; | |||
for (j = start; j < start + len; ++j) { | |||
t = poly[j]; | |||
poly[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + poly[j + len]); | |||
poly[j + len] = t - poly[j + len]; | |||
poly[j + len] = fqmul(zeta, poly[j + len]); | |||
t = r[j]; | |||
r[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + r[j + len]); | |||
r[j + len] = t - r[j + len]; | |||
r[j + len] = fqmul(zeta, r[j + len]); | |||
} | |||
} | |||
} | |||
for (j = 0; j < 256; ++j) { | |||
poly[j] = fqmul(poly[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]); | |||
r[j] = fqmul(r[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: basemul | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_basemul | |||
* | |||
* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) | |||
* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) | |||
* used for multiplication of elements in Rq in NTT domain | |||
* | |||
* Arguments: - int16_t r[2]: pointer to the output polynomial | |||
* Arguments: - int16_t r[2]: pointer to the output polynomial | |||
* - const int16_t a[2]: pointer to the first factor | |||
* - const int16_t b[2]: pointer to the second factor | |||
* - int16_t zeta: integer defining the reduction polynomial | |||
* - int16_t zeta: integer defining the reduction polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { | |||
r[0] = fqmul(a[1], b[1]); | |||
@@ -1,13 +1,22 @@ | |||
#ifndef NTT_H | |||
#define NTT_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_NTT_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_NTT_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128]; | |||
extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetasinv[128]; | |||
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t *poly); | |||
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t *poly); | |||
extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128]; | |||
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]); | |||
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]); | |||
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); | |||
#endif |
@@ -1,8 +1,5 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
/* Don't change parameters below this line */ | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_PARAMS_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_PARAMS_H | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
@@ -12,9 +9,8 @@ | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
#define KYBER_POLYBYTES 384 | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_POLYBYTES 384 | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
@@ -23,10 +19,14 @@ | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ | |||
+ KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ | |||
/* 32 bytes of additional space to save H(pk) */ | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ | |||
+ KYBER_INDCPA_PUBLICKEYBYTES \ | |||
+ 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES | |||
#endif |
@@ -1,120 +1,177 @@ | |||
#include "params.h" | |||
#include "cbd.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "reduce.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: poly_compress | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_compress | |||
* | |||
* Description: Compression and subsequent serialization of a polynomial | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) | |||
* - const poly *a: pointer to input polynomial | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t *r, poly *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { | |||
unsigned int i = 0, j = 0; | |||
uint8_t t[8]; | |||
size_t k = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (size_t i = 0; i < KYBER_N; i += 8) { | |||
for (size_t j = 0; j < 8; j++) { | |||
t[j] = ((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
r[k] = (uint8_t)( t[0] | (t[1] << 5)); | |||
r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); | |||
r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); | |||
r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); | |||
r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3)); | |||
k += 5; | |||
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); | |||
r[2] = (t[3] >> 1) | (t[4] << 4); | |||
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); | |||
r[4] = (t[6] >> 2) | (t[7] << 3); | |||
r += 5; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_decompress | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_decompress | |||
* | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of poly_compress | |||
* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_N; i += 8) { | |||
r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { | |||
unsigned int i = 0; | |||
unsigned int j = 0; | |||
uint8_t t[8]; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t[0] = (a[0] >> 0); | |||
t[1] = (a[0] >> 5) | (a[1] << 3); | |||
t[2] = (a[1] >> 2); | |||
t[3] = (a[1] >> 7) | (a[2] << 1); | |||
t[4] = (a[2] >> 4) | (a[3] << 4); | |||
t[5] = (a[3] >> 1); | |||
t[6] = (a[3] >> 6) | (a[4] << 2); | |||
t[7] = (a[4] >> 3); | |||
a += 5; | |||
for (j = 0; j < 8; j++) { | |||
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_tobytes | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tobytes | |||
* | |||
* Description: Serialization of a polynomial | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) | |||
* - const poly *a: pointer to input polynomial | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t *r, poly *a) { | |||
int16_t t0, t1; | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
unsigned int i = 0; | |||
uint16_t t0 = 0, t1 = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (size_t i = 0; i < KYBER_N / 2; i++) { | |||
for (i = 0; i < KYBER_N / 2; i++) { | |||
t0 = a->coeffs[2 * i]; | |||
t1 = a->coeffs[2 * i + 1]; | |||
r[3 * i] = t0 & 0xff; | |||
r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); | |||
r[3 * i + 2] = (uint8_t)(t1 >> 4); | |||
r[3 * i + 0] = (t0 >> 0); | |||
r[3 * i + 1] = (t0 >> 8) | (t1 << 4); | |||
r[3 * i + 2] = (t1 >> 4); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_frombytes | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_frombytes | |||
* | |||
* Description: De-serialization of a polynomial; | |||
* inverse of poly_tobytes | |||
* inverse of PQCLEAN_KYBER102490S_CLEAN_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes) | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_N / 2; i++) { | |||
r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); | |||
r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_N / 2; i++) { | |||
r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; | |||
r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_getnoise | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_frommsg | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
unsigned int i = 0, j = 0; | |||
int16_t mask = 0; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
mask = -(int16_t)((msg[i] >> j) & 1); | |||
r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { | |||
unsigned int i = 0, j = 0; | |||
uint16_t t = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
msg[i] = 0; | |||
for (j = 0; j < 8; j++) { | |||
t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; | |||
msg[i] |= t << j; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA * KYBER_N / 4]; | |||
prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); | |||
prf(buf, sizeof(buf), seed, nonce); | |||
PQCLEAN_KYBER102490S_CLEAN_cbd(r, buf); | |||
} | |||
/************************************************* | |||
* Name: poly_ntt | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_ntt | |||
* | |||
* Description: Computes negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
@@ -128,20 +185,20 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r) { | |||
} | |||
/************************************************* | |||
* Name: poly_invntt | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont | |||
* | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) | |||
* of a polynomial in place; | |||
* inputs assumed to be in bitreversed order, output in normal order | |||
* | |||
* Arguments: - uint16_t *a: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r) { | |||
PQCLEAN_KYBER102490S_CLEAN_invntt(r->coeffs); | |||
} | |||
/************************************************* | |||
* Name: poly_basemul | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* | |||
@@ -149,68 +206,64 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r) { | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { | |||
for (size_t i = 0; i < KYBER_N / 4; ++i) { | |||
PQCLEAN_KYBER102490S_CLEAN_basemul( | |||
r->coeffs + 4 * i, | |||
a->coeffs + 4 * i, | |||
b->coeffs + 4 * i, | |||
PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
PQCLEAN_KYBER102490S_CLEAN_basemul( | |||
r->coeffs + 4 * i + 2, | |||
a->coeffs + 4 * i + 2, | |||
b->coeffs + 4 * i + 2, | |||
-PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_N / 4; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], | |||
-PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_frommont | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tomont | |||
* | |||
* Description: Inplace conversion of all coefficients of a polynomial | |||
* from Montgomery domain to normal domain | |||
* from normal domain to Montgomery domain | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommont(poly *r) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r) { | |||
unsigned int i = 0; | |||
const int16_t f = (1ULL << 32) % KYBER_Q; | |||
for (size_t i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce( | |||
(int32_t)r->coeffs[i] * f); | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_reduce | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_reduce | |||
* | |||
* Description: Applies Barrett reduction to all coefficients of a polynomial | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r) { | |||
for (size_t i = 0; i < KYBER_N; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(r->coeffs[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_csubq | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient of a polynomial | |||
* for details of conditional subtraction of q see comments in reduce.c | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) { | |||
for (size_t i = 0; i < KYBER_N; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_csubq(r->coeffs[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_add | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* | |||
@@ -219,13 +272,14 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) { | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { | |||
for (size_t i = 0; i < KYBER_N; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_sub | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* | |||
@@ -234,48 +288,8 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { | |||
for (size_t i = 0; i < KYBER_N; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_frommsg | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { | |||
uint16_t mask; | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (size_t j = 0; j < 8; j++) { | |||
mask = -((msg[i] >> j) & 1); | |||
r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { | |||
uint16_t t; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
msg[i] = 0; | |||
for (size_t j = 0; j < 8; j++) { | |||
t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; | |||
msg[i] |= t << j; | |||
} | |||
} | |||
} |
@@ -1,9 +1,9 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_POLY_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_POLY_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
/* | |||
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial | |||
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1] | |||
@@ -12,26 +12,41 @@ typedef struct { | |||
int16_t coeffs[KYBER_N]; | |||
} poly; | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t *r, poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t *r, poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommont(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); | |||
#endif |
@@ -1,138 +1,163 @@ | |||
#include "polyvec.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stddef.h> | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: polyvec_compress | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_compress | |||
* | |||
* Description: Compress and serialize vector of polynomials | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { | |||
unsigned int i = 0, j = 0, k = 0; | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(a); | |||
uint16_t t[8]; | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_N / 8; j++) { | |||
for (size_t k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
{ | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
} | |||
} | |||
r[11 * j + 0] = (uint8_t)t[0]; | |||
r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); | |||
r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); | |||
r[11 * j + 3] = (uint8_t)((t[2] >> 2)); | |||
r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); | |||
r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); | |||
r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7)); | |||
r[11 * j + 7] = (uint8_t)((t[5] >> 1)); | |||
r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); | |||
r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); | |||
r[11 * j + 10] = (uint8_t)((t[7] >> 3)); | |||
r[ 0] = (t[0] >> 0); | |||
r[ 1] = (t[0] >> 8) | (t[1] << 3); | |||
r[ 2] = (t[1] >> 5) | (t[2] << 6); | |||
r[ 3] = (t[2] >> 2); | |||
r[ 4] = (t[2] >> 10) | (t[3] << 1); | |||
r[ 5] = (t[3] >> 7) | (t[4] << 4); | |||
r[ 6] = (t[4] >> 4) | (t[5] << 7); | |||
r[ 7] = (t[5] >> 1); | |||
r[ 8] = (t[5] >> 9) | (t[6] << 2); | |||
r[ 9] = (t[6] >> 6) | (t[7] << 5); | |||
r[10] = (t[7] >> 3); | |||
r += 11; | |||
} | |||
r += 352; | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_decompress | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress | |||
* | |||
* Description: De-serialize and decompress vector of polynomials; | |||
* approximate inverse of polyvec_compress | |||
* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_polyvec_compress | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_N / 8; j++) { | |||
r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
unsigned int i = 0, j = 0, k = 0; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); | |||
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); | |||
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); | |||
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); | |||
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); | |||
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); | |||
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); | |||
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); | |||
a += 11; | |||
for (k = 0; k < 8; k++) { | |||
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; | |||
} | |||
} | |||
a += 352; | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_tobytes | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes | |||
* | |||
* Description: Serialize vector of polynomials | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYVECBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_frombytes | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes | |||
* | |||
* Description: De-serialize vector of polynomials; | |||
* inverse of polyvec_tobytes | |||
* inverse of PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_ntt | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt | |||
* | |||
* Description: Apply forward NTT to all elements of a vector of polynomials | |||
* | |||
* Arguments: - polyvec *r: pointer to in/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_ntt(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_invntt | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont | |||
* | |||
* Description: Apply inverse NTT to all elements of a vector of polynomials | |||
* and multiply by Montgomery factor 2^16 | |||
* | |||
* Arguments: - polyvec *r: pointer to in/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&r->vec[i]); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_pointwise_acc | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b and accumulate into r | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
unsigned int i = 0; | |||
poly t; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); | |||
for (size_t i = 1; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); | |||
for (i = 1; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_add(r, r, &t); | |||
} | |||
@@ -140,37 +165,40 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, | |||
} | |||
/************************************************* | |||
* Name: polyvec_reduce | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_csubq | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in reduce.c | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_add | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_add | |||
* | |||
* Description: Add vectors of polynomials | |||
* | |||
@@ -179,7 +207,8 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) { | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); | |||
} | |||
} |
@@ -1,29 +1,41 @@ | |||
#ifndef POLYVEC_H | |||
#define POLYVEC_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_POLYVEC_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_POLYVEC_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
#endif |
@@ -1,32 +1,32 @@ | |||
#include "reduce.h" | |||
#include "params.h" | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: montgomery_reduce | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce | |||
* | |||
* Description: Montgomery reduction; given a 32-bit integer a, computes | |||
* 16-bit integer congruent to a * R^-1 mod q, | |||
* where R=2^16 | |||
* | |||
* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} | |||
* Arguments: - int32_t a: input integer to be reduced; | |||
* has to be in {-q2^15,...,q2^15-1} | |||
* | |||
* Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. | |||
**************************************************/ | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) { | |||
int32_t t; | |||
int16_t u; | |||
int32_t t = 0; | |||
int16_t u = 0; | |||
u = (int16_t)(a * (int64_t)QINV); | |||
t = (int32_t)u * KYBER_Q; | |||
t = a - t; | |||
t >>= 16; | |||
return (int16_t)t; | |||
return t; | |||
} | |||
/************************************************* | |||
* Name: barrett_reduce | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_barrett_reduce | |||
* | |||
* Description: Barrett reduction; given a 16-bit integer a, computes | |||
* 16-bit integer congruent to a mod q in {0,...,q} | |||
@@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) { | |||
* Returns: integer in {0,...,q} congruent to a modulo q. | |||
**************************************************/ | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a) { | |||
int32_t t; | |||
const int32_t v = (1U << 26) / KYBER_Q + 1; | |||
int16_t t = 0; | |||
const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; | |||
t = v * a; | |||
t >>= 26; | |||
t = (int32_t)v * a >> 26; | |||
t *= KYBER_Q; | |||
return a - (int16_t)t; | |||
return a - t; | |||
} | |||
/************************************************* | |||
* Name: csubq | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_csubq | |||
* | |||
* Description: Conditionally subtract q | |||
* | |||
* Arguments: - int16_t a: input integer | |||
* Arguments: - int16_t x: input integer | |||
* | |||
* Returns: a - q if a >= q, else a | |||
**************************************************/ | |||
@@ -1,15 +1,19 @@ | |||
#ifndef REDUCE_H | |||
#define REDUCE_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_REDUCE_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_REDUCE_H | |||
#include "params.h" | |||
#include <stdint.h> | |||
#define MONT 2285 // 2^16 % Q | |||
#define QINV 62209 // q^(-1) mod 2^16 | |||
#define MONT 2285 // 2^16 mod q | |||
#define QINV 62209 // q^-1 mod 2^16 | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a); | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a); | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a); | |||
#endif |
@@ -1,4 +1,4 @@ | |||
#include "aes256ctr.h" | |||
#include "symmetric-aes.h" | |||
#include "aes.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
@@ -14,7 +14,7 @@ static inline void br_enc32be(unsigned char *dst, uint32_t x) { | |||
static void aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { | |||
uint8_t ivw[16]; | |||
uint8_t buf[AES_BLOCKBYTES]; | |||
size_t i; | |||
size_t i = 0; | |||
memcpy(ivw, iv, AESCTR_NONCEBYTES); | |||
br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); | |||
@@ -94,7 +94,6 @@ void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nbl | |||
s->ctr += (uint32_t) (4 * nblocks); | |||
} | |||
/** Free the AES ctx **/ | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { | |||
aes256_ctx_release(&s->sk_exp); | |||
} |
@@ -2,22 +2,24 @@ | |||
#define SYMMETRIC_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "aes256ctr.h" | |||
#include "sha2.h" | |||
#include "symmetric-aes.h" | |||
typedef aes256xof_ctx xof_state; | |||
#define XOF_BLOCKBYTES 64 | |||
#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, IN, X, Y) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
#define XOF_BLOCKBYTES 64 | |||
typedef aes256xof_ctx xof_state; | |||
#endif /* SYMMETRIC_H */ |
@@ -1,34 +1,31 @@ | |||
#include "verify.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: verify | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_verify | |||
* | |||
* Description: Compare two arrays for equality in constant time. | |||
* | |||
* Arguments: const uint8_t *a: pointer to first byte array | |||
* const uint8_t *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* size_t len: length of the byte arrays | |||
* | |||
* Returns 0 if the byte arrays are equal, 1 otherwise | |||
**************************************************/ | |||
uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
uint64_t r; | |||
size_t i; | |||
r = 0; | |||
int PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t i = 0; | |||
uint8_t r = 0; | |||
for (i = 0; i < len; i++) { | |||
r |= a[i] ^ b[i]; | |||
} | |||
r = (-r) >> 63; | |||
return (uint8_t)r; | |||
return (-(uint64_t)r) >> 63; | |||
} | |||
/************************************************* | |||
* Name: cmov | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_cmov | |||
* | |||
* Description: Copy len bytes from x to r if b is 1; | |||
* don't modify r if b is 0. Requires b to be in {0,1}; | |||
@@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, si | |||
* | |||
* Arguments: uint8_t *r: pointer to output byte array | |||
* const uint8_t *x: pointer to input byte array | |||
* size_t len: Amount of bytes to be copied | |||
* size_t len: Amount of bytes to be copied | |||
* uint8_t b: Condition bit; has to be in {0,1} | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { | |||
size_t i; | |||
size_t i = 0; | |||
b = -b; | |||
for (i = 0; i < len; i++) { | |||
r[i] ^= b & (x[i] ^ r[i]); | |||
r[i] ^= b & (r[i] ^ x[i]); | |||
} | |||
} |
@@ -1,10 +1,13 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_VERIFY_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_VERIFY_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); | |||
int PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); | |||
void PQCLEAN_KYBER102490S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); | |||
@@ -28,6 +28,7 @@ implementations: | |||
- architecture: x86_64 | |||
operating_systems: | |||
- Linux | |||
- Darwin | |||
required_flags: | |||
- avx2 | |||
- bmi2 | |||
@@ -1,14 +1,4 @@ | |||
kyber-20170627 | |||
Public Domain | |||
Authors: Joppe Bos, | |||
Léo Ducas, | |||
Eike Kiltz , | |||
Tancrède Lepoint, | |||
Vadim Lyubashevsky, | |||
John Schanck, | |||
Peter Schwabe, | |||
Gregor Seiler, | |||
Damien Stehlé | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
@@ -1,26 +1,58 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libkyber1024_avx2.a | |||
HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h fips202x4.h | |||
OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ | |||
verify.o indcpa.o rejsample.o fips202x4.o symmetric-fips202.o | |||
HEADERS= \ | |||
align.h \ | |||
api.h \ | |||
cbd.h \ | |||
cdecl.inc \ | |||
consts.h \ | |||
fips202x4.h \ | |||
fq.inc \ | |||
indcpa.h \ | |||
kem.h \ | |||
ntt.h \ | |||
params.h \ | |||
poly.h \ | |||
polyvec.h \ | |||
reduce.h \ | |||
rejsample.h \ | |||
shuffle.inc \ | |||
symmetric.h \ | |||
verify.h | |||
OBJECTS= \ | |||
basemul.o \ | |||
cbd.o \ | |||
consts.o \ | |||
fips202x4.o \ | |||
fq.o \ | |||
indcpa.o \ | |||
invntt.o \ | |||
kem.o \ | |||
ntt.o \ | |||
poly.o \ | |||
polyvec.o \ | |||
rejsample.o \ | |||
shuffle.o \ | |||
symmetric-shake.o \ | |||
verify.o | |||
KECCAK4XDIR=../../../common/keccak4x | |||
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o | |||
KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) | |||
CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ | |||
-Wmissing-prototypes -Wredundant-decls \ | |||
-Wpointer-arith -Wshadow \ | |||
-std=c99 -I../../../common $(EXTRAFLAGS) | |||
all: $(LIB) | |||
%.o: %.c $(HEADERS) | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
%.o: %.s $(HEADERS) | |||
$(AS) -o $@ $< | |||
%.o: %.S $(HEADERS) | |||
$(AS) -c -o $@ $< | |||
$(CC) $(CFLAGS) -c -o $@ $< | |||
$(LIB): $(OBJECTS) $(KECCAK4X) | |||
$(AR) -r $@ $(OBJECTS) $(KECCAK4X) | |||
@@ -0,0 +1,22 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_ALIGN_H | |||
#define PQCLEAN_KYBER1024_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
#define ALIGN16_TYPE(t) \ | |||
union { \ | |||
__m128i vec; \ | |||
t orig; \ | |||
} | |||
#define ALIGN32_ARRAY(t, s) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(s)]; \ | |||
} | |||
#define ALIGN32_ARRAY_2D(t, n, m) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(n)][(m)]; \ | |||
} | |||
#endif |
@@ -1,4 +1,5 @@ | |||
#include "params.h" | |||
#include "cdecl.inc" | |||
.macro schoolbook off,sign | |||
#load | |||
@@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 | |||
vpaddd %ymm7,%ymm8,%ymm7 # y1 | |||
.endm | |||
.macro red a0,a1,b0,b1 x,y,z | |||
.macro red a0,a1,b0,b1,x,y,z | |||
#pack | |||
vpxor %ymm\x,%ymm\x,%ymm\x | |||
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y | |||
@@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 | |||
vpsubw %ymm\y,%ymm\b0,%ymm\b0 | |||
.endm | |||
.global PQCLEAN_KYBER1024_AVX2_basemul_acc_avx | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER1024_AVX2_16xqinv(%rip),%ymm1 | |||
vmovdqu (%rcx),%ymm2 | |||
.text | |||
basemul64_acc_avx: | |||
poly0.0: | |||
schoolbook 0,0 | |||
@@ -117,7 +113,7 @@ vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6 7,8,9 | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
@@ -160,7 +156,7 @@ vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6 7,8,9 | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,64(%rdi) | |||
@@ -168,17 +164,40 @@ vmovdqa %ymm5,96(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER1024_AVX2_basemul_avx | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx: | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx): | |||
#consts | |||
vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER1024_AVX2_16xqinv(%rip),%ymm1 | |||
vmovdqu (%rcx),%ymm2 | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
ret | |||
basemul64_avx: | |||
schoolbook 0,0 | |||
#reduce | |||
red 14,9,12,7 8,10,11 | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,(%rdi) | |||
@@ -187,10 +206,39 @@ vmovdqa %ymm12,32(%rdi) | |||
schoolbook 64,1 | |||
#reduce | |||
red 14,9,12,7 8,10,11 | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,64(%rdi) | |||
vmovdqa %ymm12,96(%rdi) | |||
ret | |||
# Exported entry point; cdecl() adds the leading underscore that some
# platforms put on C symbol names.
# Runs basemul64_avx four times. Each call writes 128 bytes at (%rdi),
# so 512 bytes are produced in total.
# NOTE(review): rsi and rdx are presumably the two input operands read
# by the schoolbook macro, and rcx presumably points at the table of
# 16-bit constants indexed by the offsets from cdecl.inc. Confirm
# against the C prototype.
.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx): | |||
#consts | |||
# Table offsets are element indices; the *2 turns them into byte
# displacements from rcx.
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
# First 128-byte chunk: ymm2 holds 16 values starting at _ZETAS_EXP+152.
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_avx | |||
# Second chunk: reload ymm2 and advance all three pointers by 128 bytes.
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
# Third chunk.
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
# Fourth and last chunk.
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
ret |
@@ -1,27 +1,27 @@ | |||
#include "cbd.h" | |||
#include "params.h" | |||
#include "cbd.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: cbd | |||
* Name: PQCLEAN_KYBER1024_AVX2_cbd | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
* - const uint8_t *buf: pointer to input byte array; read with _mm256_load_si256, so it must be 32-byte aligned | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t *buf) { | |||
void PQCLEAN_KYBER1024_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { | |||
unsigned int i = 0; | |||
__m256i vec0, vec1, vec2, vec3, tmp; | |||
const __m256i mask55 = _mm256_set1_epi32(0x55555555); | |||
const __m256i mask33 = _mm256_set1_epi32(0x33333333); | |||
const __m256i mask03 = _mm256_set1_epi32(0x03030303); | |||
for (size_t i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); | |||
for (i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); | |||
vec1 = _mm256_srli_epi32(vec0, 1); | |||
vec0 = _mm256_and_si256(mask55, vec0); | |||
@@ -1,8 +1,11 @@ | |||
#ifndef CBD_H | |||
#define CBD_H | |||
#ifndef PQCLEAN_KYBER1024_AVX2_CBD_H | |||
#define PQCLEAN_KYBER1024_AVX2_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t *buf); | |||
void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
#endif |
@@ -0,0 +1,30 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_CDECL_INC
#define PQCLEAN_KYBER1024_AVX2_CDECL_INC

/*
 * This file is included from both C sources and preprocessed assembly
 * (.S) files, so it must contain nothing but preprocessor directives and
 * block comments.
 *
 * The include guard is specific to the Kyber1024 AVX2 implementation so
 * that it cannot collide with the cdecl.inc of another scheme included
 * into the same translation unit.
 */

/*
 * Positions of the individual constant groups inside the constant table.
 * They are element indices into an array of 16-bit values; assembly code
 * multiplies them by 2 to obtain byte displacements.
 */
#define _16XQ            0
#define _16XQINV        16
#define _16XV           32
#define _16XFLO         48
#define _16XFHI         64
#define _16XMONTSQLO    80
#define _16XMONTSQHI    96
#define _16XMASK       112
#define _ZETAS_EXP     128
#define _ZETAS_INV_EXP 528

/*
 * The C ABI on macOS exports all symbols with a leading underscore.
 * Without compensation, functions defined in assembly cannot be found
 * from C, and symbols defined in C cannot be found from assembly.
 *
 * cdecl(s) yields the name under which the C symbol s is visible to the
 * assembler on the current platform.
 */
#if defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif

#endif
@@ -1,34 +1,155 @@ | |||
#include "consts.h" | |||
#include "params.h" | |||
const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 
36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; | |||
const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 
48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; | |||
#include "consts.h" | |||
#include <stdint.h> | |||
#define Q KYBER_Q | |||
#define MONT ((1U << 16) % KYBER_Q) | |||
#define MONT ((1U << 16) % Q) | |||
#define QINV 62209 // q^-1 mod 2^16 | |||
#define V ((1U << 26)/KYBER_Q + 1) | |||
#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) | |||
#define FLO (FHI * QINV % 65536) | |||
#define MONTSQHI (MONT * MONT % KYBER_Q) | |||
#define MONTSQLO (MONTSQHI * QINV % 65536) | |||
#define V (((1U << 26) + Q/2)/Q) | |||
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) | |||
#define FLO (FHI*QINV % 65536) | |||
#define MONTSQHI (MONT*MONT % Q) | |||
#define MONTSQLO (MONTSQHI*QINV % 65536) | |||
#define MASK 4095 | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; | |||
const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; | |||
#undef Q | |||
#undef QINV | |||
#undef MONT | |||
#undef V | |||
#undef FLO | |||
#undef FHI | |||
#undef MONTSQLO | |||
#undef MONTSQHI | |||
#undef MASK | |||
const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.as_arr = { | |||
#define _16XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, | |||
#define _16XQINV 16 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
#define _16XV 32 | |||
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, | |||
#define _16XFLO 48 | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
#define _16XFHI 64 | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
#define _16XMONTSQLO 80 | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
#define _16XMONTSQHI 96 | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
#define _16XMASK 112 | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
#define _ZETAS_EXP 128 | |||
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, | |||
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, | |||
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, | |||
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, | |||
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, | |||
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, | |||
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, | |||
3158, 3158, 3158, 3158, 622, 622, 622, 622, | |||
1577, 1577, 1577, 1577, 182, 182, 182, 182, | |||
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, | |||
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, | |||
573, 573, 2004, 2004, 264, 264, 383, 383, | |||
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, | |||
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, | |||
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, | |||
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, | |||
516, 3321, 3009, 2663, 1711, 2167, 126, 1469, | |||
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, | |||
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, | |||
2226, 555, 2078, 1550, 422, 177, 3038, 1574, | |||
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, | |||
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, | |||
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, | |||
430, 843, 871, 105, 587, 3094, 2869, 1653, | |||
778, 3182, 1483, 1119, 644, 349, 329, 3254, | |||
788, 788, 1812, 1812, 28191, 28191, 28191, 28191, | |||
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, | |||
48842, 48842, 48842, 48842, 287, 287, 287, 287, | |||
287, 287, 287, 287, 202, 202, 202, 202, | |||
202, 202, 202, 202, 10690, 10690, 10690, 10690, | |||
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, | |||
31164, 31164, 31164, 31164, 962, 962, 962, 962, | |||
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, | |||
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, | |||
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, | |||
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, | |||
732, 732, 608, 608, 1787, 1787, 411, 411, | |||
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, | |||
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, | |||
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, | |||
448, 2264, 677, 2054, 34353, 25435, 58154, 24392, | |||
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, | |||
31637, 28644, 23998, 48114, 817, 603, 1322, 1864, | |||
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, | |||
3221, 996, 958, 1522, 20297, 2146, 15356, 33152, | |||
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, | |||
41677, 45279, 7757, 23132, 1097, 610, 2044, 384, | |||
3193, 1994, 220, 1670, 1799, 794, 2475, 478, | |||
3021, 991, 1869, 1628, 0, 0, 0, 0, | |||
#define _ZETAS_INV_EXP 528 | |||
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, | |||
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, | |||
1701, 1460, 2338, 308, 2851, 854, 2535, 1530, | |||
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, | |||
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, | |||
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, | |||
1807, 2371, 2333, 108, 870, 1510, 1278, 1185, | |||
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, | |||
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, | |||
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, | |||
1275, 2652, 1065, 2881, 725, 1508, 2368, 398, | |||
951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, | |||
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, | |||
1571, 1571, 205, 205, 2918, 2918, 1542, 1542, | |||
2721, 2721, 2597, 2597, 2312, 2312, 681, 681, | |||
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, | |||
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, | |||
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, | |||
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, | |||
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, | |||
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, | |||
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, | |||
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, | |||
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, | |||
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, | |||
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, | |||
2210, 1846, 147, 2551, 1676, 460, 235, 2742, | |||
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, | |||
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, | |||
45043, 32227, 11478, 335, 156, 2911, 872, 1590, | |||
602, 777, 2170, 246, 1755, 291, 3152, 2907, | |||
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, | |||
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, | |||
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, | |||
666, 320, 8, 2813, 1544, 282, 1838, 1293, | |||
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, | |||
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, | |||
48173, 48173, 5828, 5828, 130, 130, 1602, 1602, | |||
1871, 1871, 829, 829, 2946, 2946, 3065, 3065, | |||
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, | |||
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, | |||
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, | |||
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, | |||
171, 171, 171, 171, 12403, 12403, 12403, 12403, | |||
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, | |||
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, | |||
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, | |||
1836, 1836, 1836, 1836, 50791, 50791, 359, 359, | |||
60300, 60300, 1932, 1932, 0, 0, 0, 0 | |||
} | |||
}; |
@@ -1,24 +1,20 @@ | |||
#ifndef CONSTS_H | |||
#define CONSTS_H | |||
#ifndef PQCLEAN_KYBER1024_AVX2_CONSTS_H | |||
#define PQCLEAN_KYBER1024_AVX2_CONSTS_H | |||
#include "cdecl.inc" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
typedef union { | |||
uint16_t as_arr[16]; | |||
__m256i as_vec; | |||
} aligned_uint16_t; | |||
#define ALIGNED_UINT16_T(N) \ | |||
union { \ | |||
__m256i as_vec; \ | |||
uint16_t as_arr[(N)]; \ | |||
} | |||
extern const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_exp[396]; | |||
extern const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[396]; | |||
typedef ALIGNED_UINT16_T(928) qdata_t; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xq; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xqinv; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xv; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xflo; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xfhi; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqlo; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqhi; | |||
extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmask; | |||
extern const qdata_t PQCLEAN_KYBER1024_AVX2_qdata; | |||
#endif |
@@ -1,148 +1,111 @@ | |||
#include "fips202.h" | |||
#include "fips202x4.h" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* Use implementation from the Keccak Code Package */ | |||
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds | |||
extern void KeccakF1600_StatePermute4x(__m256i *s); | |||
#define NROUNDS 24 | |||
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64-(offset)))) | |||
static uint64_t load64(const uint8_t *x) { | |||
unsigned long long r = 0, i; | |||
for (i = 0; i < 8; ++i) { | |||
r |= (unsigned long long)x[i] << 8 * i; | |||
} | |||
return r; | |||
} | |||
static void store64(uint8_t *x, uint64_t u) { | |||
size_t i; | |||
static inline void store64(uint8_t x[8], uint64_t u) { | |||
unsigned int i = 0; | |||
for (i = 0; i < 8; ++i) { | |||
x[i] = (uint8_t)u; | |||
u >>= 8; | |||
for (i = 0; i < 8; i++) { | |||
x[i] = u >> 8 * i; | |||
} | |||
} | |||
/* Use implementation from the Keccak Code Package */ | |||
extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); | |||
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds | |||
static void keccak_absorb4x(__m256i *s, | |||
static void keccakx4_absorb(__m256i s[25], | |||
unsigned int r, | |||
const uint8_t *m0, | |||
const uint8_t *m1, | |||
const uint8_t *m2, | |||
const uint8_t *m3, | |||
size_t mlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen, | |||
uint8_t p) { | |||
size_t i; | |||
uint8_t t0[200] = {0}; | |||
uint8_t t1[200] = {0}; | |||
uint8_t t2[200] = {0}; | |||
uint8_t t3[200] = {0}; | |||
size_t i = 0, pos = 0; | |||
__m256i t, idx; | |||
unsigned long long *ss = (unsigned long long *)s; | |||
for (i = 0; i < 25; ++i) { | |||
s[i] = _mm256_setzero_si256(); | |||
} | |||
while (mlen >= r) { | |||
idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); | |||
while (inlen >= r) { | |||
for (i = 0; i < r / 8; ++i) { | |||
ss[4 * i + 0] ^= load64(m0 + 8 * i); | |||
ss[4 * i + 1] ^= load64(m1 + 8 * i); | |||
ss[4 * i + 2] ^= load64(m2 + 8 * i); | |||
ss[4 * i + 3] ^= load64(m3 + 8 * i); | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
KeccakF1600_StatePermute4x(s); | |||
mlen -= r; | |||
m0 += r; | |||
m1 += r; | |||
m2 += r; | |||
m3 += r; | |||
inlen -= r; | |||
} | |||
for (i = 0; i < mlen; ++i) { | |||
t0[i] = m0[i]; | |||
t1[i] = m1[i]; | |||
t2[i] = m2[i]; | |||
t3[i] = m3[i]; | |||
i = 0; | |||
while (inlen >= 8) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
i++; | |||
pos += 8; | |||
inlen -= 8; | |||
} | |||
t0[i] = p; | |||
t1[i] = p; | |||
t2[i] = p; | |||
t3[i] = p; | |||
t0[r - 1] |= 128; | |||
t1[r - 1] |= 128; | |||
t2[r - 1] |= 128; | |||
t3[r - 1] |= 128; | |||
for (i = 0; i < r / 8; ++i) { | |||
ss[4 * i + 0] ^= load64(t0 + 8 * i); | |||
ss[4 * i + 1] ^= load64(t1 + 8 * i); | |||
ss[4 * i + 2] ^= load64(t2 + 8 * i); | |||
ss[4 * i + 3] ^= load64(t3 + 8 * i); | |||
if (inlen) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); | |||
t = _mm256_and_si256(t, idx); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
} | |||
t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
t = _mm256_set1_epi64x((long long)(1ULL << 63)); | |||
s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); | |||
} | |||
static void keccak_squeezeblocks4x(uint8_t *h0, | |||
uint8_t *h1, | |||
uint8_t *h2, | |||
uint8_t *h3, | |||
static void keccakx4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
__m256i *s, | |||
unsigned int r) { | |||
unsigned long long *ss = (unsigned long long *)s; | |||
unsigned int r, | |||
__m256i s[25]) { | |||
unsigned int i = 0; | |||
uint64_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; | |||
while (nblocks > 0) { | |||
KeccakF1600_StatePermute4x(s); | |||
for (size_t i = 0; i < (r >> 3); i++) { | |||
store64(h0 + 8 * i, ss[4 * i + 0]); | |||
store64(h1 + 8 * i, ss[4 * i + 1]); | |||
store64(h2 + 8 * i, ss[4 * i + 2]); | |||
store64(h3 + 8 * i, ss[4 * i + 3]); | |||
for (i = 0; i < r / 8; ++i) { | |||
f0 = _mm256_extract_epi64(s[i], 0); | |||
f1 = _mm256_extract_epi64(s[i], 1); | |||
f2 = _mm256_extract_epi64(s[i], 2); | |||
f3 = _mm256_extract_epi64(s[i], 3); | |||
store64(out0, f0); | |||
store64(out1, f1); | |||
store64(out2, f2); | |||
store64(out3, f3); | |||
out0 += 8; | |||
out1 += 8; | |||
out2 += 8; | |||
out3 += 8; | |||
} | |||
h0 += r; | |||
h1 += r; | |||
h2 += r; | |||
h3 += r; | |||
nblocks--; | |||
} | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, | |||
const uint8_t *seed, | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3) { | |||
uint8_t extseed[4][KYBER_SYMBYTES + 2]; | |||
for (size_t i = 0; i < KYBER_SYMBYTES; ++i) { | |||
extseed[0][i] = seed[i]; | |||
extseed[1][i] = seed[i]; | |||
extseed[2][i] = seed[i]; | |||
extseed[3][i] = seed[i]; | |||
} | |||
extseed[0][KYBER_SYMBYTES + 0] = (uint8_t)nonce0; | |||
extseed[0][KYBER_SYMBYTES + 1] = (uint8_t)(nonce0 >> 8); | |||
extseed[1][KYBER_SYMBYTES + 0] = (uint8_t)nonce1; | |||
extseed[1][KYBER_SYMBYTES + 1] = (uint8_t)(nonce1 >> 8); | |||
extseed[2][KYBER_SYMBYTES + 0] = (uint8_t)nonce2; | |||
extseed[2][KYBER_SYMBYTES + 1] = (uint8_t)(nonce2 >> 8); | |||
extseed[3][KYBER_SYMBYTES + 0] = (uint8_t)nonce3; | |||
extseed[3][KYBER_SYMBYTES + 1] = (uint8_t)(nonce3 >> 8); | |||
/* zero state */ | |||
for (size_t i = 0; i < 25; i++) { | |||
state->s[i] = _mm256_xor_si256(state->s[i], state->s[i]); | |||
--nblocks; | |||
} | |||
} | |||
/* absorb 4 message of identical length in parallel */ | |||
keccak_absorb4x(state->s, SHAKE128_RATE, extseed[0], extseed[1], extseed[2], extseed[3], KYBER_SYMBYTES + 2, 0x1F); | |||
/*
 * Absorb step of the four-way SHAKE128 instance.
 *
 * Hands the four inputs (each inlen bytes long) to keccakx4_absorb together
 * with the SHAKE128 rate and the padding/domain-separation byte 0x1F. The
 * resulting sponge state is left in state->s.
 */
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    const uint8_t pad_byte = 0x1F;

    keccakx4_absorb(state->s, SHAKE128_RATE,
                    in0, in1, in2, in3, inlen,
                    pad_byte);
}
void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
@@ -150,82 +113,78 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccak4x_state *state) { | |||
keccak_squeezeblocks4x(out0, out1, out2, out3, nblocks, state->s, SHAKE128_RATE); | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, | |||
state->s); | |||
} | |||
static void shake256x4(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, size_t outlen, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, size_t inlen) { | |||
__m256i s[25]; | |||
uint8_t t0[SHAKE256_RATE]; | |||
uint8_t t1[SHAKE256_RATE]; | |||
uint8_t t2[SHAKE256_RATE]; | |||
uint8_t t3[SHAKE256_RATE]; | |||
/* zero state */ | |||
for (size_t i = 0; i < 25; i++) { | |||
s[i] = _mm256_xor_si256(s[i], s[i]); | |||
} | |||
/* absorb 4 message of identical length in parallel */ | |||
keccak_absorb4x(s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
/* Squeeze output */ | |||
keccak_squeezeblocks4x(out0, out1, out2, out3, outlen / SHAKE256_RATE, s, SHAKE256_RATE); | |||
/*
 * Absorb step of the four-way SHAKE256 instance.
 *
 * Hands the four inputs (each inlen bytes long) to keccakx4_absorb together
 * with the SHAKE256 rate and the padding/domain-separation byte 0x1F. The
 * resulting sponge state is left in state->s.
 */
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state,
        const uint8_t *in0,
        const uint8_t *in1,
        const uint8_t *in2,
        const uint8_t *in3,
        size_t inlen) {
    const uint8_t pad_byte = 0x1F;

    keccakx4_absorb(state->s, SHAKE256_RATE,
                    in0, in1, in2, in3, inlen,
                    pad_byte);
}
out0 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; | |||
out1 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; | |||
out2 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; | |||
out3 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, | |||
state->s); | |||
} | |||
if (outlen % SHAKE256_RATE) { | |||
keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE256_RATE); | |||
for (size_t i = 0; i < outlen % SHAKE256_RATE; i++) { | |||
out0[i] = t0[i]; | |||
out1[i] = t1[i]; | |||
out2[i] = t2[i]; | |||
out3[i] = t3[i]; | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { | |||
unsigned int i = 0; | |||
size_t nblocks = outlen / SHAKE128_RATE; | |||
uint8_t t[4][SHAKE128_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE128_RATE; | |||
out1 += nblocks * SHAKE128_RATE; | |||
out2 += nblocks * SHAKE128_RATE; | |||
out3 += nblocks * SHAKE128_RATE; | |||
outlen -= nblocks * SHAKE128_RATE; | |||
if (outlen) { | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_prf(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *key, | |||
uint8_t nonce0, | |||
uint8_t nonce1, | |||
uint8_t nonce2, | |||
uint8_t nonce3) { | |||
uint8_t extseed[4][KYBER_SYMBYTES + 1]; | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
extseed[0][i] = key[i]; | |||
extseed[1][i] = key[i]; | |||
extseed[2][i] = key[i]; | |||
extseed[3][i] = key[i]; | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { | |||
unsigned int i = 0; | |||
size_t nblocks = outlen / SHAKE256_RATE; | |||
uint8_t t[4][SHAKE256_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE256_RATE; | |||
out1 += nblocks * SHAKE256_RATE; | |||
out2 += nblocks * SHAKE256_RATE; | |||
out3 += nblocks * SHAKE256_RATE; | |||
outlen -= nblocks * SHAKE256_RATE; | |||
if (outlen) { | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); | |||
for (i = 0; i < outlen; ++i) { | |||
out0[i] = t[0][i]; | |||
out1[i] = t[1][i]; | |||
out2[i] = t[2][i]; | |||
out3[i] = t[3][i]; | |||
} | |||
} | |||
extseed[0][KYBER_SYMBYTES] = nonce0; | |||
extseed[1][KYBER_SYMBYTES] = nonce1; | |||
extseed[2][KYBER_SYMBYTES] = nonce2; | |||
extseed[3][KYBER_SYMBYTES] = nonce3; | |||
shake256x4(out0, | |||
out1, | |||
out2, | |||
out3, | |||
outlen, | |||
extseed[0], | |||
extseed[1], | |||
extseed[2], | |||
extseed[3], | |||
KYBER_SYMBYTES + 1); | |||
} |
@@ -7,31 +7,19 @@ | |||
typedef struct { | |||
__m256i s[25]; | |||
} keccak4x_state; | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, | |||
const uint8_t *seed, | |||
uint16_t nonce0, | |||
uint16_t nonce1, | |||
uint16_t nonce2, | |||
uint16_t nonce3); | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccak4x_state *state); | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_prf(uint8_t *out0, | |||
uint8_t *out1, | |||
uint8_t *out2, | |||
uint8_t *out3, | |||
size_t outlen, | |||
const uint8_t *key, | |||
uint8_t nonce0, | |||
uint8_t nonce1, | |||
uint8_t nonce2, | |||
uint8_t nonce3); | |||
} keccakx4_state; | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state); | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, | |||
keccakx4_state *state); | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); | |||
#endif |
@@ -1,11 +1,8 @@ | |||
#include "cdecl.inc" | |||
.include "fq.inc" | |||
.global PQCLEAN_KYBER512_AVX2_reduce_avx | |||
PQCLEAN_KYBER512_AVX2_reduce_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1 | |||
.text | |||
reduce128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm2 | |||
vmovdqa 32(%rdi),%ymm3 | |||
@@ -16,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 | |||
vmovdqa 192(%rdi),%ymm8 | |||
vmovdqa 224(%rdi),%ymm9 | |||
red16 2 10 | |||
red16 3 11 | |||
red16 4 12 | |||
red16 5 13 | |||
red16 6 14 | |||
red16 7 15 | |||
red16 8 10 | |||
red16 9 11 | |||
red16 2,10 | |||
red16 3,11 | |||
red16 4,12 | |||
red16 5,13 | |||
red16 6,14 | |||
red16 7,15 | |||
red16 8,10 | |||
red16 9,11 | |||
#store | |||
vmovdqa %ymm2,(%rdi) | |||
@@ -37,11 +34,17 @@ vmovdqa %ymm9,224(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER512_AVX2_csubq_avx | |||
PQCLEAN_KYBER512_AVX2_csubq_avx: | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_reduce_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_reduce_avx): | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
vmovdqa _16XV*2(%rsi),%ymm1 | |||
call reduce128_avx | |||
add $256,%rdi | |||
call reduce128_avx | |||
ret | |||
csubq128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm1 | |||
vmovdqa 32(%rdi),%ymm2 | |||
@@ -52,14 +55,14 @@ vmovdqa 160(%rdi),%ymm6 | |||
vmovdqa 192(%rdi),%ymm7 | |||
vmovdqa 224(%rdi),%ymm8 | |||
csubq 1 9 | |||
csubq 2 10 | |||
csubq 3 11 | |||
csubq 4 12 | |||
csubq 5 13 | |||
csubq 6 14 | |||
csubq 7 15 | |||
csubq 8 9 | |||
csubq 1,9 | |||
csubq 2,10 | |||
csubq 3,11 | |||
csubq 4,12 | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,9 | |||
#store | |||
vmovdqa %ymm1,(%rdi) | |||
@@ -73,13 +76,16 @@ vmovdqa %ymm8,224(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER512_AVX2_frommont_avx | |||
PQCLEAN_KYBER512_AVX2_frommont_avx: | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx): | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xmontsqlo(%rip),%ymm1 | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xmontsqhi(%rip),%ymm2 | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
call csubq128_avx | |||
add $256,%rdi | |||
call csubq128_avx | |||
ret | |||
tomont128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm3 | |||
vmovdqa 32(%rdi),%ymm4 | |||
@@ -90,14 +96,14 @@ vmovdqa 160(%rdi),%ymm8 | |||
vmovdqa 192(%rdi),%ymm9 | |||
vmovdqa 224(%rdi),%ymm10 | |||
fqmulprecomp 1,2,3 11 | |||
fqmulprecomp 1,2,4 12 | |||
fqmulprecomp 1,2,5 13 | |||
fqmulprecomp 1,2,6 14 | |||
fqmulprecomp 1,2,7 15 | |||
fqmulprecomp 1,2,8 11 | |||
fqmulprecomp 1,2,9 12 | |||
fqmulprecomp 1,2,10 13 | |||
fqmulprecomp 1,2,3,11 | |||
fqmulprecomp 1,2,4,12 | |||
fqmulprecomp 1,2,5,13 | |||
fqmulprecomp 1,2,6,14 | |||
fqmulprecomp 1,2,7,15 | |||
fqmulprecomp 1,2,8,11 | |||
fqmulprecomp 1,2,9,12 | |||
fqmulprecomp 1,2,10,13 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
@@ -110,3 +116,14 @@ vmovdqa %ymm9,192(%rdi) | |||
vmovdqa %ymm10,224(%rdi) | |||
ret | |||
# Exported routine: applies tomont128_avx to two consecutive halves of the
# buffer at %rdi, in place. Each tomont128_avx call loads and stores eight
# 32-byte vectors, at offsets (%rdi) through 224(%rdi).
#   %rdi - data buffer; left advanced by 256 on return
#   %rsi - base of the constants table addressed through the _16X* offsets
# NOTE(review): the *2 scaling suggests the _16X* offsets count 16-bit
# entries rather than bytes - confirm against the header that defines them.
.global cdecl(PQCLEAN_KYBER1024_AVX2_tomont_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_tomont_avx): | |||
#consts | |||
# Register ymm0 is read directly inside fqmulprecomp; ymm1 and ymm2 are the
# al/ah operands that tomont128_avx passes to fqmulprecomp.
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 | |||
vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 | |||
call tomont128_avx | |||
# Second half: step past the eight vectors just processed.
add $256,%rdi | |||
call tomont128_avx | |||
ret |
@@ -1,24 +1,27 @@ | |||
.macro red16 r x=12 | |||
.macro red16 r,x=12 | |||
vpmulhw %ymm1,%ymm\r,%ymm\x | |||
vpsraw $10,%ymm\x,%ymm\x | |||
vpmullw %ymm0,%ymm\x,%ymm\x | |||
vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro csubq r x=12 | |||
.macro csubq r,x=12 | |||
vpsubw %ymm0,%ymm\r,%ymm\r | |||
vpsraw $15,%ymm\r,%ymm\x | |||
vpand %ymm0,%ymm\x,%ymm\x | |||
vpaddw %ymm\x,%ymm\r,%ymm\r | |||
#vpcmpgtw %ymm0,%ymm\r,%ymm\x | |||
#vpand %ymm0,%ymm\x,%ymm\x | |||
#vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro caddq r x=12 | |||
.macro caddq r,x=12 | |||
vpsraw $15,%ymm\r,%ymm\x | |||
vpand %ymm0,%ymm\x,%ymm\x | |||
vpaddw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro fqmulprecomp al,ah,b x=12 | |||
.macro fqmulprecomp al,ah,b,x=12 | |||
vpmullw %ymm\al,%ymm\b,%ymm\x | |||
vpmulhw %ymm\ah,%ymm\b,%ymm\b | |||
vpmulhw %ymm0,%ymm\x,%ymm\x | |||
@@ -1,26 +1,33 @@ | |||
#include "align.h" | |||
#include "cbd.h" | |||
#include "indcpa.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: pack_pk | |||
* | |||
* Description: Serialize the public key as concatenation of the | |||
* compressed and serialized vector of polynomials pk | |||
* serialized vector of polynomials pk | |||
* and the public seed used to generate the matrix A. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* const poly *pk: pointer to the input public-key polynomial | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
polyvec *pk, | |||
const uint8_t seed[KYBER_SYMBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(r, pk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
r[i + KYBER_POLYVECBYTES] = seed[i]; | |||
} | |||
} | |||
@@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
/************************************************* | |||
* Name: unpack_pk | |||
* | |||
* Description: De-serialize and decompress public key from a byte array; | |||
* Description: De-serialize public key from a byte array; | |||
* approximate inverse of pack_pk | |||
* | |||
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* - const uint8_t *packedpk: pointer to input serialized public key | |||
**************************************************/ | |||
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
static void unpack_pk(polyvec *pk, | |||
uint8_t seed[KYBER_SYMBYTES], | |||
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(pk, packedpk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
seed[i] = packedpk[i + KYBER_POLYVECBYTES]; | |||
} | |||
} | |||
@@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
* Description: Serialize the secret key | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - const polyvec *sk: pointer to input vector of polynomials (secret key) | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t *r, polyvec *sk) { | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(r, sk); | |||
} | |||
@@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials | |||
* (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
* compressed and serialized vector of polynomials b | |||
* and the compressed and serialized polynomial v | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* const poly *pk: pointer to the input vector of polynomials b | |||
* const uint8_t *seed: pointer to the input polynomial v | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* polyvec *b: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_compress(r, b); | |||
PQCLEAN_KYBER1024_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER1024_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
/************************************************* | |||
* Name: rej_uniform | |||
* | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
**************************************************/ | |||
static unsigned int rej_uniform(int16_t *r, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr = 0, pos = 0; | |||
uint16_t val = 0; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
if (val < 19 * KYBER_Q) { | |||
@@ -116,57 +150,76 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t | |||
return ctr; | |||
} | |||
#define gen_a(A,B) gen_matrix(A,B,0) | |||
#define gen_at(A,B) gen_matrix(A,B,1) | |||
#define gen_a(A,B) PQCLEAN_KYBER1024_AVX2_gen_matrix(A,B,0) | |||
#define gen_at(A,B) PQCLEAN_KYBER1024_AVX2_gen_matrix(A,B,1) | |||
/************************************************* | |||
* Name: gen_matrix | |||
* Name: PQCLEAN_KYBER1024_AVX2_gen_matrix | |||
* | |||
* Description: Deterministically generate matrix A (or the transpose of A) | |||
* from a seed. Entries of the matrix are polynomials that look | |||
* uniformly random. Performs rejection sampling on output of | |||
* a XOF | |||
* | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
**************************************************/ | |||
#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ | |||
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
uint16_t i; | |||
size_t ctr0, ctr1, ctr2, ctr3, bufbytes; | |||
union { | |||
uint8_t x[4][XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; | |||
__m256i _dummy; | |||
} buf; | |||
keccak4x_state state; | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { | |||
unsigned int i = 0, ctr0 = 0, ctr1 = 0, ctr2 = 0, ctr3 = 0; | |||
ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; | |||
__m256i f; | |||
keccakx4_state state; | |||
for (i = 0; i < 4; i++) { | |||
f = _mm256_load_si256((__m256i *)seed); | |||
_mm256_store_si256((__m256i *)buf.arr[0], f); | |||
_mm256_store_si256((__m256i *)buf.arr[1], f); | |||
_mm256_store_si256((__m256i *)buf.arr[2], f); | |||
_mm256_store_si256((__m256i *)buf.arr[3], f); | |||
if (transposed) { | |||
PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb( | |||
&state, seed, i + 0, i + 256, i + 512, i + 768); | |||
buf.arr[0][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[0][KYBER_SYMBYTES + 1] = 0; | |||
buf.arr[1][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[1][KYBER_SYMBYTES + 1] = 1; | |||
buf.arr[2][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[2][KYBER_SYMBYTES + 1] = 2; | |||
buf.arr[3][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[3][KYBER_SYMBYTES + 1] = 3; | |||
} else { | |||
PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb( | |||
&state, seed, 256 * i + 0, 256 * i + 1, 256 * i + 2, 256 * i + 3); | |||
buf.arr[0][KYBER_SYMBYTES + 0] = 0; | |||
buf.arr[0][KYBER_SYMBYTES + 1] = i; | |||
buf.arr[1][KYBER_SYMBYTES + 0] = 1; | |||
buf.arr[1][KYBER_SYMBYTES + 1] = i; | |||
buf.arr[2][KYBER_SYMBYTES + 0] = 2; | |||
buf.arr[2][KYBER_SYMBYTES + 1] = i; | |||
buf.arr[3][KYBER_SYMBYTES + 0] = 3; | |||
buf.arr[3][KYBER_SYMBYTES + 1] = i; | |||
} | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks( | |||
buf.x[0], buf.x[1], buf.x[2], buf.x[3], GEN_MATRIX_MAXNBLOCKS, &state); | |||
bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES; | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], | |||
GEN_MATRIX_NBLOCKS, &state); | |||
ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[0].coeffs, KYBER_N, buf.x[0], bufbytes); | |||
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[1].coeffs, KYBER_N, buf.x[1], bufbytes); | |||
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[2].coeffs, KYBER_N, buf.x[2], bufbytes); | |||
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[3].coeffs, KYBER_N, buf.x[3], bufbytes); | |||
ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf.arr[0]); | |||
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf.arr[1]); | |||
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf.arr[2]); | |||
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf.arr[3]); | |||
while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], 1, &state); | |||
bufbytes = XOF_BLOCKBYTES; | |||
ctr0 += rej_uniform_ref(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes); | |||
ctr1 += rej_uniform_ref(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.x[1], bufbytes); | |||
ctr2 += rej_uniform_ref(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.x[2], bufbytes); | |||
ctr3 += rej_uniform_ref(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.x[3], bufbytes); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); | |||
ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], | |||
XOF_BLOCKBYTES); | |||
ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], | |||
XOF_BLOCKBYTES); | |||
ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], | |||
XOF_BLOCKBYTES); | |||
ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], | |||
XOF_BLOCKBYTES); | |||
} | |||
PQCLEAN_KYBER1024_AVX2_poly_nttunpack(&a[i].vec[0]); | |||
@@ -177,36 +230,41 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_keypair | |||
* Name: PQCLEAN_KYBER1024_AVX2_indcpa_keypair | |||
* | |||
* Description: Generates public and private key for the CPA-secure | |||
* public-key encryption scheme underlying Kyber | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
* Arguments: - uint8_t *pk: pointer to output public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[KYBER_K], skpv, e, pkpv; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
const uint8_t *publicseed = buf; | |||
const uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
uint8_t nonce = 0; | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
unsigned int i = 0; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
const uint8_t *publicseed = buf.arr; | |||
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_g(buf, buf, KYBER_SYMBYTES); | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
gen_a(a, publicseed); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, nonce + 0, nonce + 1, nonce + 2, nonce + 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, nonce + 4, nonce + 5, nonce + 6, nonce + 7); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, | |||
0, 1, 2, 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, | |||
4, 5, 6, 7); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&skpv); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&e); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); | |||
PQCLEAN_KYBER1024_AVX2_poly_frommont(pkpv.vec + i); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_AVX2_poly_tomont(&pkpv.vec[i]); | |||
} | |||
PQCLEAN_KYBER1024_AVX2_polyvec_add(&pkpv, &pkpv, &e); | |||
@@ -217,45 +275,52 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_enc | |||
* Name: PQCLEAN_KYBER1024_AVX2_indcpa_enc | |||
* | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) | |||
* to deterministically generate all randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins) { | |||
polyvec at[KYBER_K], pkpv, sp, ep, bp; | |||
poly k, v, epp; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
uint8_t nonce = 0; | |||
unpack_pk(&pkpv, seed, pk); | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
const uint8_t coins[KYBER_SYMBYTES]) { | |||
unsigned int i = 0; | |||
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed.arr, pk); | |||
PQCLEAN_KYBER1024_AVX2_poly_frommsg(&k, m); | |||
gen_at(at, seed); | |||
gen_at(at, seed.arr); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, nonce + 0, nonce + 1, nonce + 2, nonce + 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, nonce + 4, nonce + 5, nonce + 6, nonce + 7); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, nonce + 8); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, | |||
0, 1, 2, 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, | |||
4, 5, 6, 7); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, 8); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_invntt(&bp); | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt(&v); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &epp); | |||
@@ -267,18 +332,21 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t *c, | |||
} | |||
/************************************************* | |||
* Name: indcpa_dec | |||
* Name: PQCLEAN_KYBER1024_AVX2_indcpa_dec | |||
* | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk) { | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
poly v, mp; | |||
@@ -286,8 +354,8 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t *m, | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt(&mp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER1024_AVX2_poly_sub(&mp, &v, &mp); | |||
PQCLEAN_KYBER1024_AVX2_poly_reduce(&mp); | |||
@@ -1,21 +1,16 @@ | |||
#ifndef INDCPA_H | |||
#define INDCPA_H | |||
#ifndef PQCLEAN_KYBER1024_AVX2_INDCPA_H | |||
#define PQCLEAN_KYBER1024_AVX2_INDCPA_H | |||
#include "params.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair( | |||
uint8_t *pk, | |||
uint8_t *sk); | |||
void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_enc( | |||
uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins); | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_dec( | |||
uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk); | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); | |||
#endif |
@@ -1,7 +1,8 @@ | |||
#include "cdecl.inc" | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 | |||
#update & mul | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm12 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm13 | |||
@@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2 | |||
vpsubw %ymm\rh3,%ymm15,%ymm\rh3 | |||
.endm | |||
.global PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx | |||
.p2align 5 | |||
PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 | |||
.text | |||
invntt_levels0t5_avx: | |||
level0: | |||
#zetas | |||
vmovdqu (%rsi),%ymm15 | |||
@@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly 4,5,8,9,6,7,10,11 15,3,1,2 | |||
butterfly 4,5,8,9,6,7,10,11,15,3,1,2 | |||
level1: | |||
#zetas | |||
vmovdqu 128(%rsi),%ymm3 | |||
vmovdqu 160(%rsi),%ymm2 | |||
butterfly 4,5,6,7,8,9,10,11 3,3,2,2 | |||
butterfly 4,5,6,7,8,9,10,11,3,3,2,2 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
@@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10 | |||
vmovdqu 224(%rsi),%ymm2 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1 | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
butterfly 3,4,6,8,5,7,9,11 10,10,2,2 | |||
butterfly 3,4,6,8,5,7,9,11,10,10,2,2 | |||
red16 3 | |||
@@ -95,7 +92,7 @@ level3: | |||
vmovdqu 256(%rsi),%ymm9 | |||
vmovdqu 288(%rsi),%ymm2 | |||
butterfly 10,3,6,5,4,8,7,11 9,9,2,2 | |||
butterfly 10,3,6,5,4,8,7,11,9,9,2,2 | |||
red16 10 | |||
@@ -109,7 +106,7 @@ level4: | |||
vmovdqu 320(%rsi),%ymm7 | |||
vmovdqu 352(%rsi),%ymm2 | |||
butterfly 9,10,6,4,3,5,8,11 7,7,2,2 | |||
butterfly 9,10,6,4,3,5,8,11,7,7,2,2 | |||
red16 9 | |||
@@ -123,7 +120,7 @@ level5: | |||
vpbroadcastd 384(%rsi),%ymm8 | |||
vpbroadcastd 388(%rsi),%ymm2 | |||
butterfly 7,9,6,3,10,4,5,11 8,8,2,2 | |||
butterfly 7,9,6,3,10,4,5,11,8,8,2,2 | |||
red16 7 | |||
@@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER512_AVX2_invntt_level6_avx | |||
PQCLEAN_KYBER512_AVX2_invntt_level6_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 | |||
invntt_level6_avx: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm1 | |||
vpbroadcastd 4(%rsi),%ymm2 | |||
@@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xflo(%rip),%ymm12 | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xfhi(%rip),%ymm13 | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,256(%rdi) | |||
@@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
fqmulprecomp 12,13,4 8 | |||
fqmulprecomp 12,13,5 9 | |||
fqmulprecomp 12,13,6 10 | |||
fqmulprecomp 12,13,7 11 | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
@@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xflo(%rip),%ymm12 | |||
vmovdqa PQCLEAN_KYBER512_AVX2_16xfhi(%rip),%ymm13 | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,384(%rdi) | |||
@@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi) | |||
vmovdqa %ymm10,448(%rdi) | |||
vmovdqa %ymm11,480(%rdi) | |||
fqmulprecomp 12,13,4 8 | |||
fqmulprecomp 12,13,5 9 | |||
fqmulprecomp 12,13,6 10 | |||
fqmulprecomp 12,13,7 11 | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,128(%rdi) | |||
@@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi) | |||
vmovdqa %ymm7,224(%rdi) | |||
ret | |||
# Exported routine: runs invntt_levels0t5_avx on each half of the buffer at
# %rdi, then invntt_level6_avx from the start of the buffer, all in place.
#   %rdi - coefficient buffer (level 6 addresses offsets up to 480(%rdi))
#   %rsi - base of the constants table on entry; reused as the zeta cursor
#   %rdx - set here to the table base for the helpers
.global cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
# Keep the table base in %rdx: the helpers read _16XV, _16XFLO and _16XFHI
# relative to %rdx, while %rsi moves on to the zetas.
mov %rsi,%rdx | |||
# Point %rsi at the zeta block that the helpers index from offset 0.
add $_ZETAS_INV_EXP*2,%rsi | |||
call invntt_levels0t5_avx | |||
# Second half of the buffer, with the next group of zetas. The highest zeta
# read visible in invntt_levels0t5_avx is the dword at 388(%rsi), which
# matches the stride used here.
add $256,%rdi | |||
add $392,%rsi | |||
call invntt_levels0t5_avx | |||
# Rewind to the buffer start and advance past the second zeta group before
# the final level.
sub $256,%rdi | |||
add $392,%rsi | |||
call invntt_level6_avx | |||
ret |
@@ -1,103 +1,127 @@ | |||
#include "api.h" | |||
#include "align.h" | |||
#include "indcpa.h" | |||
#include "kem.h" | |||
#include "params.h" | |||
#include "randombytes.h" | |||
#include "symmetric.h" | |||
#include "verify.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
/************************************************* | |||
* Name: crypto_kem_keypair | |||
* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair | |||
* | |||
* Description: Generates public and private key | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
size_t i; | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER1024_AVX2_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; | |||
} | |||
hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ | |||
/* Value z for pseudo-random output on reject */ | |||
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: crypto_kem_enc | |||
* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_enc | |||
* | |||
* Description: Generates cipher text and shared | |||
* secret for given public key | |||
* | |||
* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { | |||
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
/* Don't release system RNG output */ | |||
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
/************************************************* | |||
* Name: crypto_kem_dec | |||
* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_dec | |||
* | |||
* Description: Generates shared secret for given | |||
* cipher text and private key | |||
* | |||
* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { | |||
size_t i; | |||
uint8_t fail; | |||
union { | |||
uint8_t x[KYBER_CIPHERTEXTBYTES]; | |||
__m256i __dummy; | |||
} _cmp; | |||
uint8_t *cmp = _cmp.x; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
size_t i = 0; | |||
int fail = 0; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
uint8_t cmp[KYBER_CIPHERTEXTBYTES]; | |||
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; | |||
PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf, ct, sk); | |||
PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf.arr, ct, sk); | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ | |||
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
} | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
PQCLEAN_KYBER1024_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ | |||
/* Overwrite pre-k with z on re-encryption failure */ | |||
PQCLEAN_KYBER1024_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} |
@@ -0,0 +1,19 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_KEM_H | |||
#define PQCLEAN_KYBER1024_AVX2_KEM_H | |||
#include "params.h" | |||
/*
 * Generates a key pair for the CCA-secure Kyber KEM.
 * pk: output public key (caller-allocated, CRYPTO_PUBLICKEYBYTES bytes)
 * sk: output private key (caller-allocated, CRYPTO_SECRETKEYBYTES bytes)
 * Returns 0.
 */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
/*
 * Encapsulation: produces a ciphertext and the matching shared secret
 * for the given public key.
 * ct: output ciphertext (caller-allocated, CRYPTO_CIPHERTEXTBYTES bytes)
 * ss: output shared secret (caller-allocated, CRYPTO_BYTES bytes)
 * pk: input public key (CRYPTO_PUBLICKEYBYTES bytes)
 * Returns 0.
 */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk); | |||
/*
 * Decapsulation: recovers the shared secret from a ciphertext and a
 * private key.
 * ss: output shared secret (caller-allocated, CRYPTO_BYTES bytes)
 * ct: input ciphertext (CRYPTO_CIPHERTEXTBYTES bytes)
 * sk: input private key (CRYPTO_SECRETKEYBYTES bytes)
 * Always returns 0; when the re-encryption check fails, ss is filled
 * with a pseudo-random value instead of signalling an error.
 */
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk); | |||
#endif |
@@ -1,7 +1,8 @@ | |||
#include "cdecl.inc" | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
@@ -36,7 +37,7 @@ vpaddw %ymm15,%ymm\rl3,%ymm\rl3 | |||
# We break the dependency chains with the cost of slightly more additions. | |||
# But they can be run in parallel to the multiplications on execution port 5 | |||
# (multiplications only go to ports 0 and 1) | |||
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 | |||
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x | |||
@@ -73,11 +74,8 @@ vpaddw %ymm15,%ymm\rh3,%ymm\rh3 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
.global PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx | |||
PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 | |||
.text | |||
ntt_level0_avx: | |||
level0: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
@@ -107,11 +105,7 @@ vmovdqa %ymm11,352(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx | |||
PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 | |||
ntt_levels1t6_avx: | |||
level1: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
@@ -127,7 +121,7 @@ vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly2 4,5,6,7,8,9,10,11 3 | |||
butterfly2 4,5,6,7,8,9,10,11,3 | |||
level2: | |||
#zetas | |||
@@ -139,7 +133,7 @@ shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly2 3,8,4,9,5,10,6,11 7 | |||
butterfly2 3,8,4,9,5,10,6,11,7 | |||
level3: | |||
#zetas | |||
@@ -151,7 +145,7 @@ shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly2 7,5,3,10,8,6,4,11 9 | |||
butterfly2 7,5,3,10,8,6,4,11,9 | |||
level4: | |||
#zetas | |||
@@ -163,7 +157,7 @@ shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
butterfly2 9,8,7,6,5,4,3,11 10 | |||
butterfly2 9,8,7,6,5,4,3,11,10 | |||
level5: | |||
#zetas | |||
@@ -175,7 +169,7 @@ shuffle1 8,4,9,4 | |||
shuffle1 7,3,8,3 | |||
shuffle1 6,11,7,11 | |||
butterfly2 10,5,9,4,8,3,7,11 6 | |||
butterfly2 10,5,9,4,8,3,7,11,6 | |||
level6: | |||
#zetas | |||
@@ -184,17 +178,17 @@ vmovdqu 328(%rsi),%ymm15 | |||
vmovdqu 296(%rsi),%ymm1 | |||
vmovdqu 360(%rsi),%ymm2 | |||
butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2 | |||
butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 | |||
vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1 | |||
red16 10 12 | |||
red16 5 13 | |||
red16 9 14 | |||
red16 4 15 | |||
red16 8 2 | |||
red16 3 6 | |||
red16 7 12 | |||
red16 11 13 | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
red16 10,12 | |||
red16 5,13 | |||
red16 9,14 | |||
red16 4,15 | |||
red16 8,2 | |||
red16 3,6 | |||
red16 7,12 | |||
red16 11,13 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
@@ -207,3 +201,20 @@ vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
# Entry point for the complete forward NTT of one polynomial.
# C prototype (ntt.h): void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *qdata)
# The code reads r from %rdi and the constant table from %rsi
# (presumably the System V AMD64 argument registers - confirm for other ABIs).
# Register contract set up here for the local helper routines:
#   %ymm0 = vector loaded from offset _16XQ*2 of the constant table
#   %rdx  = base of the constant table (helpers index it, e.g. _16XV*2(%rdx))
#   %rsi  = running pointer into the forward zeta table
.global cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
# Keep the table base in %rdx before %rsi is advanced to the zetas.
mov %rsi,%rdx | |||
# Table offsets are counted in 16-bit entries, hence the *2 to get bytes.
add $_ZETAS_EXP*2,%rsi | |||
# Level 0 runs twice with the same zeta pointer: once at r and once
# 128 bytes (64 int16_t coefficients) further on.
call ntt_level0_avx | |||
add $128,%rdi | |||
call ntt_level0_avx | |||
# Levels 1..6 on the first half: rewind r and skip the 8 bytes of level-0 zetas.
sub $128,%rdi | |||
add $8,%rsi | |||
call ntt_levels1t6_avx | |||
# Levels 1..6 on the second half: 256 bytes = 128 coefficients further on,
# with the zeta pointer moved 392 bytes ahead.
add $256,%rdi | |||
add $392,%rsi | |||
call ntt_levels1t6_avx | |||
# %rdi is left pointing at the second half on return.
ret |
@@ -2,19 +2,27 @@ | |||
#define NTT_H | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); | |||
void PQCLEAN_KYBER1024_AVX2_nttpack_avx(int16_t *r); | |||
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); | |||
void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); | |||
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); | |||
void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
#endif |
@@ -1,8 +1,5 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
/* Don't change parameters below this line */ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_PARAMS_H | |||
#define PQCLEAN_KYBER1024_AVX2_PARAMS_H | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
@@ -12,9 +9,8 @@ | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
#define KYBER_POLYBYTES 384 | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_POLYBYTES 384 | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
@@ -23,10 +19,14 @@ | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ | |||
+ KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ | |||
/* 32 bytes of additional space to save H(pk) */ | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ | |||
+ KYBER_INDCPA_PUBLICKEYBYTES \ | |||
+ 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES | |||
#endif |
@@ -1,132 +1,242 @@ | |||
#include "align.h" | |||
#include "cbd.h" | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "reduce.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: poly_compress | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_compress | |||
* | |||
* Description: Compression and subsequent serialization of a polynomial | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const poly *a: pointer to input polynomial | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a) { | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { | |||
unsigned int i = 0, j = 0; | |||
uint8_t t[8]; | |||
size_t i, j, k = 0; | |||
PQCLEAN_KYBER1024_AVX2_poly_csubq(a); | |||
for (i = 0; i < KYBER_N; i += 8) { | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = (uint8_t)(((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31); | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
r[k] = (uint8_t)( t[0] | (t[1] << 5)); | |||
r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); | |||
r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); | |||
r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); | |||
r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3)); | |||
k += 5; | |||
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); | |||
r[2] = (t[3] >> 1) | (t[4] << 4); | |||
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); | |||
r[4] = (t[6] >> 2) | (t[7] << 3); | |||
r += 5; | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_decompress | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_decompress | |||
* | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of poly_compress | |||
* approximate inverse of PQCLEAN_KYBER1024_AVX2_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a) { | |||
size_t i; | |||
for (i = 0; i < KYBER_N; i += 8) { | |||
r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); | |||
r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5); | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, | |||
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { | |||
unsigned int i = 0; | |||
unsigned int j = 0; | |||
uint8_t t[8]; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t[0] = (a[0] >> 0); | |||
t[1] = (a[0] >> 5) | (a[1] << 3); | |||
t[2] = (a[1] >> 2); | |||
t[3] = (a[1] >> 7) | (a[2] << 1); | |||
t[4] = (a[2] >> 4) | (a[3] << 4); | |||
t[5] = (a[3] >> 1); | |||
t[6] = (a[3] >> 6) | (a[4] << 2); | |||
t[7] = (a[4] >> 3); | |||
a += 5; | |||
for (j = 0; j < 8; j++) { | |||
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_tobytes | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_tobytes | |||
* | |||
* Description: Serialization of a polynomial | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const poly *a: pointer to input polynomial | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a) { | |||
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs); | |||
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: poly_frombytes | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_frombytes | |||
* | |||
* Description: De-serialization of a polynomial; | |||
* inverse of poly_tobytes | |||
* inverse of PQCLEAN_KYBER1024_AVX2_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { | |||
/* Single call covering the whole polynomial; the AVX2 routine also receives
 * the shared constant table PQCLEAN_KYBER1024_AVX2_qdata. */
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_frommsg | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a) { | |||
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a); | |||
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, | |||
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3; | |||
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); | |||
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); | |||
#define FROMMSG64(i) \ | |||
g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ | |||
g3 = _mm256_sllv_epi32(g3,shift); \ | |||
g3 = _mm256_shuffle_epi8(g3,idx); \ | |||
g0 = _mm256_slli_epi16(g3,12); \ | |||
g1 = _mm256_slli_epi16(g3,8); \ | |||
g2 = _mm256_slli_epi16(g3,4); \ | |||
g0 = _mm256_srai_epi16(g0,15); \ | |||
g1 = _mm256_srai_epi16(g1,15); \ | |||
g2 = _mm256_srai_epi16(g2,15); \ | |||
g3 = _mm256_srai_epi16(g3,15); \ | |||
g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ | |||
g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ | |||
g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ | |||
g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ | |||
h0 = _mm256_unpacklo_epi64(g0,g1); \ | |||
h2 = _mm256_unpackhi_epi64(g0,g1); \ | |||
h1 = _mm256_unpacklo_epi64(g2,g3); \ | |||
h3 = _mm256_unpackhi_epi64(g2,g3); \ | |||
g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ | |||
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ | |||
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ | |||
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) | |||
f = _mm256_load_si256((__m256i *)msg); | |||
FROMMSG64(0); | |||
FROMMSG64(1); | |||
FROMMSG64(2); | |||
FROMMSG64(3); | |||
} | |||
/************************************************* | |||
* Name: poly_getnoise | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
/* Packs one decision bit per coefficient into msg, 32 coefficients (4 output
 * bytes) per loop iteration.
 * a->coeffs is read with aligned 256-bit loads, so it must be 32-byte aligned.
 * NOTE(review): the threshold arithmetic presumably expects coefficients
 * already reduced to [0, KYBER_Q) - confirm against the caller. */
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { | |||
unsigned int i = 0; | |||
uint32_t small = 0; | |||
__m256i f0, f1, g0, g1; | |||
/* Thresholds broadcast to all 16 lanes: (q-1)/2 and (q-5)/4. */
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); | |||
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); | |||
/* t = (q-1)/2 - coefficient: signed offset from the midpoint. */
f0 = _mm256_sub_epi16(hqs, f0); | |||
f1 = _mm256_sub_epi16(hqs, f1); | |||
/* Arithmetic shift by 15 yields 0 for t >= 0 and all-ones for t < 0;
 * the XOR then leaves t unchanged or turns it into ~t = -t - 1,
 * i.e. a branch-free (near) absolute value. */
g0 = _mm256_srai_epi16(f0, 15); | |||
g1 = _mm256_srai_epi16(f1, 15); | |||
f0 = _mm256_xor_si256(f0, g0); | |||
f1 = _mm256_xor_si256(f1, g1); | |||
/* (q-5)/4 - u is negative exactly when the coefficient lies far
 * from the midpoint. */
f0 = _mm256_sub_epi16(hhqs, f0); | |||
f1 = _mm256_sub_epi16(hhqs, f1); | |||
/* Saturating narrow to 8 bits keeps the sign; movemask collects the
 * 32 sign bits and the inversion makes "close to the midpoint" a 1 bit. */
f0 = _mm256_packs_epi16(f0, f1); | |||
small = _mm256_movemask_epi8(f0); | |||
small = ~small; | |||
/* packs works per 128-bit lane, so the mask bytes hold coefficients
 * 0-7, 16-23, 8-15, 24-31 in that order; mask bytes 1 and 2 are
 * therefore swapped when written out. */
msg[4 * i + 0] = small; | |||
msg[4 * i + 1] = small >> 16; | |||
msg[4 * i + 2] = small >> 8; | |||
msg[4 * i + 3] = small >> 24; | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA * KYBER_N / 4]; | |||
prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r, buf); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; | |||
prf(buf.arr, sizeof(buf.arr), seed, nonce); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r, buf.arr); | |||
} | |||
// FIXME | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
poly *r1, | |||
poly *r2, | |||
poly *r3, | |||
const uint8_t *seed, | |||
const uint8_t seed[32], | |||
uint8_t nonce0, | |||
uint8_t nonce1, | |||
uint8_t nonce2, | |||
uint8_t nonce3) { | |||
uint8_t buf[4][SHAKE256_RATE]; | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_prf(buf[0], buf[1], buf[2], buf[3], SHAKE256_RATE, seed, nonce0, nonce1, nonce2, nonce3); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r0, buf[0]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r1, buf[1]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r2, buf[2]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r3, buf[3]); | |||
ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; | |||
__m256i f; | |||
keccakx4_state state; | |||
f = _mm256_load_si256((__m256i *)seed); | |||
_mm256_store_si256((__m256i *)buf.arr[0], f); | |||
_mm256_store_si256((__m256i *)buf.arr[1], f); | |||
_mm256_store_si256((__m256i *)buf.arr[2], f); | |||
_mm256_store_si256((__m256i *)buf.arr[3], f); | |||
buf.arr[0][32] = nonce0; | |||
buf.arr[1][32] = nonce1; | |||
buf.arr[2][32] = nonce2; | |||
buf.arr[3][32] = nonce3; | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r0, buf.arr[0]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r1, buf.arr[1]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r2, buf.arr[2]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r3, buf.arr[3]); | |||
} | |||
/************************************************* | |||
* Name: poly_ntt | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_ntt | |||
* | |||
* Description: Computes negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
@@ -135,73 +245,78 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
* Arguments: - poly *r: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_exp); | |||
PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER1024_AVX2_zetas_exp); | |||
PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_exp + 4); | |||
PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER1024_AVX2_zetas_exp + 200); | |||
PQCLEAN_KYBER1024_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: poly_invntt | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont | |||
* | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) | |||
* of a polynomial in place; | |||
* inputs assumed to be in bitreversed order, output in normal order | |||
* | |||
* Arguments: - poly *r: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp); | |||
PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp + 196); | |||
PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp + 392); | |||
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
// FIXME | |||
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs); | |||
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs + 128); | |||
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
//XXX Add comment | |||
void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, | |||
a->coeffs, | |||
b->coeffs, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 152); | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 64, | |||
a->coeffs + 64, | |||
b->coeffs + 64, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 184); | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 128, | |||
a->coeffs + 128, | |||
b->coeffs + 128, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 348); | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 192, | |||
a->coeffs + 192, | |||
b->coeffs + 192, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 380); | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
// FIXME | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_frommont_avx(r->coeffs); | |||
PQCLEAN_KYBER1024_AVX2_frommont_avx(r->coeffs + 128); | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_tomont | |||
* | |||
* Description: Inplace conversion of all coefficients of a polynomial | |||
* from normal domain to Montgomery domain | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_reduce | |||
* | |||
* Description: Applies Barrett reduction to all coefficients of a polynomial | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs); | |||
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs + 128); | |||
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs); | |||
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs + 128); | |||
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: poly_add | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* | |||
@@ -210,18 +325,19 @@ void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) { | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) { | |||
__m256i vec0, vec1; | |||
for (size_t i = 0; i < KYBER_N; i += 16) { | |||
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
vec0 = _mm256_add_epi16(vec0, vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0); | |||
unsigned int i = 0; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
f0 = _mm256_add_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_sub | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* | |||
@@ -230,127 +346,13 @@ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) { | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { | |||
__m256i vec0, vec1; | |||
for (size_t i = 0; i < KYBER_N; i += 16) { | |||
vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
vec0 = _mm256_sub_epi16(vec0, vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], vec0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_frommsg | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { | |||
__m128i tmp; | |||
__m256i a[4], d0, d1, d2, d3; | |||
const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); | |||
const __m256i zeros = _mm256_setzero_si256(); | |||
const __m256i ones = _mm256_set1_epi32(1); | |||
const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); | |||
tmp = _mm_loadu_si128((__m128i *)msg); | |||
for (size_t i = 0; i < 4; i++) { | |||
a[i] = _mm256_broadcastd_epi32(tmp); | |||
tmp = _mm_srli_si128(tmp, 4); | |||
} | |||
for (size_t i = 0; i < 4; i++) { | |||
d0 = _mm256_srlv_epi32(a[i], shift); | |||
d1 = _mm256_srli_epi32(d0, 8); | |||
d2 = _mm256_srli_epi32(d0, 16); | |||
d3 = _mm256_srli_epi32(d0, 24); | |||
d0 = _mm256_and_si256(d0, ones); | |||
d1 = _mm256_and_si256(d1, ones); | |||
d2 = _mm256_and_si256(d2, ones); | |||
d3 = _mm256_and_si256(d3, ones); | |||
d0 = _mm256_sub_epi32(zeros, d0); | |||
d1 = _mm256_sub_epi32(zeros, d1); | |||
d2 = _mm256_sub_epi32(zeros, d2); | |||
d3 = _mm256_sub_epi32(zeros, d3); | |||
d0 = _mm256_and_si256(hqs, d0); | |||
d1 = _mm256_and_si256(hqs, d1); | |||
d2 = _mm256_and_si256(hqs, d2); | |||
d3 = _mm256_and_si256(hqs, d3); | |||
d0 = _mm256_packus_epi32(d0, d1); | |||
d2 = _mm256_packus_epi32(d2, d3); | |||
d0 = _mm256_permute4x64_epi64(d0, 0xD8); | |||
d2 = _mm256_permute4x64_epi64(d2, 0xD8); | |||
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); | |||
} | |||
tmp = _mm_loadu_si128((__m128i *)&msg[16]); | |||
for (size_t i = 0; i < 4; i++) { | |||
a[i] = _mm256_broadcastd_epi32(tmp); | |||
tmp = _mm_srli_si128(tmp, 4); | |||
} | |||
for (size_t i = 0; i < 4; i++) { | |||
d0 = _mm256_srlv_epi32(a[i], shift); | |||
d1 = _mm256_srli_epi32(d0, 8); | |||
d2 = _mm256_srli_epi32(d0, 16); | |||
d3 = _mm256_srli_epi32(d0, 24); | |||
d0 = _mm256_and_si256(d0, ones); | |||
d1 = _mm256_and_si256(d1, ones); | |||
d2 = _mm256_and_si256(d2, ones); | |||
d3 = _mm256_and_si256(d3, ones); | |||
d0 = _mm256_sub_epi32(zeros, d0); | |||
d1 = _mm256_sub_epi32(zeros, d1); | |||
d2 = _mm256_sub_epi32(zeros, d2); | |||
d3 = _mm256_sub_epi32(zeros, d3); | |||
d0 = _mm256_and_si256(hqs, d0); | |||
d1 = _mm256_and_si256(hqs, d1); | |||
d2 = _mm256_and_si256(hqs, d2); | |||
d3 = _mm256_and_si256(hqs, d3); | |||
d0 = _mm256_packus_epi32(d0, d1); | |||
d2 = _mm256_packus_epi32(d2, d3); | |||
d0 = _mm256_permute4x64_epi64(d0, 0xD8); | |||
d2 = _mm256_permute4x64_epi64(d2, 0xD8); | |||
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); | |||
} | |||
} | |||
/************************************************* | |||
* Name: poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { | |||
uint32_t small; | |||
__m256i vec, tmp; | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); | |||
for (size_t i = 0; i < KYBER_N / 16; i++) { | |||
vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); | |||
vec = _mm256_sub_epi16(hqs, vec); | |||
tmp = _mm256_srai_epi16(vec, 15); | |||
vec = _mm256_xor_si256(vec, tmp); | |||
vec = _mm256_sub_epi16(hhqs, vec); | |||
small = (uint32_t)_mm256_movemask_epi8(vec); | |||
small = _pext_u32(small, 0xAAAAAAAA); | |||
small = ~small; | |||
msg[2 * i + 0] = (uint8_t)small; | |||
msg[2 * i + 1] = (uint8_t)(small >> 8); | |||
unsigned int i = 0; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
f0 = _mm256_sub_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
} | |||
} |
@@ -1,8 +1,7 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
#ifndef PQCLEAN_KYBER1024_AVX2_POLY_H | |||
#define PQCLEAN_KYBER1024_AVX2_POLY_H | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
@@ -11,20 +10,28 @@ | |||
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1] | |||
*/ | |||
typedef union { | |||
__m256i dummy; | |||
int16_t coeffs[KYBER_N]; | |||
__m256i _dummy; | |||
} poly; | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
poly *r1, | |||
poly *r2, | |||
@@ -37,15 +44,23 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b); | |||
#endif |
@@ -1,167 +1,198 @@ | |||
#include "params.h" | |||
#include "consts.h" | |||
#include "ntt.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: polyvec_compress | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_compress | |||
* | |||
* Description: Compress and serialize vector of polynomials | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], | |||
polyvec *restrict a) { | |||
unsigned int i = 0, j = 0, k = 0; | |||
PQCLEAN_KYBER1024_AVX2_polyvec_csubq(a); | |||
uint16_t t[8]; | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_N / 8; j++) { | |||
for (size_t k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
{ | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
} | |||
} | |||
r[11 * j + 0] = (uint8_t)t[0]; | |||
r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); | |||
r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); | |||
r[11 * j + 3] = (uint8_t)((t[2] >> 2)); | |||
r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); | |||
r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); | |||
r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7)); | |||
r[11 * j + 7] = (uint8_t)((t[5] >> 1)); | |||
r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); | |||
r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); | |||
r[11 * j + 10] = (uint8_t)((t[7] >> 3)); | |||
r[ 0] = (t[0] >> 0); | |||
r[ 1] = (t[0] >> 8) | (t[1] << 3); | |||
r[ 2] = (t[1] >> 5) | (t[2] << 6); | |||
r[ 3] = (t[2] >> 2); | |||
r[ 4] = (t[2] >> 10) | (t[3] << 1); | |||
r[ 5] = (t[3] >> 7) | (t[4] << 4); | |||
r[ 6] = (t[4] >> 4) | (t[5] << 7); | |||
r[ 7] = (t[5] >> 1); | |||
r[ 8] = (t[5] >> 9) | (t[6] << 2); | |||
r[ 9] = (t[6] >> 6) | (t[7] << 5); | |||
r[10] = (t[7] >> 3); | |||
r += 11; | |||
} | |||
r += 352; | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_decompress | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_decompress | |||
* | |||
* Description: De-serialize and decompress vector of polynomials; | |||
* approximate inverse of polyvec_compress | |||
* approximate inverse of PQCLEAN_KYBER1024_AVX2_polyvec_compress | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - uint8_t *a: pointer to input byte array | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (size_t j = 0; j < KYBER_N / 8; j++) { | |||
r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); | |||
r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *restrict r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
unsigned int i = 0, j = 0, k = 0; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); | |||
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); | |||
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); | |||
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); | |||
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); | |||
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); | |||
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); | |||
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); | |||
a += 11; | |||
for (k = 0; k < 8; k++) { | |||
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; | |||
} | |||
} | |||
a += 352; | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_tobytes | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_tobytes | |||
* | |||
* Description: Serialize vector of polynomials | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* (needs space for KYBER_POLYVECBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_frombytes | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_frombytes | |||
* | |||
* Description: De-serialize vector of polynomials; | |||
* inverse of polyvec_tobytes | |||
* inverse of PQCLEAN_KYBER1024_AVX2_polyvec_tobytes | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_ntt | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_ntt | |||
* | |||
* Description: Apply forward NTT to all elements of a vector of polynomials | |||
* | |||
* Arguments: - polyvec *r: pointer to in/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_ntt(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_invntt | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont | |||
* | |||
* Description: Apply inverse NTT to all elements of a vector of polynomials | |||
* and multiply by Montgomery factor 2^16 | |||
* | |||
* Arguments: - polyvec *r: pointer to in/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt(&r->vec[i]); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_pointwise_acc | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b and accumulate into r | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, | |||
a->vec->coeffs, | |||
b->vec->coeffs, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 152); | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 64, | |||
a->vec->coeffs + 64, | |||
b->vec->coeffs + 64, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 184); | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 128, | |||
a->vec->coeffs + 128, | |||
b->vec->coeffs + 128, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 348); | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 192, | |||
a->vec->coeffs + 192, | |||
b->vec->coeffs + 192, | |||
PQCLEAN_KYBER1024_AVX2_zetas_exp + 380); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_reduce(&r->vec[i]); | |||
} | |||
} | |||
// FIXME | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: polyvec_add | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_add | |||
* | |||
* Description: Add vectors of polynomials | |||
* | |||
@@ -170,7 +201,8 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) { | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
unsigned int i = 0; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); | |||
} | |||
} |
@@ -1,29 +1,41 @@ | |||
#ifndef POLYVEC_H | |||
#define POLYVEC_H | |||
#ifndef PQCLEAN_KYBER1024_AVX2_POLYVEC_H | |||
#define PQCLEAN_KYBER1024_AVX2_POLYVEC_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
/* A vector of KYBER_K polynomials, held in the array member `vec`. */
typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t *r, polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
#endif |
@@ -3,8 +3,14 @@ | |||
#include <stdint.h> | |||
int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r); | |||
int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r); | |||
int16_t PQCLEAN_KYBER1024_AVX2_frommont_avx(int16_t *r); | |||
#include "consts.h" | |||
#include "params.h" | |||
int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER1024_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
#endif |
@@ -1,386 +1,360 @@ | |||
#include "align.h" | |||
#include "consts.h" | |||
#include "params.h" | |||
#include "rejsample.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
static const uint8_t idx[256][8] = { | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 0, 0, 0, 0, 0, 0}, | |||
{ 4, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 0, 0, 0, 0, 0}, | |||
{ 6, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 0, 0, 0, 0, 0}, | |||
{ 4, 6, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 0, 0, 0, 0}, | |||
{ 8, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 0, 0, 0, 0, 0}, | |||
{ 4, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 0, 0, 0, 0}, | |||
{ 6, 8, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 0, 0, 0, 0}, | |||
{ 4, 6, 8, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 0, 0, 0}, | |||
{10, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 0, 0, 0, 0, 0}, | |||
{ 4, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 0, 0, 0, 0}, | |||
{ 6, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 0, 0, 0, 0}, | |||
{ 4, 6, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 10, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 0, 0, 0}, | |||
{ 8, 10, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 0, 0, 0, 0}, | |||
{ 4, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 10, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 0, 0, 0}, | |||
{ 6, 8, 10, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 10, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 0, 0, 0}, | |||
{ 4, 6, 8, 10, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 10, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 0, 0}, | |||
{12, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 12, 0, 0, 0, 0, 0}, | |||
{ 4, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 12, 0, 0, 0, 0}, | |||
{ 6, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 12, 0, 0, 0, 0}, | |||
{ 4, 6, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 12, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 12, 0, 0, 0}, | |||
{ 8, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 12, 0, 0, 0, 0}, | |||
{ 4, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 12, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 12, 0, 0, 0}, | |||
{ 6, 8, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 12, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 12, 0, 0, 0}, | |||
{ 4, 6, 8, 12, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 12, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 12, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 12, 0, 0}, | |||
{10, 12, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 2, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 12, 0, 0, 0, 0}, | |||
{ 4, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 12, 0, 0, 0, 0}, | |||
{ 2, 4, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 12, 0, 0, 0}, | |||
{ 6, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 12, 0, 0, 0, 0}, | |||
{ 2, 6, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 12, 0, 0, 0}, | |||
{ 4, 6, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 12, 0, 0, 0}, | |||
{ 2, 4, 6, 10, 12, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 12, 0, 0}, | |||
{ 8, 10, 12, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 2, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 12, 0, 0, 0}, | |||
{ 4, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 12, 0, 0, 0}, | |||
{ 2, 4, 8, 10, 12, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 12, 0, 0}, | |||
{ 6, 8, 10, 12, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 12, 0, 0, 0}, | |||
{ 2, 6, 8, 10, 12, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 12, 0, 0}, | |||
{ 4, 6, 8, 10, 12, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 12, 0, 0}, | |||
{ 2, 4, 6, 8, 10, 12, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 0}, | |||
{14, 0, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 2, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 14, 0, 0, 0, 0, 0}, | |||
{ 4, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 4, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 14, 0, 0, 0, 0}, | |||
{ 6, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 6, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 14, 0, 0, 0, 0}, | |||
{ 4, 6, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 6, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 14, 0, 0, 0}, | |||
{ 8, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 14, 0, 0, 0, 0}, | |||
{ 4, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 8, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 14, 0, 0, 0}, | |||
{ 6, 8, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 14, 0, 0, 0, 0}, | |||
{ 2, 6, 8, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 14, 0, 0, 0}, | |||
{ 4, 6, 8, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 14, 0, 0, 0}, | |||
{ 2, 4, 6, 8, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 14, 0, 0}, | |||
{10, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 14, 0, 0, 0, 0}, | |||
{ 4, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 14, 0, 0, 0}, | |||
{ 6, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 14, 0, 0, 0, 0}, | |||
{ 2, 6, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 14, 0, 0, 0}, | |||
{ 4, 6, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 14, 0, 0, 0}, | |||
{ 2, 4, 6, 10, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 14, 0, 0}, | |||
{ 8, 10, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 2, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 14, 0, 0, 0}, | |||
{ 4, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 14, 0, 0, 0}, | |||
{ 2, 4, 8, 10, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 14, 0, 0}, | |||
{ 6, 8, 10, 14, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 14, 0, 0, 0}, | |||
{ 2, 6, 8, 10, 14, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 14, 0, 0}, | |||
{ 4, 6, 8, 10, 14, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 14, 0, 0}, | |||
{ 2, 4, 6, 8, 10, 14, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 14, 0}, | |||
{12, 14, 0, 0, 0, 0, 0, 0}, | |||
{ 0, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 2, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 2, 12, 14, 0, 0, 0, 0}, | |||
{ 4, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 4, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 4, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 4, 12, 14, 0, 0, 0}, | |||
{ 6, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 6, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 6, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 6, 12, 14, 0, 0, 0}, | |||
{ 4, 6, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 6, 12, 14, 0, 0, 0}, | |||
{ 2, 4, 6, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 6, 12, 14, 0, 0}, | |||
{ 8, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 8, 12, 14, 0, 0, 0}, | |||
{ 4, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 8, 12, 14, 0, 0, 0}, | |||
{ 2, 4, 8, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 8, 12, 14, 0, 0}, | |||
{ 6, 8, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 6, 8, 12, 14, 0, 0, 0}, | |||
{ 2, 6, 8, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 6, 8, 12, 14, 0, 0}, | |||
{ 4, 6, 8, 12, 14, 0, 0, 0}, | |||
{ 0, 4, 6, 8, 12, 14, 0, 0}, | |||
{ 2, 4, 6, 8, 12, 14, 0, 0}, | |||
{ 0, 2, 4, 6, 8, 12, 14, 0}, | |||
{10, 12, 14, 0, 0, 0, 0, 0}, | |||
{ 0, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 2, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 2, 10, 12, 14, 0, 0, 0}, | |||
{ 4, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 4, 10, 12, 14, 0, 0, 0}, | |||
{ 2, 4, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 4, 10, 12, 14, 0, 0}, | |||
{ 6, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 6, 10, 12, 14, 0, 0, 0}, | |||
{ 2, 6, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 6, 10, 12, 14, 0, 0}, | |||
{ 4, 6, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 4, 6, 10, 12, 14, 0, 0}, | |||
{ 2, 4, 6, 10, 12, 14, 0, 0}, | |||
{ 0, 2, 4, 6, 10, 12, 14, 0}, | |||
{ 8, 10, 12, 14, 0, 0, 0, 0}, | |||
{ 0, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 2, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 2, 8, 10, 12, 14, 0, 0}, | |||
{ 4, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 4, 8, 10, 12, 14, 0, 0}, | |||
{ 2, 4, 8, 10, 12, 14, 0, 0}, | |||
{ 0, 2, 4, 8, 10, 12, 14, 0}, | |||
{ 6, 8, 10, 12, 14, 0, 0, 0}, | |||
{ 0, 6, 8, 10, 12, 14, 0, 0}, | |||
{ 2, 6, 8, 10, 12, 14, 0, 0}, | |||
{ 0, 2, 6, 8, 10, 12, 14, 0}, | |||
{ 4, 6, 8, 10, 12, 14, 0, 0}, | |||
{ 0, 4, 6, 8, 10, 12, 14, 0}, | |||
{ 2, 4, 6, 8, 10, 12, 14, 0}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 14} | |||
static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { | |||
{-1, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 2, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, -1, -1, -1, -1, -1, -1}, | |||
{ 4, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, -1, -1, -1, -1, -1}, | |||
{ 6, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, -1, -1, -1, -1, -1}, | |||
{ 4, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, -1, -1, -1, -1}, | |||
{ 8, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, -1, -1, -1, -1, -1}, | |||
{ 4, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, -1, -1, -1, -1}, | |||
{ 6, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, -1, -1, -1, -1}, | |||
{ 4, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, -1, -1, -1}, | |||
{10, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, -1, -1, -1, -1, -1}, | |||
{ 4, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, -1, -1, -1, -1}, | |||
{ 6, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, -1, -1, -1, -1}, | |||
{ 4, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, -1, -1, -1}, | |||
{ 8, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, -1, -1, -1, -1}, | |||
{ 4, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, -1, -1, -1}, | |||
{ 6, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, -1, -1, -1}, | |||
{ 4, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, -1, -1}, | |||
{12, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, -1, -1, -1, -1, -1}, | |||
{ 4, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, -1, -1, -1, -1}, | |||
{ 6, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, -1, -1, -1, -1}, | |||
{ 4, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, -1, -1, -1}, | |||
{ 8, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, -1, -1, -1, -1}, | |||
{ 4, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, -1, -1, -1}, | |||
{ 6, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, -1, -1, -1}, | |||
{ 4, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, -1, -1}, | |||
{10, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, -1, -1, -1, -1}, | |||
{ 4, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, -1, -1, -1}, | |||
{ 6, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, -1, -1, -1}, | |||
{ 4, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, -1, -1}, | |||
{ 8, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, -1, -1, -1}, | |||
{ 4, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, -1, -1}, | |||
{ 6, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, -1, -1}, | |||
{ 4, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, -1}, | |||
{14, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 14, -1, -1, -1, -1, -1}, | |||
{ 4, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 14, -1, -1, -1, -1}, | |||
{ 6, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 14, -1, -1, -1, -1}, | |||
{ 4, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 14, -1, -1, -1}, | |||
{ 8, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 14, -1, -1, -1, -1}, | |||
{ 4, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 14, -1, -1, -1}, | |||
{ 6, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 14, -1, -1, -1}, | |||
{ 4, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 14, -1, -1}, | |||
{10, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 14, -1, -1, -1, -1}, | |||
{ 4, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 14, -1, -1, -1}, | |||
{ 6, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 14, -1, -1, -1}, | |||
{ 4, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 14, -1, -1}, | |||
{ 8, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 14, -1, -1, -1}, | |||
{ 4, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 14, -1, -1}, | |||
{ 6, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 14, -1, -1}, | |||
{ 4, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 14, -1}, | |||
{12, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, 14, -1, -1, -1, -1}, | |||
{ 4, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, 14, -1, -1, -1}, | |||
{ 6, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, 14, -1, -1, -1}, | |||
{ 4, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, 14, -1, -1}, | |||
{ 8, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, 14, -1, -1, -1}, | |||
{ 4, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, 14, -1, -1}, | |||
{ 6, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, 14, -1, -1}, | |||
{ 4, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, 14, -1}, | |||
{10, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, 14, -1, -1, -1}, | |||
{ 4, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, 14, -1, -1}, | |||
{ 6, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, 14, -1, -1}, | |||
{ 4, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, 14, -1}, | |||
{ 8, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, 14, -1, -1}, | |||
{ 4, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, 14, -1}, | |||
{ 6, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, 14, -1}, | |||
{ 4, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 2, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 14} | |||
} | |||
}; | |||
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) | |||
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) | |||
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) | |||
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) | |||
size_t PQCLEAN_KYBER1024_AVX2_rej_uniform(int16_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen) { | |||
size_t ctr, pos; | |||
uint16_t val; | |||
uint32_t good0, good1, good2; | |||
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison | |||
#define REJ_UNIFORM_BUFLEN 672 | |||
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, | |||
const uint8_t *restrict buf) { | |||
unsigned int ctr = 0, pos = 0; | |||
uint16_t val = 0; | |||
uint32_t good = 0; | |||
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); | |||
const __m256i ones = _mm256_set1_epi8(1); | |||
const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_16xq.as_vec); | |||
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_16xv.as_vec); | |||
__m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; | |||
__m128i d, tmp, pilo, pihi; | |||
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XQ]); | |||
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XV]); | |||
__m256i f0, f1, g0, g1, g2, g3; | |||
__m128i f, t, pilo, pihi; | |||
ctr = pos = 0; | |||
while (ctr + 48 <= len && pos + 96 <= buflen) { | |||
d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); | |||
d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); | |||
d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); | |||
ctr = 0; | |||
for (pos = 0; pos < 2 * KYBER_N; pos += 64) { | |||
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); | |||
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); | |||
tmp0 = _mm256_cmpge_epu16(bound, d0); | |||
tmp1 = _mm256_cmpge_epu16(bound, d1); | |||
tmp2 = _mm256_cmpge_epu16(bound, d2); | |||
good0 = (uint32_t)_mm256_movemask_epi8(tmp0); | |||
good1 = (uint32_t)_mm256_movemask_epi8(tmp1); | |||
good2 = (uint32_t)_mm256_movemask_epi8(tmp2); | |||
good0 = _pext_u32(good0, 0x55555555); | |||
good1 = _pext_u32(good1, 0x55555555); | |||
good2 = _pext_u32(good2, 0x55555555); | |||
g0 = _mm256_cmpge_epu16(bound, f0); | |||
g1 = _mm256_cmpge_epu16(bound, f1); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); | |||
pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); | |||
pi0 = _mm256_castsi128_si256(pilo); | |||
pi0 = _mm256_inserti128_si256(pi0, pihi, 1); | |||
g0 = _mm256_packs_epi16(g0, g1); | |||
good = _mm256_movemask_epi8(g0); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); | |||
pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); | |||
pi1 = _mm256_castsi128_si256(pilo); | |||
pi1 = _mm256_inserti128_si256(pi1, pihi, 1); | |||
g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); | |||
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); | |||
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); | |||
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); | |||
pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); | |||
pi2 = _mm256_castsi128_si256(pilo); | |||
pi2 = _mm256_inserti128_si256(pi2, pihi, 1); | |||
//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); | |||
//g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); | |||
tmp0 = _mm256_add_epi8(pi0, ones); | |||
tmp1 = _mm256_add_epi8(pi1, ones); | |||
tmp2 = _mm256_add_epi8(pi2, ones); | |||
pi0 = _mm256_unpacklo_epi8(pi0, tmp0); | |||
pi1 = _mm256_unpacklo_epi8(pi1, tmp1); | |||
pi2 = _mm256_unpacklo_epi8(pi2, tmp2); | |||
/* Barrett reduction of (still unsigned) values */ | |||
g2 = _mm256_mulhi_epu16(f0, v); | |||
g3 = _mm256_mulhi_epu16(f1, v); | |||
g2 = _mm256_srli_epi16(g2, 10); | |||
g3 = _mm256_srli_epi16(g3, 10); | |||
g2 = _mm256_mullo_epi16(g2, kyberq); | |||
g3 = _mm256_mullo_epi16(g3, kyberq); | |||
f0 = _mm256_sub_epi16(f0, g2); | |||
f1 = _mm256_sub_epi16(f1, g3); | |||
d0 = _mm256_shuffle_epi8(d0, pi0); | |||
d1 = _mm256_shuffle_epi8(d1, pi1); | |||
d2 = _mm256_shuffle_epi8(d2, pi2); | |||
g2 = _mm256_add_epi8(g0, ones); | |||
g3 = _mm256_add_epi8(g1, ones); | |||
g0 = _mm256_unpacklo_epi8(g0, g2); | |||
g1 = _mm256_unpacklo_epi8(g1, g3); | |||
/* Barrett reduction of (still unsigned) d values */ | |||
tmp0 = _mm256_mulhi_epu16(d0, v); | |||
tmp1 = _mm256_mulhi_epu16(d1, v); | |||
tmp2 = _mm256_mulhi_epu16(d2, v); | |||
tmp0 = _mm256_srli_epi16(tmp0, 10); | |||
tmp1 = _mm256_srli_epi16(tmp1, 10); | |||
tmp2 = _mm256_srli_epi16(tmp2, 10); | |||
tmp0 = _mm256_mullo_epi16(tmp0, kyberq); | |||
tmp1 = _mm256_mullo_epi16(tmp1, kyberq); | |||
tmp2 = _mm256_mullo_epi16(tmp2, kyberq); | |||
d0 = _mm256_sub_epi16(d0, tmp0); | |||
d1 = _mm256_sub_epi16(d1, tmp1); | |||
d2 = _mm256_sub_epi16(d2, tmp2); | |||
f0 = _mm256_shuffle_epi8(f0, g0); | |||
f1 = _mm256_shuffle_epi8(f1, g1); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); | |||
ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); | |||
ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); | |||
ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); | |||
ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); | |||
ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); | |||
ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); | |||
pos += 96; | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); | |||
ctr += _mm_popcnt_u32((good >> 0) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); | |||
ctr += _mm_popcnt_u32((good >> 16) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); | |||
ctr += _mm_popcnt_u32((good >> 8) & 0xFF); | |||
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); | |||
ctr += _mm_popcnt_u32((good >> 24) & 0xFF); | |||
} | |||
while (ctr + 8 <= len && pos + 16 <= buflen) { | |||
d = _mm_loadu_si128((__m128i *)&buf[pos]); | |||
tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); | |||
good0 = (uint32_t)_mm_movemask_epi8(tmp); | |||
good0 = _pext_u32(good0, 0x55555555); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); | |||
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { | |||
f = _mm_load_si128((__m128i *)&buf[pos]); | |||
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); | |||
good = _mm_movemask_epi8(t); | |||
good = _pext_u32(good, 0x5555); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); | |||
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); | |||
pilo = _mm_unpacklo_epi8(pilo, pihi); | |||
d = _mm_shuffle_epi8(d, pilo); | |||
/* Barrett reduction */ | |||
tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); | |||
tmp = _mm_srli_epi16(tmp, 10); | |||
tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); | |||
d = _mm_sub_epi16(d, tmp); | |||
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); | |||
t = _mm_srli_epi16(t, 10); | |||
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); | |||
f = _mm_sub_epi16(f, t); | |||
_mm_storeu_si128((__m128i *)&r[ctr], d); | |||
ctr += (unsigned int)_mm_popcnt_u32(good0); | |||
f = _mm_shuffle_epi8(f, pilo); | |||
_mm_storeu_si128((__m128i *)&r[ctr], f); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 16; | |||
} | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); | |||
while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
if (val < 19 * KYBER_Q) { | |||
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; | |||
r[ctr++] = (int16_t)val; | |||
r[ctr++] = val; | |||
} | |||
} | |||
@@ -1,12 +1,11 @@ | |||
#ifndef REJSAMPLE_H | |||
#define REJSAMPLE_H | |||
#include <stddef.h> | |||
#include "params.h" | |||
#include <stdint.h> | |||
size_t PQCLEAN_KYBER1024_AVX2_rej_uniform(int16_t *r, | |||
size_t len, | |||
const uint8_t *buf, | |||
size_t buflen); | |||
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, | |||
const unsigned char *buf); | |||
#endif |
@@ -1,12 +1,9 @@ | |||
#include "cdecl.inc" | |||
.include "fq.inc" | |||
.include "shuffle.inc" | |||
.global PQCLEAN_KYBER51290S_AVX2_nttunpack_avx | |||
PQCLEAN_KYBER51290S_AVX2_nttunpack_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 | |||
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xv(%rip),%ymm1 | |||
/* | |||
nttpack_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
@@ -17,18 +14,51 @@ vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
/* | |||
#reduce | |||
red16 4 12 | |||
red16 5 13 | |||
red16 6 14 | |||
red16 7 15 | |||
red16 8 12 | |||
red16 9 13 | |||
red16 10 14 | |||
red16 11 15 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
shuffle1 8,9,6,9 | |||
shuffle1 10,11,8,11 | |||
shuffle2 3,4,10,4 | |||
shuffle2 6,8,3,8 | |||
shuffle2 5,7,6,7 | |||
shuffle2 9,11,5,11 | |||
shuffle4 10,3,9,3 | |||
shuffle4 6,5,10,5 | |||
shuffle4 4,8,6,8 | |||
shuffle4 7,11,4,11 | |||
shuffle8 9,10,7,10 | |||
shuffle8 6,4,9,4 | |||
shuffle8 3,5,6,5 | |||
shuffle8 8,11,3,11 | |||
#store | |||
vmovdqa %ymm7,(%rdi) | |||
vmovdqa %ymm9,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm3,96(%rdi) | |||
vmovdqa %ymm10,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm5,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
ret | |||
*/ | |||
.text | |||
nttunpack128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
@@ -61,11 +91,14 @@ vmovdqa %ymm11,224(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx | |||
PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx: | |||
#consts | |||
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 | |||
/* Exported entry point: runs nttunpack128_avx on two consecutive 256-byte
   blocks. The first call works on the block at %rdi; %rdi is then advanced
   by 256 bytes for the second call and is left advanced on return. */
.global cdecl(PQCLEAN_KYBER1024_AVX2_nttunpack_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_nttunpack_avx): | |||
call nttunpack128_avx | |||
add $256,%rdi | |||
call nttunpack128_avx | |||
ret | |||
ntttobytes128_avx: | |||
#load | |||
vmovdqa (%rsi),%ymm5 | |||
vmovdqa 32(%rsi),%ymm6 | |||
@@ -77,14 +110,14 @@ vmovdqa 192(%rsi),%ymm11 | |||
vmovdqa 224(%rsi),%ymm12 | |||
#csubq | |||
csubq 5 13 | |||
csubq 6 14 | |||
csubq 7 15 | |||
csubq 8 1 | |||
csubq 9 13 | |||
csubq 10 14 | |||
csubq 11 15 | |||
csubq 12 1 | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,1 | |||
csubq 9,13 | |||
csubq 10,14 | |||
csubq 11,15 | |||
csubq 12,1 | |||
#bitpack | |||
vpsllw $12,%ymm6,%ymm4 | |||
@@ -135,11 +168,17 @@ vmovdqu %ymm9,160(%rdi) | |||
ret | |||
.global PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx | |||
PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx: | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_ntttobytes_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_ntttobytes_avx): | |||
#consts | |||
vmovdqa PQCLEAN_KYBER51290S_AVX2_16xmask(%rip),%ymm0 | |||
vmovdqa _16XQ*2(%rdx),%ymm0 | |||
call ntttobytes128_avx | |||
add $256,%rsi | |||
add $192,%rdi | |||
call ntttobytes128_avx | |||
ret | |||
nttfrombytes128_avx: | |||
#load | |||
vmovdqu (%rsi),%ymm4 | |||
vmovdqu 32(%rsi),%ymm5 | |||
@@ -204,3 +243,13 @@ vmovdqa %ymm15,192(%rdi) | |||
vmovdqa %ymm1,224(%rdi) | |||
ret | |||
/* Exported entry point: loads a 32-byte constant from byte offset _16XMASK*2
   of the table addressed by %rdx into %ymm0, then calls nttfrombytes128_avx
   twice. Between the calls %rdi is advanced by 256 bytes and %rsi by
   192 bytes; both registers are left advanced on return. */
.global cdecl(PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx): | |||
#consts | |||
vmovdqa _16XMASK*2(%rdx),%ymm0 | |||
call nttfrombytes128_avx | |||
add $256,%rdi | |||
add $192,%rsi | |||
call nttfrombytes128_avx | |||
ret |
@@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle2 r0,r1,r2,r3 | |||
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 | |||
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 | |||
vpsllq $32,%ymm\r1,%ymm12 | |||
vpsrlq $32,%ymm\r0,%ymm13 | |||
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
@@ -1,63 +0,0 @@ | |||
#include "fips202.h" | |||
#include "symmetric.h" | |||
#include <stdlib.h> | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb | |||
* | |||
* Description: Absorb step of the SHAKE128 specialized for the Kyber context. | |||
* Builds input || x || y in a local buffer and absorbs it. | |||
* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state | |||
* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s | |||
* - uint8_t x: additional byte of input (appended first) | |||
* - uint8_t y: additional byte of input (appended second) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { | |||
size_t i; | |||
uint8_t extseed[KYBER_SYMBYTES + 2]; | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extseed[i] = input[i]; | |||
} | |||
extseed[i++] = x; /* i == KYBER_SYMBYTES after the copy loop */ | |||
extseed[i] = y; | |||
shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks | |||
* | |||
* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. | |||
* Modifies the state. Can be called multiple times to keep squeezing, | |||
* i.e., is incremental. Forwards directly to shake128_squeezeblocks. | |||
* | |||
* Arguments: - uint8_t *output: pointer to output blocks | |||
* - size_t nblocks: number of blocks to be squeezed (written to output) | |||
* - keccak_state *s: pointer to in/output Keccak state | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { | |||
shake128_squeezeblocks(output, nblocks, s); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_shake256_prf | |||
* | |||
* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input | |||
* (key || nonce) and then generates outlen bytes of SHAKE256 output | |||
* | |||
* Arguments: - uint8_t *output: pointer to output | |||
* - size_t outlen: number of requested output bytes | |||
* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) | |||
* - uint8_t nonce: single-byte nonce (public PRF input) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { | |||
uint8_t extkey[KYBER_SYMBYTES + 1]; | |||
size_t i; | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extkey[i] = key[i]; | |||
} | |||
extkey[i] = nonce; /* i == KYBER_SYMBYTES: nonce is the last byte */ | |||
shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); | |||
} |
@@ -0,0 +1,60 @@ | |||
#include "fips202.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb | |||
* | |||
* Description: Absorb step of the SHAKE128 specialized for the Kyber context. | |||
* Copies seed into a local buffer, appends x and then y, and | |||
* passes the KYBER_SYMBYTES + 2 byte buffer to shake128_absorb. | |||
* | |||
* Arguments: - xof_state *state: pointer to (uninitialized) output | |||
* Keccak state | |||
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input | |||
* to be absorbed into state | |||
* - uint8_t x: additional byte of input (appended first) | |||
* - uint8_t y: additional byte of input (appended second) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, | |||
const uint8_t seed[KYBER_SYMBYTES], | |||
uint8_t x, | |||
uint8_t y) { | |||
unsigned int i = 0; | |||
uint8_t extseed[KYBER_SYMBYTES + 2]; | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extseed[i] = seed[i]; | |||
} | |||
extseed[i++] = x; /* i == KYBER_SYMBYTES after the copy loop */ | |||
extseed[i] = y; | |||
shake128_absorb(state, extseed, sizeof(extseed)); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf | |||
* | |||
* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input | |||
* (key || nonce, KYBER_SYMBYTES + 1 bytes in total) and then | |||
* generates outlen bytes of SHAKE256 output | |||
* | |||
* Arguments: - uint8_t *out: pointer to output | |||
* - size_t outlen: number of requested output bytes | |||
* - const uint8_t *key: pointer to the key | |||
* (of length KYBER_SYMBYTES) | |||
* - uint8_t nonce: single-byte nonce (public PRF input) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[KYBER_SYMBYTES], | |||
uint8_t nonce) { | |||
unsigned int i = 0; | |||
uint8_t extkey[KYBER_SYMBYTES + 1]; | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extkey[i] = key[i]; | |||
} | |||
extkey[i] = nonce; /* i == KYBER_SYMBYTES: nonce is the last byte */ | |||
shake256(out, outlen, extkey, sizeof(extkey)); | |||
} |
@@ -2,28 +2,36 @@ | |||
#define SYMMETRIC_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include "fips202.h" | |||
#include "fips202x4.h" | |||
typedef shake128ctx keccak_state; | |||
typedef shake128ctx xof_state; | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); | |||
void PQCLEAN_KYBER1024_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(shake128ctx *s, | |||
const uint8_t seed[KYBER_SYMBYTES], | |||
uint8_t x, | |||
uint8_t y); | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[KYBER_SYMBYTES], | |||
uint8_t nonce); | |||
#define XOF_BLOCKBYTES SHAKE128_RATE | |||
#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, IN, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) shake128_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_AVX2_shake256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) \ | |||
PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) | |||
#define XOF_BLOCKBYTES SHAKE128_RATE | |||
typedef keccak_state xof_state; | |||
#endif /* SYMMETRIC_H */ |
@@ -1,23 +1,22 @@ | |||
#include "verify.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <stdlib.h> | |||
/************************************************* | |||
* Name: verify | |||
* Name: PQCLEAN_KYBER1024_AVX2_verify | |||
* | |||
* Description: Compare two arrays for equality in constant time. | |||
* | |||
* Arguments: const uint8_t *a: pointer to first byte array | |||
* const uint8_t *b: pointer to second byte array | |||
* Arguments: const unsigned char *a: pointer to first byte array | |||
* const unsigned char *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* | |||
* Returns 0 if the byte arrays are equal, 1 otherwise | |||
**************************************************/ | |||
uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t pos; | |||
uint64_t r; | |||
int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t pos = 0; | |||
uint64_t r = 0; | |||
__m256i avec, bvec, cvec; | |||
cvec = _mm256_setzero_si256(); | |||
@@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t | |||
avec = _mm256_xor_si256(avec, bvec); | |||
cvec = _mm256_or_si256(cvec, avec); | |||
} | |||
r = !_mm256_testz_si256(cvec, cvec); | |||
cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); | |||
r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); | |||
while (pos < len) { | |||
r |= a[pos] ^ b[pos]; | |||
pos += 1; | |||
if (pos < len) { | |||
avec = _mm256_loadu_si256((__m256i *)&a[pos]); | |||
bvec = _mm256_loadu_si256((__m256i *)&b[pos]); | |||
cvec = _mm256_cmpeq_epi8(avec, bvec); | |||
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); | |||
} | |||
r = (-r) >> 63; | |||
return (uint8_t)r; | |||
return r; | |||
} | |||
/************************************************* | |||
* Name: cmov | |||
* Name: PQCLEAN_KYBER1024_AVX2_cmov | |||
* | |||
* Description: Copy len bytes from x to r if b is 1; | |||
* don't modify r if b is 0. Requires b to be in {0,1}; | |||
* assumes two's complement representation of negative integers. | |||
* Runs in constant time. | |||
* | |||
* Arguments: uint8_t *r: pointer to output byte array | |||
* const uint8_t *x: pointer to input byte array | |||
* Arguments: unsigned char *r: pointer to output byte array | |||
* const unsigned char *x: pointer to input byte array | |||
* size_t len: Amount of bytes to be copied | |||
* uint8_t b: Condition bit; has to be in {0,1} | |||
* unsigned char b: Condition bit; has to be in {0,1} | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { | |||
size_t pos; | |||
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { | |||
size_t pos = 0; | |||
__m256i xvec, rvec, bvec; | |||
b = -b; | |||
bvec = _mm256_set1_epi8((char)b); | |||
bvec = _mm256_set1_epi8(b); | |||
for (pos = 0; pos + 32 <= len; pos += 32) { | |||
rvec = _mm256_loadu_si256((__m256i *)&r[pos]); | |||
@@ -1,10 +1,13 @@ | |||
#ifndef VERIFY_H | |||
#define VERIFY_H | |||
#ifndef PQCLEAN_KYBER1024_AVX2_VERIFY_H | |||
#define PQCLEAN_KYBER1024_AVX2_VERIFY_H | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); | |||
int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); | |||
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); | |||
@@ -1,14 +1,4 @@ | |||
kyber-20170627 | |||
Public Domain | |||
Authors: Joppe Bos, | |||
Léo Ducas, | |||
Eike Kiltz , | |||
Tancrède Lepoint, | |||
Vadim Lyubashevsky, | |||
John Schanck, | |||
Peter Schwabe, | |||
Gregor Seiler, | |||
Damien Stehlé | |||
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) | |||
For Keccak and AES we are using public-domain | |||
code from sources and by authors listed in | |||
@@ -1,8 +1,8 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libkyber1024_clean.a | |||
HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h | |||
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-fips202.o | |||
HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h | |||
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-shake.o | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -2,7 +2,7 @@ | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libkyber1024_clean.lib | |||
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-fips202.obj | |||
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-shake.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
@@ -1,7 +1,5 @@ | |||
#include "cbd.h" | |||
#include "params.h" | |||
#include <stddef.h> | |||
#include "cbd.h" | |||
#include <stdint.h> | |||
/************************************************* | |||
@@ -14,8 +12,8 @@ | |||
* | |||
* Returns 32-bit unsigned integer loaded from x | |||
**************************************************/ | |||
static uint32_t load32_littleendian(const uint8_t *x) { | |||
uint32_t r; | |||
static uint32_t load32_littleendian(const uint8_t x[4]) { | |||
uint32_t r = 0; | |||
r = (uint32_t)x[0]; | |||
r |= (uint32_t)x[1] << 8; | |||
r |= (uint32_t)x[2] << 16; | |||
@@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { | |||
} | |||
/************************************************* | |||
* Name: cbd | |||
* Name: PQCLEAN_KYBER1024_CLEAN_cbd | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* specialized for KYBER_ETA=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t *buf) { | |||
uint32_t d, t; | |||
int16_t a, b; | |||
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { | |||
unsigned int i = 0, j = 0; | |||
uint32_t t = 0, d = 0; | |||
int16_t a = 0, b = 0; | |||
for (size_t i = 0; i < KYBER_N / 8; i++) { | |||
t = load32_littleendian(buf + 4 * i); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t = load32_littleendian(buf + 4 * i); | |||
d = t & 0x55555555; | |||
d += (t >> 1) & 0x55555555; | |||
for (size_t j = 0; j < 8; j++) { | |||
a = (d >> 4 * j) & 0x3; | |||
for (j = 0; j < 8; j++) { | |||
a = (d >> (4 * j + 0)) & 0x3; | |||
b = (d >> (4 * j + 2)) & 0x3; | |||
r->coeffs[8 * i + j] = a - b; | |||
} | |||
@@ -1,8 +1,11 @@ | |||
#ifndef CBD_H | |||
#define CBD_H | |||
#ifndef PQCLEAN_KYBER1024_CLEAN_CBD_H | |||
#define PQCLEAN_KYBER1024_CLEAN_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t *buf); | |||
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
#endif |
@@ -5,7 +5,7 @@ | |||
#include "polyvec.h" | |||
#include "randombytes.h" | |||
#include "symmetric.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/************************************************* | |||
@@ -16,12 +16,15 @@ | |||
* and the public seed used to generate the matrix A. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* const poly *pk: pointer to the input public-key polynomial | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
polyvec *pk, | |||
const uint8_t seed[KYBER_SYMBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(r, pk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
r[i + KYBER_POLYVECBYTES] = seed[i]; | |||
} | |||
} | |||
@@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { | |||
* Description: De-serialize public key from a byte array; | |||
* approximate inverse of pack_pk | |||
* | |||
* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* Arguments: - polyvec *pk: pointer to output public-key | |||
* polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate | |||
* matrix A | |||
* - const uint8_t *packedpk: pointer to input serialized public key | |||
**************************************************/ | |||
static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
static void unpack_pk(polyvec *pk, | |||
uint8_t seed[KYBER_SYMBYTES], | |||
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i = 0; | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(pk, packedpk); | |||
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
seed[i] = packedpk[i + KYBER_POLYVECBYTES]; | |||
} | |||
} | |||
@@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { | |||
* Description: Serialize the secret key | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - const polyvec *sk: pointer to input vector of polynomials (secret key) | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t *r, polyvec *sk) { | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(r, sk); | |||
} | |||
@@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of | |||
* polynomials (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { | |||
* compressed and serialized vector of polynomials b | |||
* and the compressed and serialized polynomial v | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* const poly *pk: pointer to the input vector of polynomials b | |||
* const uint8_t *seed: pointer to the input polynomial v | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* polyvec *b: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_compress(r, b); | |||
PQCLEAN_KYBER1024_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER1024_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
@@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - size_t len: requested number of 16-bit integers (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) | |||
* - size_t buflen: length of input buffer in bytes | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
**************************************************/ | |||
static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { | |||
size_t ctr, pos; | |||
uint16_t val; | |||
static unsigned int rej_uniform(int16_t *r, | |||
unsigned int len, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr = 0, pos = 0; | |||
uint16_t val = 0; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
if (val < 19 * KYBER_Q) { | |||
val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction | |||
val -= (val >> 12) * KYBER_Q; // Barrett reduction | |||
r[ctr++] = (int16_t)val; | |||
} | |||
} | |||
@@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf | |||
return ctr; | |||
} | |||
#define gen_a(A,B) gen_matrix(A,B,0) | |||
#define gen_at(A,B) gen_matrix(A,B,1) | |||
#define gen_a(A,B) PQCLEAN_KYBER1024_CLEAN_gen_matrix(A,B,0) | |||
#define gen_at(A,B) PQCLEAN_KYBER1024_CLEAN_gen_matrix(A,B,1) | |||
/************************************************* | |||
* Name: gen_matrix | |||
* Name: PQCLEAN_KYBER1024_CLEAN_gen_matrix | |||
* | |||
* Description: Deterministically generate matrix A (or the transpose of A) | |||
* from a seed. Entries of the matrix are polynomials that look | |||
* uniformly random. Performs rejection sampling on output of | |||
* a XOF | |||
* | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
* - int transposed: boolean deciding whether A or A^T | |||
* is generated | |||
**************************************************/ | |||
#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ | |||
static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
size_t ctr; | |||
uint8_t i, j; | |||
uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
// Not static for benchmarking | |||
void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { | |||
unsigned int ctr = 0, i = 0, j = 0; | |||
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; | |||
xof_state state; | |||
for (i = 0; i < KYBER_K; i++) { | |||
@@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
xof_absorb(&state, seed, j, i); | |||
} | |||
xof_squeezeblocks(buf, MAXNBLOCKS, &state); | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); | |||
xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); | |||
while (ctr < KYBER_N) { | |||
xof_squeezeblocks(buf, 1, &state); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, | |||
XOF_BLOCKBYTES); | |||
} | |||
xof_ctx_release(&state); | |||
} | |||
@@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_keypair | |||
* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_keypair | |||
* | |||
* Description: Generates public and private key for the CPA-secure | |||
* public-key encryption scheme underlying Kyber | |||
* | |||
* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
* Arguments: - uint8_t *pk: pointer to output public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - uint8_t *sk: pointer to output private key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
unsigned int i = 0; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
uint8_t *publicseed = buf; | |||
uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
const uint8_t *publicseed = buf; | |||
const uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
uint8_t nonce = 0; | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_g(buf, buf, KYBER_SYMBYTES); | |||
gen_a(a, publicseed); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); | |||
} | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&skpv); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&e); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_CLEAN_poly_frommont(&pkpv.vec[i]); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_CLEAN_poly_tomont(&pkpv.vec[i]); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_add(&pkpv, &pkpv, &e); | |||
@@ -217,34 +243,40 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { | |||
} | |||
/************************************************* | |||
* Name: indcpa_enc | |||
* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_enc | |||
* | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) | |||
* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) | |||
* to deterministically generate all randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c, | |||
const uint8_t *m, | |||
const uint8_t *pk, | |||
const uint8_t *coins) { | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
poly v, k, epp; | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
const uint8_t coins[KYBER_SYMBYTES]) { | |||
unsigned int i = 0; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
uint8_t nonce = 0; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed, pk); | |||
PQCLEAN_KYBER1024_CLEAN_poly_frommsg(&k, m); | |||
gen_at(at, seed); | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); | |||
} | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&epp, coins, nonce++); | |||
@@ -252,14 +284,14 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c, | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (size_t i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_invntt(&bp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_invntt(&v); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &epp); | |||
@@ -271,18 +303,21 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c, | |||
} | |||
/************************************************* | |||
* Name: indcpa_dec | |||
* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_dec | |||
* | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t *m, | |||
const uint8_t *c, | |||
const uint8_t *sk) { | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
poly v, mp; | |||
@@ -290,8 +325,8 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t *m, | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_invntt(&mp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_sub(&mp, &v, &mp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_reduce(&mp); | |||