@@ -6,7 +6,7 @@ length-public-key: 1568 | |||
length-ciphertext: 1568 | |||
length-secret-key: 3168 | |||
length-shared-secret: 32 | |||
nistkat-sha256: d3064040a33c15b65eb55dfd1bb116d092dab2cf5d693f8ab02b91ed105d66e3 | |||
nistkat-sha256: a1b564348a126a118fbc49a6aeaebcb74896753fd99f30eeb0f75f0b2d25115f | |||
principal-submitters: | |||
- Peter Schwabe | |||
auxiliary-submitters: | |||
@@ -21,9 +21,9 @@ auxiliary-submitters: | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber | |||
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber | |||
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
@@ -2,52 +2,48 @@ | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* | |||
Based heavily on public-domain code by Romain Dolbeau | |||
Different handling of nonce+counter than original version | |||
using separated 64-bit nonce and internal 64-bit counter, starting from zero | |||
Public Domain | |||
*/ | |||
/* Based heavily on public-domain code by Romain Dolbeau | |||
* Different handling of nonce+counter than original version using | |||
* separated 64-bit nonce and internal 64-bit counter, starting from zero | |||
* Public Domain */ | |||
static inline void aesni_encrypt4(uint8_t out[64], | |||
__m128i *n, | |||
const __m128i rkeys[16]) { | |||
__m128i f, f0, f1, f2, f3, t; | |||
static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { | |||
__m128i f, f0, f1, f2, f3; | |||
const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); | |||
/* Load current counter value */ | |||
f = _mm_load_si128(n); | |||
/* Increase counter in 4 consecutive blocks */ | |||
t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); | |||
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); | |||
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); | |||
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); | |||
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); | |||
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); | |||
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); | |||
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); | |||
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); | |||
/* Write counter for next iteration, increased by 4 */ | |||
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); | |||
/* Actual AES encryption, 4x interleaved */ | |||
t = _mm_load_si128(&rkeys[0]); | |||
f0 = _mm_xor_si128(f0, t); | |||
f1 = _mm_xor_si128(f1, t); | |||
f2 = _mm_xor_si128(f2, t); | |||
f3 = _mm_xor_si128(f3, t); | |||
f = _mm_load_si128(&rkeys[0]); | |||
f0 = _mm_xor_si128(f0, f); | |||
f1 = _mm_xor_si128(f1, f); | |||
f2 = _mm_xor_si128(f2, f); | |||
f3 = _mm_xor_si128(f3, f); | |||
for (int i = 1; i < 14; i++) { | |||
t = _mm_load_si128(&rkeys[i]); | |||
f0 = _mm_aesenc_si128(f0, t); | |||
f1 = _mm_aesenc_si128(f1, t); | |||
f2 = _mm_aesenc_si128(f2, t); | |||
f3 = _mm_aesenc_si128(f3, t); | |||
f = _mm_load_si128(&rkeys[i]); | |||
f0 = _mm_aesenc_si128(f0, f); | |||
f1 = _mm_aesenc_si128(f1, f); | |||
f2 = _mm_aesenc_si128(f2, f); | |||
f3 = _mm_aesenc_si128(f3, f); | |||
} | |||
t = _mm_load_si128(&rkeys[14]); | |||
f0 = _mm_aesenclast_si128(f0, t); | |||
f1 = _mm_aesenclast_si128(f1, t); | |||
f2 = _mm_aesenclast_si128(f2, t); | |||
f3 = _mm_aesenclast_si128(f3, t); | |||
f = _mm_load_si128(&rkeys[14]); | |||
f0 = _mm_aesenclast_si128(f0, f); | |||
f1 = _mm_aesenclast_si128(f1, f); | |||
f2 = _mm_aesenclast_si128(f2, f); | |||
f3 = _mm_aesenclast_si128(f3, f); | |||
/* Write results */ | |||
_mm_storeu_si128((__m128i *)(out + 0), f0); | |||
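/* For reference, a scalar sketch (illustrative only, not used by this file) of the
 * 16-byte AES input block produced above: the 64-bit nonce occupies bytes 0..7 in
 * little-endian order and the 64-bit block counter occupies bytes 8..15 in
 * big-endian order, which is exactly what the byte shuffle through idx yields. */
static inline void ctr_block_sketch(uint8_t block[16], uint64_t nonce, uint64_t counter) {
    for (int i = 0; i < 8; i++) {
        block[i] = (uint8_t)(nonce >> (8 * i));              /* nonce, little-endian */
        block[8 + i] = (uint8_t)(counter >> (8 * (7 - i)));  /* counter, big-endian */
    }
}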
@@ -134,6 +130,7 @@ void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, | |||
while (outlen >= 64) { | |||
aesni_encrypt4(out, &state.n, state.rkeys); | |||
outlen -= 64; | |||
out += 64; | |||
} | |||
if (outlen) { | |||
@@ -2,22 +2,18 @@ | |||
#define PQCLEAN_KYBER102490S_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define ALIGN16_TYPE(t) \ | |||
union { \ | |||
__m128i vec; \ | |||
t orig; \ | |||
#define ALIGNED_UINT8(N) \ | |||
union { \ | |||
uint8_t coeffs[(N)]; \ | |||
__m256i vec[((N)+31)/32]; \ | |||
} | |||
#define ALIGN32_ARRAY(t, s) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(s)]; \ | |||
#define ALIGNED_INT16(N) \ | |||
union { \ | |||
int16_t coeffs[(N)]; \ | |||
__m256i vec[((N)+15)/16]; \ | |||
} | |||
#define ALIGN32_ARRAY_2D(t, n, m) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(n)][(m)]; \ | |||
} | |||
#endif |
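/* Minimal usage sketch (illustrative, not part of this header): the unions above
 * give one buffer with both a scalar view and a vector view; the __m256i member
 * forces 32-byte alignment, so aligned vector loads of the same bytes are safe. */
static inline __m256i aligned_uint8_usage_sketch(void) {
    ALIGNED_UINT8(64) buf = {{0}};
    buf.coeffs[0] = 0x42;                  /* fill through the byte view */
    return _mm256_load_si256(&buf.vec[0]); /* aligned vector load of the same bytes */
}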
@@ -1,248 +1,107 @@ | |||
#include "cdecl.h" | |||
#include "params.h" | |||
.macro schoolbook off,sign | |||
#load | |||
vmovdqa \off+32(%rsi),%ymm7 # b | |||
vmovdqa \off+32(%rdx),%ymm8 # d | |||
vmovdqa \off(%rsi),%ymm9 # a | |||
vmovdqa \off(%rdx),%ymm10 # c | |||
#mul | |||
vpmullw %ymm7,%ymm8,%ymm11 # bd.lo | |||
vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi | |||
vpmullw %ymm7,%ymm10,%ymm13 # bc.lo | |||
vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi | |||
vpmullw %ymm9,%ymm8,%ymm14 # ad.lo | |||
vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi | |||
vpmullw %ymm9,%ymm10,%ymm15 # ac.lo | |||
vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi | |||
#reduce | |||
vpmullw %ymm1,%ymm11,%ymm11 | |||
vpmulhw %ymm0,%ymm11,%ymm11 | |||
vpsubw %ymm11,%ymm12,%ymm11 # bd | |||
#mul | |||
vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo | |||
vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi | |||
#unpack | |||
vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 | |||
vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 | |||
vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 | |||
vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 | |||
vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 | |||
vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 | |||
vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 | |||
vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 | |||
#add | |||
.ifeq \sign | |||
vpaddd %ymm14,%ymm15,%ymm14 # x0 | |||
vpaddd %ymm9,%ymm10,%ymm9 # x1 | |||
.else | |||
vpsubd %ymm15,%ymm14,%ymm14 # x0 | |||
vpsubd %ymm10,%ymm9,%ymm9 # x1 | |||
.endif | |||
vpaddd %ymm12,%ymm13,%ymm12 # y0 | |||
vpaddd %ymm7,%ymm8,%ymm7 # y1 | |||
.endm | |||
.macro red a0,a1,b0,b1,x,y,z | |||
#pack | |||
vpxor %ymm\x,%ymm\x,%ymm\x | |||
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y | |||
vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z | |||
vpsrld $16,%ymm\a0,%ymm\a0 | |||
vpsrld $16,%ymm\a1,%ymm\a1 | |||
vpackusdw %ymm\z,%ymm\y,%ymm\z | |||
vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 | |||
vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y | |||
vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x | |||
vpsrld $16,%ymm\b0,%ymm\b0 | |||
vpsrld $16,%ymm\b1,%ymm\b1 | |||
vpackusdw %ymm\x,%ymm\y,%ymm\y | |||
vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 | |||
#reduce | |||
vpmullw %ymm1,%ymm\z,%ymm\z | |||
vpmullw %ymm1,%ymm\y,%ymm\y | |||
vpmulhw %ymm0,%ymm\z,%ymm\z | |||
vpmulhw %ymm0,%ymm\y,%ymm\y | |||
vpsubw %ymm\z,%ymm\a0,%ymm\a0 | |||
vpsubw %ymm\y,%ymm\b0,%ymm\b0 | |||
.macro schoolbook off | |||
vmovdqa _16XQINV*2(%rcx),%ymm0 | |||
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 | |||
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 | |||
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 | |||
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 | |||
vpmullw %ymm0,%ymm1,%ymm9 # a0.lo | |||
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo | |||
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo | |||
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo | |||
vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 | |||
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 | |||
vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi | |||
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi | |||
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi | |||
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi | |||
vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 | |||
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 | |||
vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi | |||
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi | |||
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi | |||
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi | |||
vmovdqa %ymm13,(%rsp) | |||
vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo | |||
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo | |||
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo | |||
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo | |||
vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo | |||
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo | |||
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo | |||
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo | |||
vmovdqa _16XQ*2(%rcx),%ymm8 | |||
vpmulhw %ymm8,%ymm13,%ymm13 | |||
vpmulhw %ymm8,%ymm9,%ymm9 | |||
vpmulhw %ymm8,%ymm5,%ymm5 | |||
vpmulhw %ymm8,%ymm10,%ymm10 | |||
vpmulhw %ymm8,%ymm6,%ymm6 | |||
vpmulhw %ymm8,%ymm11,%ymm11 | |||
vpmulhw %ymm8,%ymm7,%ymm7 | |||
vpmulhw %ymm8,%ymm12,%ymm12 | |||
vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 | |||
vpsubw %ymm9,%ymm1,%ymm9 # a0d0 | |||
vpsubw %ymm5,%ymm14,%ymm5 # b0c0 | |||
vpsubw %ymm10,%ymm2,%ymm10 # b0d0 | |||
vpsubw %ymm6,%ymm15,%ymm6 # a1c1 | |||
vpsubw %ymm11,%ymm3,%ymm11 # a1d1 | |||
vpsubw %ymm7,%ymm0,%ymm7 # b1c1 | |||
vpsubw %ymm12,%ymm4,%ymm12 # b1d1 | |||
vmovdqa (%r9),%ymm0 | |||
vmovdqa 32(%r9),%ymm1 | |||
vpmullw %ymm0,%ymm10,%ymm2 | |||
vpmullw %ymm0,%ymm12,%ymm3 | |||
vpmulhw %ymm1,%ymm10,%ymm10 | |||
vpmulhw %ymm1,%ymm12,%ymm12 | |||
vpmulhw %ymm8,%ymm2,%ymm2 | |||
vpmulhw %ymm8,%ymm3,%ymm3 | |||
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 | |||
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 | |||
vpaddw %ymm5,%ymm9,%ymm9 | |||
vpaddw %ymm7,%ymm11,%ymm11 | |||
vpsubw %ymm13,%ymm10,%ymm13 | |||
vpsubw %ymm12,%ymm6,%ymm6 | |||
vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+16)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+32)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+48)*2(%rdi) | |||
.endm | |||
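# Math note for the schoolbook macro above: each invocation performs, lane by lane
# and in Montgomery form, the degree-1 basemuls
#   (a0 + b0*X)*(c0 + d0*X) mod (X^2 - zeta) = (a0*c0 + zeta*b0*d0) + (a0*d0 + b0*c0)*X
# using +zeta for the first 16 coefficient pairs and -zeta for the next 16; the
# low/high halves of the zeta vector are loaded from (%r9) and 32(%r9).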
.text | |||
basemul64_acc_avx: | |||
poly0.0: | |||
schoolbook 0,0 | |||
#mov | |||
vmovdqa %ymm14,%ymm3 | |||
vmovdqa %ymm9,%ymm4 | |||
vmovdqa %ymm12,%ymm5 | |||
vmovdqa %ymm7,%ymm6 | |||
poly1.0: | |||
schoolbook 512,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly2.0: | |||
schoolbook 1024,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly3.0: | |||
schoolbook 1536,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
poly0.1: | |||
schoolbook 64,1 | |||
#mov | |||
vmovdqa %ymm14,%ymm3 | |||
vmovdqa %ymm9,%ymm4 | |||
vmovdqa %ymm12,%ymm5 | |||
vmovdqa %ymm7,%ymm6 | |||
poly1.1: | |||
schoolbook 576,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly2.1: | |||
schoolbook 1088,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly3.1: | |||
schoolbook 1600,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,64(%rdi) | |||
vmovdqa %ymm5,96(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx) | |||
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx): | |||
_cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
ret | |||
basemul64_avx: | |||
schoolbook 0,0 | |||
#reduce | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,(%rdi) | |||
vmovdqa %ymm12,32(%rdi) | |||
schoolbook 64,1 | |||
#reduce | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,64(%rdi) | |||
vmovdqa %ymm12,96(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx) | |||
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx): | |||
_cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
mov %rsp,%r8 | |||
and $-32,%rsp | |||
sub $32,%rsp | |||
lea (_ZETAS_EXP+176)*2(%rcx),%r9 | |||
schoolbook 0 | |||
add $32*2,%r9 | |||
schoolbook 1 | |||
add $192*2,%r9 | |||
schoolbook 2 | |||
add $32*2,%r9 | |||
schoolbook 3 | |||
mov %r8,%rsp | |||
ret |
@@ -4,66 +4,64 @@ | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_cbd | |||
* Name: cbd2 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* a polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA | |||
* a centered binomial distribution with parameter eta=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const unsigned char *buf: pointer to input byte array | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const __m256i *buf: pointer to aligned input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { | |||
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { | |||
unsigned int i; | |||
__m256i vec0, vec1, vec2, vec3, tmp; | |||
__m256i f0, f1, f2, f3; | |||
const __m256i mask55 = _mm256_set1_epi32(0x55555555); | |||
const __m256i mask33 = _mm256_set1_epi32(0x33333333); | |||
const __m256i mask03 = _mm256_set1_epi32(0x03030303); | |||
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); | |||
for (i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); | |||
f0 = _mm256_load_si256(&buf[i]); | |||
vec1 = _mm256_srli_epi32(vec0, 1); | |||
vec0 = _mm256_and_si256(mask55, vec0); | |||
vec1 = _mm256_and_si256(mask55, vec1); | |||
vec0 = _mm256_add_epi32(vec0, vec1); | |||
f1 = _mm256_srli_epi16(f0, 1); | |||
f0 = _mm256_and_si256(mask55, f0); | |||
f1 = _mm256_and_si256(mask55, f1); | |||
f0 = _mm256_add_epi8(f0, f1); | |||
vec1 = _mm256_srli_epi32(vec0, 2); | |||
vec0 = _mm256_and_si256(mask33, vec0); | |||
vec1 = _mm256_and_si256(mask33, vec1); | |||
f1 = _mm256_srli_epi16(f0, 2); | |||
f0 = _mm256_and_si256(mask33, f0); | |||
f1 = _mm256_and_si256(mask33, f1); | |||
f0 = _mm256_add_epi8(f0, mask33); | |||
f0 = _mm256_sub_epi8(f0, f1); | |||
vec2 = _mm256_srli_epi32(vec0, 4); | |||
vec3 = _mm256_srli_epi32(vec1, 4); | |||
vec0 = _mm256_and_si256(mask03, vec0); | |||
vec1 = _mm256_and_si256(mask03, vec1); | |||
vec2 = _mm256_and_si256(mask03, vec2); | |||
vec3 = _mm256_and_si256(mask03, vec3); | |||
f1 = _mm256_srli_epi16(f0, 4); | |||
f0 = _mm256_and_si256(mask0F, f0); | |||
f1 = _mm256_and_si256(mask0F, f1); | |||
f0 = _mm256_sub_epi8(f0, mask03); | |||
f1 = _mm256_sub_epi8(f1, mask03); | |||
vec1 = _mm256_sub_epi8(vec0, vec1); | |||
vec3 = _mm256_sub_epi8(vec2, vec3); | |||
f2 = _mm256_unpacklo_epi8(f0, f1); | |||
f3 = _mm256_unpackhi_epi8(f0, f1); | |||
vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); | |||
vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); | |||
vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); | |||
vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); | |||
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); | |||
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); | |||
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); | |||
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); | |||
tmp = _mm256_unpacklo_epi16(vec0, vec2); | |||
vec2 = _mm256_unpackhi_epi16(vec0, vec2); | |||
vec0 = tmp; | |||
tmp = _mm256_unpacklo_epi16(vec1, vec3); | |||
vec3 = _mm256_unpackhi_epi16(vec1, vec3); | |||
vec1 = tmp; | |||
_mm256_store_si256(&r->vec[4 * i + 0], f0); | |||
_mm256_store_si256(&r->vec[4 * i + 1], f2); | |||
_mm256_store_si256(&r->vec[4 * i + 2], f1); | |||
_mm256_store_si256(&r->vec[4 * i + 3], f3); | |||
} | |||
} | |||
tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); | |||
vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); | |||
vec0 = tmp; | |||
tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); | |||
vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); | |||
vec1 = tmp; | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); | |||
} | |||
/* buf 32 bytes longer for cbd3 */ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { | |||
cbd2(r, buf); | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { | |||
cbd2(r, buf); | |||
} |
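/* A plain C sketch of the same eta=2 centered binomial sampling, included only to
 * document what cbd2 computes; cbd2_sketch is an illustrative name, not part of
 * the package API. Each coefficient is a - b with a and b sums of two uniform
 * bits, so it lies in {-2,...,2}. */
static void cbd2_sketch(int16_t r[256], const uint8_t buf[128]) {
    for (unsigned int i = 0; i < 256 / 8; i++) {
        uint32_t t = (uint32_t)buf[4 * i + 0] | ((uint32_t)buf[4 * i + 1] << 8) |
                     ((uint32_t)buf[4 * i + 2] << 16) | ((uint32_t)buf[4 * i + 3] << 24);
        uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555); /* add adjacent bits */
        for (unsigned int j = 0; j < 8; j++) {
            int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
            int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
            r[8 * i + j] = a - b;
        }
    }
}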
@@ -2,8 +2,11 @@ | |||
#define PQCLEAN_KYBER102490S_AVX2_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); | |||
#endif |
@@ -1,6 +1,8 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_CDECL_H | |||
#define PQCLEAN_KYBER102490S_AVX2_CDECL_H | |||
#define _16XQ 0 | |||
#define _16XQINV 16 | |||
#define _16XV 32 | |||
@@ -9,9 +11,10 @@ | |||
#define _16XMONTSQLO 80 | |||
#define _16XMONTSQHI 96 | |||
#define _16XMASK 112 | |||
#define _ZETAS_EXP 128 | |||
#define _ZETAS_INV_EXP 528 | |||
#define _REVIDXB 128 | |||
#define _REVIDXD 144 | |||
#define _ZETAS_EXP 160 | |||
#define _16XSHIFT 624 | |||
/* The C ABI on macOS exports all symbols with a leading
* underscore. This means that any symbols we refer to from | |||
@@ -23,4 +26,5 @@ | |||
#define _cdecl(s) _##s | |||
#define cdecl(s) s | |||
#endif |
@@ -1,155 +1,123 @@ | |||
#include "align.h" | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
#define Q KYBER_Q | |||
#define MONT ((1U << 16) % Q) | |||
#define QINV 62209 // q^-1 mod 2^16 | |||
#define V (((1U << 26) + Q/2)/Q) | |||
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) | |||
#define FLO (FHI*QINV % 65536) | |||
#define MONTSQHI (MONT*MONT % Q) | |||
#define MONTSQLO (MONTSQHI*QINV % 65536) | |||
#define MONT (-1044) // 2^16 mod q | |||
#define QINV (-3327) // q^-1 mod 2^16 | |||
#define V 20159 // floor(2^26/q + 0.5) | |||
#define FHI 1441 // mont^2/128 | |||
#define FLO (-10079) // qinv*FHI | |||
#define MONTSQHI 1353 // mont^2 | |||
#define MONTSQLO 20553 // qinv*MONTSQHI | |||
#define MASK 4095 | |||
#define SHIFT 32 | |||
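/* Sanity-check sketch for the signed constants above (illustrative, not built with
 * the library): cmod and consts_check_sketch are hypothetical helpers that recompute
 * each value as a centered representative, either mod q = 3329 or mod 2^16. */
#include <assert.h>
static int32_t cmod(int64_t a, int32_t m) {   /* centered representative of a mod m */
    int32_t r = (int32_t)(((a % m) + m) % m);
    return (r > m / 2) ? r - m : r;
}
static void consts_check_sketch(void) {
    assert(cmod(1 << 16, 3329) == -1044);                              /* MONT */
    assert((3329 * 62209) % 65536 == 1);                               /* q * QINV == 1 mod 2^16 */
    assert(cmod(62209, 1 << 16) == -3327);                             /* QINV, centered */
    assert(((1 << 26) + 3329 / 2) / 3329 == 20159);                    /* V */
    assert((1441 * 128) % 3329 == cmod((int64_t)-1044 * -1044, 3329)); /* FHI * 128 == mont^2 */
    assert(cmod((int64_t)-3327 * 1441, 1 << 16) == -10079);            /* FLO */
    assert(cmod((int64_t)-1044 * -1044, 3329) == 1353);                /* MONTSQHI */
    assert(cmod((int64_t)-3327 * 1353, 1 << 16) == 20553);             /* MONTSQLO */
}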
const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.as_arr = { | |||
#define _16XQ 0 | |||
const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.coeffs = { | |||
//#define _16XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, | |||
#define _16XQINV 16 | |||
//#define _16XQINV 16 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
#define _16XV 32 | |||
//#define _16XV 32 | |||
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, | |||
#define _16XFLO 48 | |||
//#define _16XFLO 48 | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
#define _16XFHI 64 | |||
//#define _16XFHI 64 | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
#define _16XMONTSQLO 80 | |||
//#define _16XMONTSQLO 80 | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
#define _16XMONTSQHI 96 | |||
//#define _16XMONTSQHI 96 | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
#define _16XMASK 112 | |||
//#define _16XMASK 112 | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
#define _ZETAS_EXP 128 | |||
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, | |||
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, | |||
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, | |||
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, | |||
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, | |||
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, | |||
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, | |||
3158, 3158, 3158, 3158, 622, 622, 622, 622, | |||
1577, 1577, 1577, 1577, 182, 182, 182, 182, | |||
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, | |||
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, | |||
573, 573, 2004, 2004, 264, 264, 383, 383, | |||
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, | |||
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, | |||
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, | |||
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, | |||
516, 3321, 3009, 2663, 1711, 2167, 126, 1469, | |||
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, | |||
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, | |||
2226, 555, 2078, 1550, 422, 177, 3038, 1574, | |||
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, | |||
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, | |||
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, | |||
430, 843, 871, 105, 587, 3094, 2869, 1653, | |||
778, 3182, 1483, 1119, 644, 349, 329, 3254, | |||
788, 788, 1812, 1812, 28191, 28191, 28191, 28191, | |||
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, | |||
48842, 48842, 48842, 48842, 287, 287, 287, 287, | |||
287, 287, 287, 287, 202, 202, 202, 202, | |||
202, 202, 202, 202, 10690, 10690, 10690, 10690, | |||
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, | |||
31164, 31164, 31164, 31164, 962, 962, 962, 962, | |||
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, | |||
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, | |||
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, | |||
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, | |||
732, 732, 608, 608, 1787, 1787, 411, 411, | |||
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, | |||
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, | |||
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, | |||
448, 2264, 677, 2054, 34353, 25435, 58154, 24392, | |||
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, | |||
31637, 28644, 23998, 48114, 817, 603, 1322, 1864, | |||
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, | |||
3221, 996, 958, 1522, 20297, 2146, 15356, 33152, | |||
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, | |||
41677, 45279, 7757, 23132, 1097, 610, 2044, 384, | |||
3193, 1994, 220, 1670, 1799, 794, 2475, 478, | |||
3021, 991, 1869, 1628, 0, 0, 0, 0, | |||
//#define _REVIDXB 128 | |||
3854, 3340, 2826, 2312, 1798, 1284, 770, 256, | |||
3854, 3340, 2826, 2312, 1798, 1284, 770, 256, | |||
//#define _REVIDXD 144 | |||
7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, | |||
//#define _ZETAS_EXP 160 | |||
31498, 31498, 31498, 31498, -758, -758, -758, -758, | |||
5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, | |||
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, | |||
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, | |||
-359, -359, -359, -359, -359, -359, -359, -359, | |||
-359, -359, -359, -359, -359, -359, -359, -359, | |||
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, | |||
-12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, | |||
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, | |||
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, | |||
-20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, | |||
-3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, | |||
-171, -171, -171, -171, 622, 622, 622, 622, | |||
1577, 1577, 1577, 1577, 182, 182, 182, 182, | |||
-5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, | |||
5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, | |||
573, 573, -1325, -1325, 264, 264, 383, 383, | |||
-829, -829, 1458, 1458, -1602, -1602, -130, -130, | |||
-5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, | |||
-12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, | |||
1223, 652, -552, 1015, -1293, 1491, -282, -1544, | |||
516, -8, -320, -666, -1618, -1162, 126, 1469, | |||
-335, -11477, -32227, 20494, -27738, 945, -14883, 6182, | |||
32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, | |||
-1103, 555, -1251, 1550, 422, 177, -291, 1574, | |||
-246, 1159, -777, -602, -1590, -872, 418, -156, | |||
11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, | |||
-32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, | |||
430, 843, 871, 105, 587, -235, -460, 1653, | |||
778, -147, 1483, 1119, 644, 349, 329, -75, | |||
787, 787, 787, 787, 787, 787, 787, 787, | |||
787, 787, 787, 787, 787, 787, 787, 787, | |||
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, | |||
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, | |||
28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, | |||
-16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, | |||
287, 287, 287, 287, 287, 287, 287, 287, | |||
202, 202, 202, 202, 202, 202, 202, 202, | |||
10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, | |||
-11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, | |||
962, 962, 962, 962, -1202, -1202, -1202, -1202, | |||
-1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, | |||
-28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, | |||
18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, | |||
-681, -681, 1017, 1017, 732, 732, 608, 608, | |||
-1542, -1542, 411, 411, -205, -205, -1571, -1571, | |||
19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, | |||
13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, | |||
-853, -90, -271, 830, 107, -1421, -247, -951, | |||
-398, 961, -1508, -725, 448, -1065, 677, -1275, | |||
-31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, | |||
10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, | |||
817, 603, 1322, -1465, -1215, 1218, -874, -1187, | |||
-1185, -1278, -1510, -870, -108, 996, 958, 1522, | |||
20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, | |||
-21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, | |||
1097, 610, -1285, 384, -136, -1335, 220, -1659, | |||
-1530, 794, -854, 478, -308, 991, -1460, 1628, | |||
#define _ZETAS_INV_EXP 528 | |||
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, | |||
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, | |||
1701, 1460, 2338, 308, 2851, 854, 2535, 1530, | |||
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, | |||
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, | |||
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, | |||
1807, 2371, 2333, 108, 870, 1510, 1278, 1185, | |||
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, | |||
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, | |||
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, | |||
1275, 2652, 1065, 2881, 725, 1508, 2368, 398, | |||
951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, | |||
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, | |||
1571, 1571, 205, 205, 2918, 2918, 1542, 1542, | |||
2721, 2721, 2597, 2597, 2312, 2312, 681, 681, | |||
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, | |||
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, | |||
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, | |||
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, | |||
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, | |||
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, | |||
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, | |||
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, | |||
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, | |||
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, | |||
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, | |||
2210, 1846, 147, 2551, 1676, 460, 235, 2742, | |||
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, | |||
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, | |||
45043, 32227, 11478, 335, 156, 2911, 872, 1590, | |||
602, 777, 2170, 246, 1755, 291, 3152, 2907, | |||
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, | |||
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, | |||
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, | |||
666, 320, 8, 2813, 1544, 282, 1838, 1293, | |||
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, | |||
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, | |||
48173, 48173, 5828, 5828, 130, 130, 1602, 1602, | |||
1871, 1871, 829, 829, 2946, 2946, 3065, 3065, | |||
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, | |||
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, | |||
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, | |||
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, | |||
171, 171, 171, 171, 12403, 12403, 12403, 12403, | |||
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, | |||
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, | |||
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, | |||
1836, 1836, 1836, 1836, 50791, 50791, 359, 359, | |||
60300, 60300, 1932, 1932, 0, 0, 0, 0 | |||
//#define _16XSHIFT 624 | |||
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, | |||
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT | |||
} | |||
}; |
@@ -1,19 +1,10 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_CONSTS_H | |||
#define PQCLEAN_KYBER102490S_AVX2_CONSTS_H | |||
#include "align.h" | |||
#include "cdecl.h" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define ALIGNED_UINT16_T(N) \ | |||
union { \ | |||
__m256i as_vec; \ | |||
uint16_t as_arr[(N)]; \ | |||
} | |||
typedef ALIGNED_UINT16_T(928) qdata_t; | |||
typedef ALIGNED_INT16(640) qdata_t; | |||
extern const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata; | |||
#endif |
@@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 | |||
vmovdqa 192(%rdi),%ymm8 | |||
vmovdqa 224(%rdi),%ymm9 | |||
red16 2,10 | |||
red16 3,11 | |||
red16 4,12 | |||
red16 5,13 | |||
red16 6,14 | |||
red16 7,15 | |||
red16 8,10 | |||
red16 9,11 | |||
red16 2 | |||
red16 3 | |||
red16 4 | |||
red16 5 | |||
red16 6 | |||
red16 7 | |||
red16 8 | |||
red16 9 | |||
#store | |||
vmovdqa %ymm2,(%rdi) | |||
@@ -46,49 +46,6 @@ add $256,%rdi | |||
call reduce128_avx | |||
ret | |||
csubq128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm1 | |||
vmovdqa 32(%rdi),%ymm2 | |||
vmovdqa 64(%rdi),%ymm3 | |||
vmovdqa 96(%rdi),%ymm4 | |||
vmovdqa 128(%rdi),%ymm5 | |||
vmovdqa 160(%rdi),%ymm6 | |||
vmovdqa 192(%rdi),%ymm7 | |||
vmovdqa 224(%rdi),%ymm8 | |||
csubq 1,9 | |||
csubq 2,10 | |||
csubq 3,11 | |||
csubq 4,12 | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,9 | |||
#store | |||
vmovdqa %ymm1,(%rdi) | |||
vmovdqa %ymm2,32(%rdi) | |||
vmovdqa %ymm3,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm5,128(%rdi) | |||
vmovdqa %ymm6,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm8,224(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx) | |||
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx): | |||
_cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
call csubq128_avx | |||
add $256,%rdi | |||
call csubq128_avx | |||
ret | |||
tomont128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm3 | |||
@@ -1,6 +1,10 @@ | |||
.macro red16 r,x=12 | |||
.macro red16 r,rs=0,x=12 | |||
vpmulhw %ymm1,%ymm\r,%ymm\x | |||
.if \rs | |||
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x | |||
.else | |||
vpsraw $10,%ymm\x,%ymm\x | |||
.endif | |||
vpmullw %ymm0,%ymm\x,%ymm\x | |||
vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
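# Scalar sketch of red16, assuming %ymm0 holds 16x q = 3329, %ymm1 holds
# 16x v = 20159 (~2^26/q) and, on the \rs path, \rs holds the _16XSHIFT value 32:
#   t  = (a * 20159) >> 16              (vpmulhw)
#   t  = (t + 512) >> 10   or  t >> 10  (vpmulhrsw with 32, resp. vpsraw $10)
#   a -= t * q                          (vpmullw + vpsubw)
# so a is replaced by a small representative congruent to a modulo q.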
@@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r | |||
vpsraw $15,%ymm\r,%ymm\x | |||
vpand %ymm0,%ymm\x,%ymm\x | |||
vpaddw %ymm\x,%ymm\r,%ymm\r | |||
#vpcmpgtw %ymm0,%ymm\r,%ymm\x | |||
#vpand %ymm0,%ymm\x,%ymm\x | |||
#vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
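# Scalar sketch of csubq (conditional subtraction of q): after a -= q, the sign
# mask (a >> 15) & q adds q back exactly when the subtraction went negative, so
# inputs in [0, 2q) end up in [0, q):
#   a -= q;
#   a += (a >> 15) & q;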
.macro caddq r,x=12 | |||
@@ -8,6 +8,7 @@ | |||
#include "randombytes.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
@@ -15,11 +16,14 @@ | |||
* Name: pack_pk | |||
* | |||
* Description: Serialize the public key as concatenation of the | |||
* serialized vector of polynomials pk | |||
* and the public seed used to generate the matrix A. | |||
* serialized vector of polynomials pk and the | |||
* public seed used to generate the matrix A. | |||
* The polynomial coefficients in pk are assumed to | |||
* lie in the interval [0,q], i.e. pk must be reduced
* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(). | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
@@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, | |||
/************************************************* | |||
* Name: pack_sk | |||
* | |||
* Description: Serialize the secret key | |||
* Description: Serialize the secret key. | |||
* The polynomial coefficients in sk are assumed to | |||
* lie in the interval [0,q], i.e. sk must be reduced
* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(). | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
@@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
/************************************************* | |||
* Name: unpack_sk | |||
* | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* Description: De-serialize the secret key; inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials | |||
* (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, | |||
* | |||
* Description: Serialize the ciphertext as concatenation of the | |||
* compressed and serialized vector of polynomials b | |||
* and the compressed and serialized polynomial v | |||
* and the compressed and serialized polynomial v. | |||
* The polynomial coefficients in b and v are assumed to | |||
* lie in the interval [0,q], i.e. b and v must be reduced
* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce() and PQCLEAN_KYBER102490S_AVX2_poly_reduce(), respectively. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* poly *pk: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
* poly *pk: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_compress(r, b); | |||
PQCLEAN_KYBER102490S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER102490S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
@@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* Arguments: - int16_t *r: pointer to output array | |||
* - unsigned int len: requested number of 16-bit integers (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
@@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
uint16_t val0, val1; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; | |||
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; | |||
pos += 3; | |||
if (val < 19 * KYBER_Q) { | |||
val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction | |||
r[ctr++] = (int16_t)val; | |||
if (val0 < KYBER_Q) { | |||
r[ctr++] = val0; | |||
} | |||
if (ctr < len && val1 < KYBER_Q) { | |||
r[ctr++] = val1; | |||
} | |||
} | |||
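/* Worked example for the 12-bit parsing above: the three bytes {0x12, 0x34, 0x56}
 * yield val0 = 0x412 and val1 = 0x563, i.e. every 3-byte group is split into two
 * 12-bit candidates and a candidate is accepted only if it is already < q = 3329. */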
@@ -165,12 +169,11 @@ static unsigned int rej_uniform(int16_t *r, | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
**************************************************/ | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { | |||
unsigned int ctr, i, j; | |||
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; | |||
ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; | |||
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { | |||
unsigned int ctr, i, j, k; | |||
unsigned int buflen, off; | |||
uint64_t nonce = 0; | |||
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES) buf; | |||
aes256ctr_ctx state; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, 0); | |||
@@ -178,19 +181,24 @@ void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_K; j++) { | |||
if (transposed) { | |||
nonce.orig = (j << 8) | i; | |||
nonce = (j << 8) | i; | |||
} else { | |||
nonce.orig = (i << 8) | j; | |||
nonce = (i << 8) | j; | |||
} | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); | |||
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); | |||
buflen = REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES; | |||
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.coeffs); | |||
while (ctr < KYBER_N) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, | |||
XOF_BLOCKBYTES); | |||
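/* Carry the (buflen % 3) leftover bytes to the front of the buffer before
 * squeezing the next block, so no 12-bit candidate spanning a block boundary
 * is dropped by rej_uniform. */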
off = buflen % 3; | |||
for (k = 0; k < off; k++) { | |||
buf.coeffs[k] = buf.coeffs[buflen - off + k]; | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs + off, 1, &state); | |||
buflen = off + AES256CTR_BLOCKBYTES; | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.coeffs, buflen); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(&a[i].vec[j]); | |||
@@ -212,39 +220,41 @@ void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
const uint8_t *publicseed = buf.arr; | |||
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
const uint8_t *publicseed = buf; | |||
const uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_g(buf, buf, KYBER_SYMBYTES); | |||
gen_a(a, publicseed); | |||
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; | |||
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ | |||
uint64_t nonce = 0; | |||
ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) coins; // +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 | |||
aes256ctr_ctx state; | |||
ALIGN32_ARRAY(uint8_t, 128) coins; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&skpv.vec[i], coins.arr); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
nonce += 1; | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&skpv.vec[i], coins.vec); | |||
} | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&e.vec[i], coins.arr); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
nonce += 1; | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&e.vec[i], coins.vec); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&skpv); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&skpv); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&e); | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_AVX2_poly_tomont(&pkpv.vec[i]); | |||
} | |||
@@ -261,70 +271,70 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins used as seed | |||
* (of length KYBER_SYMBYTES) to deterministically | |||
* generate all randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
const uint8_t coins[KYBER_SYMBYTES]) { | |||
unsigned int i; | |||
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
polyvec sp, pkpv, ep, at[KYBER_K], b; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed.arr, pk); | |||
unpack_pk(&pkpv, seed, pk); | |||
PQCLEAN_KYBER102490S_AVX2_poly_frommsg(&k, m); | |||
gen_at(at, seed.arr); | |||
gen_at(at, seed); | |||
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; | |||
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ | |||
#define CIPHERTEXTNOISE_NBLOCKS ((KYBER_ETA2*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ | |||
uint64_t nonce = 0; | |||
ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) buf; /* +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 */ | |||
aes256ctr_ctx state; | |||
ALIGN32_ARRAY(uint8_t, 128) buf; | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce++); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&sp.vec[i], buf.arr); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, NOISE_NBLOCKS, &state); | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
nonce += 1; | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&sp.vec[i], buf.vec); | |||
} | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&ep.vec[i], buf.arr); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
nonce += 1; | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(&ep.vec[i], buf.vec); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); | |||
state.n = _mm_loadl_epi64(&nonce.vec); | |||
nonce.orig++; | |||
PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf.arr); | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); | |||
state.n = _mm_loadl_epi64((__m128i *)&nonce); | |||
nonce += 1; | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(&epp, buf.vec); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&b); | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_add(&b, &b, &ep); | |||
PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &epp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &k); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&bp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&b); | |||
PQCLEAN_KYBER102490S_AVX2_poly_reduce(&v); | |||
pack_ciphertext(c, &bp, &v); | |||
pack_ciphertext(c, &b, &v); | |||
} | |||
/************************************************* | |||
@@ -333,24 +343,24 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
polyvec b, skpv; | |||
poly v, mp; | |||
unpack_ciphertext(&bp, &v, c); | |||
unpack_ciphertext(&b, &v, c); | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&b); | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); | |||
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER102490S_AVX2_poly_sub(&mp, &v, &mp); | |||
@@ -2,22 +2,21 @@ | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 | |||
#update & mul | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm12 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm13 | |||
vpsubw %ymm\rh2,%ymm\rl2,%ymm14 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 | |||
vpsubw %ymm\rl0,%ymm\rh0,%ymm12 | |||
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\rl1,%ymm\rh1,%ymm13 | |||
vpmullw %ymm\zl0,%ymm12,%ymm\rh0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\rl2,%ymm\rh2,%ymm14 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 | |||
vpmullw %ymm\zl0,%ymm13,%ymm\rh1 | |||
vpsubw %ymm\rh3,%ymm\rl3,%ymm15 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm\rl3,%ymm\rh3,%ymm15 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 | |||
vpmullw %ymm\zl1,%ymm14,%ymm\rh2 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 | |||
vpmullw %ymm\zl1,%ymm15,%ymm\rh3 | |||
vpmulhw %ymm\zh0,%ymm12,%ymm12 | |||
@@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 | |||
vpmulhw %ymm\zh1,%ymm14,%ymm14 | |||
vpmulhw %ymm\zh1,%ymm15,%ymm15 | |||
#reduce | |||
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 | |||
vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 | |||
vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 | |||
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 | |||
# | |||
# | |||
vpsubw %ymm\rh0,%ymm12,%ymm\rh0 | |||
vpsubw %ymm\rh1,%ymm13,%ymm\rh1 | |||
vpsubw %ymm\rh2,%ymm14,%ymm\rh2 | |||
vpsubw %ymm\rh3,%ymm15,%ymm\rh3 | |||
.endm | |||
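# Gentleman-Sande butterfly used by the inverse NTT: per coefficient pair (a, b)
#   t = b - a;  a <- a + b;  b <- montgomery_reduce(zeta * t)
# with the low/high halves of zeta passed in \zl0,\zl1 and \zh0,\zh1.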
.text | |||
invntt_levels0t5_avx: | |||
level0: | |||
#zetas | |||
vmovdqu (%rsi),%ymm15 | |||
vmovdqu 64(%rsi),%ymm3 | |||
vmovdqu 32(%rsi),%ymm1 | |||
vmovdqu 96(%rsi),%ymm2 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly 4,5,8,9,6,7,10,11,15,3,1,2 | |||
level1: | |||
#zetas | |||
vmovdqu 128(%rsi),%ymm3 | |||
vmovdqu 160(%rsi),%ymm2 | |||
butterfly 4,5,6,7,8,9,10,11,3,3,2,2 | |||
.macro intt_levels0t5 off | |||
/* level 0 */ | |||
vmovdqa _16XFLO*2(%rsi),%ymm2 | |||
vmovdqa _16XFHI*2(%rsi),%ymm3 | |||
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 | |||
fqmulprecomp 2,3,4 | |||
fqmulprecomp 2,3,6 | |||
fqmulprecomp 2,3,5 | |||
fqmulprecomp 2,3,7 | |||
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 | |||
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 | |||
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 | |||
vmovdqa (128*\off+112)*2(%rdi),%ymm11 | |||
fqmulprecomp 2,3,8 | |||
fqmulprecomp 2,3,10 | |||
fqmulprecomp 2,3,9 | |||
fqmulprecomp 2,3,11 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 | |||
vmovdqa _REVIDXB*2(%rsi),%ymm12 | |||
vpshufb %ymm12,%ymm15,%ymm15 | |||
vpshufb %ymm12,%ymm1,%ymm1 | |||
vpshufb %ymm12,%ymm2,%ymm2 | |||
vpshufb %ymm12,%ymm3,%ymm3 | |||
butterfly 4,5,8,9,6,7,10,11,15,1,2,3 | |||
/* level 1 */ | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 | |||
vmovdqa _REVIDXB*2(%rsi),%ymm1 | |||
vpshufb %ymm1,%ymm2,%ymm2 | |||
vpshufb %ymm1,%ymm3,%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11,2,2,3,3 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
shuffle1 8,9,6,9 | |||
shuffle1 10,11,8,11 | |||
level2: | |||
#zetas | |||
vmovdqu 192(%rsi),%ymm10 | |||
vmovdqu 224(%rsi),%ymm2 | |||
#consts | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
/* level 2 */ | |||
vmovdqa _REVIDXD*2(%rsi),%ymm12 | |||
vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 | |||
vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 | |||
butterfly 3,4,6,8,5,7,9,11,10,10,2,2 | |||
butterfly 3,4,6,8,5,7,9,11,2,2,10,10 | |||
vmovdqa _16XV*2(%rsi),%ymm1 | |||
red16 3 | |||
shuffle2 3,4,10,4 | |||
@@ -87,26 +110,22 @@ shuffle2 6,8,3,8 | |||
shuffle2 5,7,6,7 | |||
shuffle2 9,11,5,11 | |||
level3: | |||
#zetas | |||
vmovdqu 256(%rsi),%ymm9 | |||
vmovdqu 288(%rsi),%ymm2 | |||
/* level 3 */ | |||
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 | |||
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 | |||
butterfly 10,3,6,5,4,8,7,11,9,9,2,2 | |||
red16 10 | |||
butterfly 10,3,6,5,4,8,7,11,2,2,9,9 | |||
shuffle4 10,3,9,3 | |||
shuffle4 6,5,10,5 | |||
shuffle4 4,8,6,8 | |||
shuffle4 7,11,4,11 | |||
level4: | |||
#zetas | |||
vmovdqu 320(%rsi),%ymm7 | |||
vmovdqu 352(%rsi),%ymm2 | |||
/* level 4 */ | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 | |||
butterfly 9,10,6,4,3,5,8,11,7,7,2,2 | |||
butterfly 9,10,6,4,3,5,8,11,2,2,7,7 | |||
red16 9 | |||
@@ -115,113 +134,62 @@ shuffle8 6,4,9,4 | |||
shuffle8 3,5,6,5 | |||
shuffle8 8,11,3,11 | |||
level5: | |||
#zetas | |||
vpbroadcastd 384(%rsi),%ymm8 | |||
vpbroadcastd 388(%rsi),%ymm2 | |||
butterfly 7,9,6,3,10,4,5,11,8,8,2,2 | |||
red16 7 | |||
#store | |||
vmovdqa %ymm7,(%rdi) | |||
vmovdqa %ymm9,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm3,96(%rdi) | |||
vmovdqa %ymm10,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm5,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
/* level5 */ | |||
vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 | |||
vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 | |||
ret | |||
butterfly 7,9,6,3,10,4,5,11,2,2,8,8 | |||
invntt_level6_avx: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm1 | |||
vpbroadcastd 4(%rsi),%ymm2 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 256(%rdi),%ymm8 | |||
vmovdqa 288(%rdi),%ymm9 | |||
vmovdqa 320(%rdi),%ymm10 | |||
vmovdqa 352(%rdi),%ymm11 | |||
vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) | |||
vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) | |||
vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) | |||
vmovdqa %ymm11,(128*\off+112)*2(%rdi) | |||
.endm | |||
butterfly 4,5,6,7,8,9,10,11 | |||
.macro intt_level6 off | |||
/* level 6 */ | |||
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (64*\off+128)*2(%rdi),%ymm8 | |||
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (64*\off+144)*2(%rdi),%ymm9 | |||
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 | |||
#consts | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,256(%rdi) | |||
vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
#load | |||
vmovdqa 128(%rdi),%ymm4 | |||
vmovdqa 160(%rdi),%ymm5 | |||
vmovdqa 192(%rdi),%ymm6 | |||
vmovdqa 224(%rdi),%ymm7 | |||
vmovdqa 384(%rdi),%ymm8 | |||
vmovdqa 416(%rdi),%ymm9 | |||
vmovdqa 448(%rdi),%ymm10 | |||
vmovdqa 480(%rdi),%ymm11 | |||
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (64*\off+160)*2(%rdi),%ymm10 | |||
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 | |||
vmovdqa (64*\off+176)*2(%rdi),%ymm11 | |||
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,384(%rdi) | |||
vmovdqa %ymm9,416(%rdi) | |||
vmovdqa %ymm10,448(%rdi) | |||
vmovdqa %ymm11,480(%rdi) | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,128(%rdi) | |||
vmovdqa %ymm5,160(%rdi) | |||
vmovdqa %ymm6,192(%rdi) | |||
vmovdqa %ymm7,224(%rdi) | |||
ret | |||
.if \off == 0 | |||
red16 4 | |||
.endif | |||
vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm8,(64*\off+128)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+144)*2(%rdi) | |||
vmovdqa %ymm10,(64*\off+160)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+176)*2(%rdi) | |||
.endm | |||
.text | |||
.global cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx) | |||
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx): | |||
_cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
mov %rsi,%rdx | |||
add $_ZETAS_INV_EXP*2,%rsi | |||
call invntt_levels0t5_avx | |||
add $256,%rdi | |||
add $392,%rsi | |||
call invntt_levels0t5_avx | |||
sub $256,%rdi | |||
add $392,%rsi | |||
call invntt_level6_avx | |||
intt_levels0t5 0 | |||
intt_levels0t5 1 | |||
intt_level6 0 | |||
intt_level6 1 | |||
ret |
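The zeta multiplications inside the butterfly macro, and the fqmulprecomp multiplications with the _16XFLO/_16XFHI constants, are 16-lane signed Montgomery multiplications. For orientation, a scalar sketch of the same primitive, modelled on the portable reference implementation (QINV and the exact reduction form are taken from that reference, not from this diff):

#include <stdint.h>

#define KYBER_Q 3329
#define QINV -3327              /* q^-1 mod 2^16, as a signed 16-bit value */

/* Montgomery reduction: for |a| < q*2^15, returns a value congruent to
 * a * 2^-16 mod q with absolute value below q. */
static int16_t montgomery_reduce(int32_t a) {
    int16_t t = (int16_t)a * QINV;                      /* low half of a * q^-1 */
    t = (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);    /* high word of a - t*q */
    return t;
}

/* One lane of what the vectorized zeta multiplication computes: a*b*2^-16 mod q. */
static int16_t fqmul(int16_t a, int16_t b) {
    return montgomery_reduce((int32_t)a * b);
}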
@@ -1,4 +1,3 @@ | |||
#include "align.h" | |||
#include "indcpa.h" | |||
#include "kem.h" | |||
#include "params.h" | |||
@@ -15,13 +14,14 @@ | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], | |||
unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -40,36 +40,36 @@ int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned cha | |||
* secret for given public key | |||
* | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char pk[KYBER_PUBLICKEYBYTES]) { | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
randombytes(buf, KYBER_SYMBYTES); | |||
/* Don't release system RNG output */ | |||
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
hash_h(buf, buf, KYBER_SYMBYTES); | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
@@ -80,47 +80,47 @@ int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, | |||
* cipher text and private key | |||
* | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
const unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
int fail; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
uint8_t cmp[KYBER_CIPHERTEXTBYTES]; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; | |||
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf.arr, ct, sk); | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf, ct, sk); | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
} | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); | |||
fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); | |||
fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* Overwrite pre-k with z on re-encryption failure */ | |||
PQCLEAN_KYBER102490S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); | |||
PQCLEAN_KYBER102490S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} |
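The three entry points above keep the usual KEM flow: key generation, encapsulation, and decapsulation with re-encryption check and implicit rejection. A minimal round-trip sketch, assuming this package's kem.h and params.h are on the include path and a randombytes implementation is linked in; the main()/printf harness is illustrative only:

#include <stdio.h>
#include <string.h>
#include "kem.h"
#include "params.h"

int main(void) {
    unsigned char pk[KYBER_PUBLICKEYBYTES], sk[KYBER_SECRETKEYBYTES];
    unsigned char ct[KYBER_CIPHERTEXTBYTES];
    unsigned char ss_enc[KYBER_SSBYTES], ss_dec[KYBER_SSBYTES];

    /* Generate a keypair, encapsulate against the public key,
     * decapsulate with the secret key. */
    PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(pk, sk);
    PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(ct, ss_enc, pk);
    PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(ss_dec, ct, sk);

    /* On success both sides hold the same 32-byte shared secret. */
    printf("%s\n", memcmp(ss_enc, ss_dec, KYBER_SSBYTES) == 0 ? "ok" : "mismatch");
    return 0;
}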
@@ -1,222 +1,191 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
vpmullw %ymm\zl1,%ymm\rh2,%ymm14 | |||
vpmullw %ymm\zl1,%ymm\rh3,%ymm15 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 | |||
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 | |||
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 | |||
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 | |||
.endm | |||
#reduce | |||
.macro reduce | |||
vpmulhw %ymm0,%ymm12,%ymm12 | |||
vpmulhw %ymm0,%ymm13,%ymm13 | |||
vpmulhw %ymm0,%ymm14,%ymm14 | |||
vpmulhw %ymm0,%ymm15,%ymm15 | |||
vpsubw %ymm12,%ymm\rh0,%ymm12 | |||
vpsubw %ymm13,%ymm\rh1,%ymm13 | |||
vpsubw %ymm14,%ymm\rh2,%ymm14 | |||
vpsubw %ymm15,%ymm\rh3,%ymm15 | |||
#update | |||
vpsubw %ymm12,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm13,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm14,%ymm\rl2,%ymm\rh2 | |||
vpaddw %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rh3 | |||
vpaddw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
# We break the dependency chains with the cost of slightly more additions. | |||
# But they can be run in parallel to the multiplications on execution port 5 | |||
# (multiplications only go to ports 0 and 1) | |||
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 | |||
vpmullw %ymm\zl1,%ymm\rh2,%ymm14 | |||
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y | |||
vpmullw %ymm\zl1,%ymm\rh3,%ymm15 | |||
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 | |||
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 | |||
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 | |||
#reduce | |||
vpmulhw %ymm0,%ymm12,%ymm12 | |||
vpmulhw %ymm0,%ymm13,%ymm13 | |||
vpmulhw %ymm0,%ymm14,%ymm14 | |||
vpmulhw %ymm0,%ymm15,%ymm15 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 | |||
vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 | |||
vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 | |||
vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 | |||
vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 | |||
vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 | |||
vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 | |||
#update | |||
vpsubw %ymm12,%ymm\rln,%ymm\rln | |||
vpaddw %ymm12,%ymm\rh0,%ymm\rh0 | |||
vpsubw %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm13,%ymm\rl0,%ymm\rl0 | |||
vpaddw %ymm13,%ymm\rh1,%ymm\rh1 | |||
vpsubw %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm14,%ymm\rl1,%ymm\rl1 | |||
vpaddw %ymm14,%ymm\rh2,%ymm\rh2 | |||
vpsubw %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm15,%ymm\rl2,%ymm\rl2 | |||
vpaddw %ymm15,%ymm\rh3,%ymm\rh3 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
.text | |||
ntt_level0_avx: | |||
level0: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
vpbroadcastd 4(%rsi),%ymm1 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 256(%rdi),%ymm8 | |||
vmovdqa 288(%rdi),%ymm9 | |||
vmovdqa 320(%rdi),%ymm10 | |||
vmovdqa 352(%rdi),%ymm11 | |||
butterfly2 4,5,6,7,8,9,10,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
vmovdqa %ymm8,256(%rdi) | |||
vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
.macro level0 off | |||
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 | |||
vmovdqa (64*\off+128)*2(%rdi),%ymm8 | |||
vmovdqa (64*\off+144)*2(%rdi),%ymm9 | |||
vmovdqa (64*\off+160)*2(%rdi),%ymm10 | |||
vmovdqa (64*\off+176)*2(%rdi),%ymm11 | |||
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 | |||
mul 8,9,10,11 | |||
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 | |||
reduce | |||
update 3,4,5,6,7,8,9,10,11 | |||
vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm8,(64*\off+128)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+144)*2(%rdi) | |||
vmovdqa %ymm10,(64*\off+160)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+176)*2(%rdi) | |||
.endm | |||
ret | |||
.macro levels1t6 off | |||
/* level 1 */ | |||
vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 | |||
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 | |||
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 | |||
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 | |||
vmovdqa (128*\off+112)*2(%rdi),%ymm11 | |||
vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 | |||
ntt_levels1t6_avx: | |||
level1: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
vpbroadcastd 4(%rsi),%ymm1 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly2 4,5,6,7,8,9,10,11,3 | |||
level2: | |||
#zetas | |||
vmovdqu 8(%rsi),%ymm15 | |||
vmovdqu 40(%rsi),%ymm1 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly2 3,8,4,9,5,10,6,11,7 | |||
level3: | |||
#zetas | |||
vmovdqu 72(%rsi),%ymm15 | |||
vmovdqu 104(%rsi),%ymm1 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly2 7,5,3,10,8,6,4,11,9 | |||
level4: | |||
#zetas | |||
vmovdqu 136(%rsi),%ymm15 | |||
vmovdqu 168(%rsi),%ymm1 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
butterfly2 9,8,7,6,5,4,3,11,10 | |||
level5: | |||
#zetas | |||
vmovdqu 200(%rsi),%ymm15 | |||
vmovdqu 232(%rsi),%ymm1 | |||
shuffle1 9,5,10,5 | |||
shuffle1 8,4,9,4 | |||
shuffle1 7,3,8,3 | |||
shuffle1 6,11,7,11 | |||
butterfly2 10,5,9,4,8,3,7,11,6 | |||
level6: | |||
#zetas | |||
vmovdqu 264(%rsi),%ymm14 | |||
vmovdqu 328(%rsi),%ymm15 | |||
vmovdqu 296(%rsi),%ymm1 | |||
vmovdqu 360(%rsi),%ymm2 | |||
butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
red16 10,12 | |||
red16 5,13 | |||
red16 9,14 | |||
red16 4,15 | |||
red16 8,2 | |||
red16 3,6 | |||
red16 7,12 | |||
red16 11,13 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm9,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm8,128(%rdi) | |||
vmovdqa %ymm3,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
mul 8,9,10,11 | |||
ret | |||
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 | |||
reduce | |||
update 3,4,5,6,7,8,9,10,11 | |||
/* level 2 */ | |||
shuffle8 5,10,7,10 | |||
shuffle8 6,11,5,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 | |||
mul 7,10,5,11 | |||
shuffle8 3,8,6,8 | |||
shuffle8 4,9,3,9 | |||
reduce | |||
update 4,6,8,3,9,7,10,5,11 | |||
/* level 3 */ | |||
shuffle4 8,5,9,5 | |||
shuffle4 3,11,8,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 | |||
mul 9,5,8,11 | |||
shuffle4 4,7,3,7 | |||
shuffle4 6,10,4,10 | |||
reduce | |||
update 6,3,7,4,10,9,5,8,11 | |||
/* level 4 */ | |||
shuffle2 7,8,10,8 | |||
shuffle2 4,11,7,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 | |||
mul 10,8,7,11 | |||
shuffle2 6,9,4,9 | |||
shuffle2 3,5,6,5 | |||
reduce | |||
update 3,4,9,6,5,10,8,7,11 | |||
/* level 5 */ | |||
shuffle1 9,7,5,7 | |||
shuffle1 6,11,9,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 | |||
mul 5,7,9,11 | |||
shuffle1 3,10,6,10 | |||
shuffle1 4,8,3,8 | |||
reduce | |||
update 4,6,10,3,8,5,7,9,11 | |||
/* level 6 */ | |||
vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 | |||
vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 | |||
vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 | |||
mul 10,3,9,11,14,15,8,2 | |||
reduce | |||
update 8,4,6,5,7,10,3,9,11 | |||
vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) | |||
vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) | |||
vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) | |||
vmovdqa %ymm11,(128*\off+112)*2(%rdi) | |||
.endm | |||
.text | |||
.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx) | |||
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx): | |||
_cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
mov %rsi,%rdx | |||
add $_ZETAS_EXP*2,%rsi | |||
call ntt_level0_avx | |||
add $128,%rdi | |||
call ntt_level0_avx | |||
sub $128,%rdi | |||
add $8,%rsi | |||
call ntt_levels1t6_avx | |||
add $256,%rdi | |||
add $392,%rsi | |||
call ntt_levels1t6_avx | |||
level0 0 | |||
level0 1 | |||
levels1t6 0 | |||
levels1t6 1 | |||
ret |
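The level0 and levels1t6 macros implement the forward NTT with the mul/reduce/update split described in the comment on dependency chains, processing 16 coefficients per register. A scalar sketch of the loop they vectorize, in the style of the portable implementation (zetas is the table of 128 twiddle factors in Montgomery form, and fqmul is the helper sketched after the inverse-NTT diff above; both are assumptions from that reference, not part of this diff):

#include <stdint.h>

static int16_t fqmul(int16_t a, int16_t b);   /* as in the Montgomery sketch above */

/* Scalar forward NTT: seven levels of Cooley-Tukey butterflies.
 * The asm's mul+reduce computes t, and update forms the sum/difference pair. */
static void ntt_scalar(int16_t r[256], const int16_t zetas[128]) {
    unsigned int len, start, j, k = 1;
    for (len = 128; len >= 2; len >>= 1) {              /* levels 0..6 */
        for (start = 0; start < 256; start = j + len) {
            int16_t zeta = zetas[k++];
            for (j = start; j < start + len; j++) {
                int16_t t = fqmul(zeta, r[j + len]);
                r[j + len] = r[j] - t;
                r[j]       = r[j] + t;
            }
        }
    }
}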
@@ -1,24 +1,21 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_NTT_H | |||
#define PQCLEAN_KYBER102490S_AVX2_NTT_H | |||
#include "consts.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, | |||
const int16_t *a, | |||
const int16_t *b, | |||
const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, | |||
const int16_t *a, | |||
const int16_t *b, | |||
const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_basemul_avx(__m256i *r, | |||
const __m256i *a, | |||
const __m256i *b, | |||
const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
#endif |
@@ -7,8 +7,6 @@ | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
#define KYBER_ETA 2 | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
@@ -16,9 +14,12 @@ | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_ETA1 2 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) | |||
#define KYBER_ETA2 2 | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
@@ -12,76 +12,99 @@ | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_compress | |||
* | |||
* Description: Compression and subsequent serialization of a polynomial | |||
* Description: Compression and subsequent serialization of a polynomial. | |||
* The coefficients of the input polynomial are assumed to | |||
* lie in the interval [0,q], i.e. the polynomial must be reduced | |||
* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { | |||
unsigned int i, j; | |||
uint8_t t[8]; | |||
PQCLEAN_KYBER102490S_AVX2_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[160], const poly *restrict a) { | |||
size_t i; | |||
uint32_t low; | |||
__m256i f0, f1; | |||
__m128i t0, t1; | |||
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XV / 16]); | |||
const __m256i shift1 = _mm256_set1_epi16(1 << 10); | |||
const __m256i mask = _mm256_set1_epi16(31); | |||
const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1); | |||
const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1); | |||
const __m256i sllvdidx = _mm256_set1_epi64x(12); | |||
const __m256i shufbidx = _mm256_set_epi8( 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9, | |||
-1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0); | |||
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); | |||
r[2] = (t[3] >> 1) | (t[4] << 4); | |||
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); | |||
r[4] = (t[6] >> 2) | (t[7] << 3); | |||
r += 5; | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_load_si256(&a->vec[2 * i + 0]); | |||
f1 = _mm256_load_si256(&a->vec[2 * i + 1]); | |||
f0 = _mm256_mulhi_epi16(f0, v); | |||
f1 = _mm256_mulhi_epi16(f1, v); | |||
f0 = _mm256_mulhrs_epi16(f0, shift1); | |||
f1 = _mm256_mulhrs_epi16(f1, shift1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f1 = _mm256_and_si256(f1, mask); | |||
f0 = _mm256_packus_epi16(f0, f1); | |||
f0 = _mm256_maddubs_epi16(f0, shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 | |||
f0 = _mm256_madd_epi16(f0, shift3); // a0 a1 b0 b1 a2 a3 b2 b3 | |||
f0 = _mm256_sllv_epi32(f0, sllvdidx); | |||
f0 = _mm256_srlv_epi64(f0, sllvdidx); | |||
f0 = _mm256_shuffle_epi8(f0, shufbidx); | |||
t0 = _mm256_castsi256_si128(f0); | |||
t1 = _mm256_extracti128_si256(f0, 1); | |||
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); | |||
_mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); | |||
_mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); | |||
r[20 * i + 16] = (uint8_t)low; | |||
r[20 * i + 17] = (uint8_t)(low >> 0x08); | |||
r[20 * i + 18] = (uint8_t)(low >> 0x10); | |||
r[20 * i + 19] = (uint8_t)(low >> 0x18); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_decompress | |||
* | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, | |||
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, const uint8_t a[160]) { | |||
unsigned int i; | |||
int16_t h; | |||
__m128i t; | |||
__m256i f; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]); | |||
const __m256i shufbidx = _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, | |||
4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0); | |||
const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31, | |||
248, 1984, 62, 496, 3968, 124, 992, 31); | |||
const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024, | |||
128, 16, 512, 64, 8, 256, 32, 1024); | |||
unsigned int j; | |||
uint8_t t[8]; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t[0] = (a[0] >> 0); | |||
t[1] = (a[0] >> 5) | (a[1] << 3); | |||
t[2] = (a[1] >> 2); | |||
t[3] = (a[1] >> 7) | (a[2] << 1); | |||
t[4] = (a[2] >> 4) | (a[3] << 4); | |||
t[5] = (a[3] >> 1); | |||
t[6] = (a[3] >> 6) | (a[4] << 2); | |||
t[7] = (a[4] >> 3); | |||
a += 5; | |||
for (j = 0; j < 8; j++) { | |||
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; | |||
} | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]); | |||
h = (a[10 * i + 9] << 8) + a[10 * i + 8]; | |||
t = _mm_insert_epi16(t, h, 4); | |||
f = _mm256_broadcastsi128_si256(t); | |||
f = _mm256_shuffle_epi8(f, shufbidx); | |||
f = _mm256_and_si256(f, mask); | |||
f = _mm256_mullo_epi16(f, shift); | |||
f = _mm256_mulhrs_epi16(f, q); | |||
_mm256_store_si256(&r->vec[i], f); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tobytes | |||
* | |||
* Description: Serialization of a polynomial | |||
* Description: Serialization of a polynomial in NTT representation. | |||
* The coefficients of the input polynomial are assumed to | |||
* lie in the interval [0,q], i.e. the polynomial must be reduced | |||
* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). The coefficients are ordered as output by | |||
* PQCLEAN_KYBER102490S_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed | |||
* order. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -90,12 +113,12 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) | |||
* Description: De-serialization of a polynomial; | |||
* inverse of PQCLEAN_KYBER102490S_AVX2_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { | |||
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -103,11 +126,10 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POL | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, | |||
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3; | |||
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); | |||
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); | |||
@@ -136,12 +158,12 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, | |||
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ | |||
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ | |||
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) | |||
_mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ | |||
_mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ | |||
_mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ | |||
_mm256_store_si256(&r->vec[8+2*(i)+1],g3) | |||
f = _mm256_load_si256((__m256i *)msg); | |||
f = _mm256_loadu_si256((__m256i *)msg); | |||
FROMMSG64(0); | |||
FROMMSG64(1); | |||
FROMMSG64(2); | |||
@@ -151,32 +173,34 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* Description: Convert polynomial to 32-byte message. | |||
* The coefficients of the input polynomial are assumed to | |||
* lie in the interval [0,q], i.e. the polynomial must be reduced | |||
* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { | |||
unsigned int i; | |||
uint32_t small; | |||
__m256i f0, f1, g0, g1; | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); | |||
const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); | |||
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); | |||
f0 = _mm256_sub_epi16(hqs, f0); | |||
f1 = _mm256_sub_epi16(hqs, f1); | |||
f0 = _mm256_load_si256(&a->vec[2 * i + 0]); | |||
f1 = _mm256_load_si256(&a->vec[2 * i + 1]); | |||
f0 = _mm256_sub_epi16(hq, f0); | |||
f1 = _mm256_sub_epi16(hq, f1); | |||
g0 = _mm256_srai_epi16(f0, 15); | |||
g1 = _mm256_srai_epi16(f1, 15); | |||
f0 = _mm256_xor_si256(f0, g0); | |||
f1 = _mm256_xor_si256(f1, g1); | |||
f0 = _mm256_sub_epi16(hhqs, f0); | |||
f1 = _mm256_sub_epi16(hhqs, f1); | |||
f0 = _mm256_sub_epi16(f0, hhq); | |||
f1 = _mm256_sub_epi16(f1, hhq); | |||
f0 = _mm256_packs_epi16(f0, f1); | |||
small = _mm256_movemask_epi8(f0); | |||
small = ~small; | |||
msg[4 * i + 0] = small; | |||
msg[4 * i + 1] = small >> 16; | |||
msg[4 * i + 2] = small >> 8; | |||
@@ -185,21 +209,39 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], po | |||
} | |||
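poly_tomsg extracts one message bit per coefficient by testing whether the coefficient is closer to q/2 than to 0 or q; the subtract/sign-mask/pack/movemask sequence above does this for 32 coefficients per iteration. A scalar restatement of the same threshold, in the style of the portable implementation (the division by KYBER_Q is for clarity only and is not constant time):

#include "params.h"
#include "poly.h"

/* Scalar sketch: bit j of msg[i] is round(2*c/q) mod 2 for coefficient
 * c = a->coeffs[8*i+j], assumed to lie in [0,q]. Illustration only. */
static void poly_tomsg_scalar(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) {
    unsigned int i, j;
    for (i = 0; i < KYBER_N / 8; i++) {
        msg[i] = 0;
        for (j = 0; j < 8; j++) {
            uint32_t t = (uint32_t)a->coeffs[8 * i + j];
            t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1;   /* not constant time */
            msg[i] |= (uint8_t)(t << j);
        }
    }
}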
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA1 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 | |||
prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(r, buf.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* with parameter KYBER_ETA2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; | |||
prf(buf.arr, sizeof(buf.arr), seed, nonce); | |||
PQCLEAN_KYBER102490S_AVX2_cbd(r, buf.arr); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; | |||
prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); | |||
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(r, buf.vec); | |||
} | |||
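Both noise samplers fill a buffer with PRF output and then convert it to coefficients drawn from a centered binomial distribution; with KYBER_ETA1 = KYBER_ETA2 = 2 that is the same distribution for both. A scalar sketch of the eta = 2 conversion that poly_cbd_eta1/eta2 vectorize, following the portable implementation:

#include "params.h"
#include "poly.h"

/* Scalar centered-binomial sampling with eta = 2: each coefficient is
 * (a0 + a1) - (b0 + b1) with the four bits taken from the PRF output. */
static void cbd2_scalar(poly *r, const uint8_t buf[2 * KYBER_N / 4]) {
    unsigned int i, j;
    for (i = 0; i < KYBER_N / 8; i++) {
        /* Little-endian load of 4 bytes of PRF output */
        uint32_t t = (uint32_t)buf[4 * i + 0] | ((uint32_t)buf[4 * i + 1] << 8) |
                     ((uint32_t)buf[4 * i + 2] << 16) | ((uint32_t)buf[4 * i + 3] << 24);
        uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555); /* pairwise bit sums */
        for (j = 0; j < 8; j++) {
            int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
            int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
            r->coeffs[8 * i + j] = a - b;
        }
    }
}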
@@ -207,13 +249,17 @@ void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_S | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_ntt | |||
* | |||
* Description: Computes negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
* inputs assumed to be in normal order, output in bitreversed order | |||
* a polynomial in place. | |||
* Input coefficients assumed to be in normal order, | |||
* output coefficients are in a special order that is natural | |||
* for the vectorization. Input coefficients are assumed to be | |||
* bounded by q in absolute value, output coefficients are bounded | |||
* by 16118 in absolute value. | |||
* | |||
* Arguments: - uint16_t *r: pointer to in/output polynomial | |||
* Arguments: - poly *r: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -221,29 +267,35 @@ void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) { | |||
* | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) | |||
* of a polynomial in place; | |||
* inputs assumed to be in bitreversed order, output in normal order | |||
* Input coefficients assumed to be in the special order produced by the | |||
* vectorized forward NTT, output in normal order. Input coefficients can be | |||
* arbitrary 16-bit integers, output coefficients are bounded by 14870 | |||
* in absolute value. | |||
* | |||
* Arguments: - uint16_t *a: pointer to in/output polynomial | |||
* Arguments: - poly *a: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* Description: Multiplication of two polynomials in NTT domain. | |||
* One of the input polynomials needs to have coefficients | |||
* bounded by q, the other polynomial can have arbitrary | |||
* coefficients. Output coefficients are bounded by 6656. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -255,7 +307,7 @@ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, c | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -267,28 +319,16 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) { | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) { | |||
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); | |||
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* Description: Add two polynomials. No modular reduction | |||
* is performed. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -296,20 +336,21 @@ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { | |||
unsigned int i; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f0 = _mm256_load_si256(&a->vec[i]); | |||
f1 = _mm256_load_si256(&b->vec[i]); | |||
f0 = _mm256_add_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
_mm256_store_si256(&r->vec[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* Description: Subtract two polynomials. No modular reduction | |||
* is performed. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -317,10 +358,10 @@ void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { | |||
unsigned int i; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f0 = _mm256_load_si256(&a->vec[i]); | |||
f1 = _mm256_load_si256(&b->vec[i]); | |||
f0 = _mm256_sub_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
_mm256_store_si256(&r->vec[i], f0); | |||
} | |||
} |
@@ -1,19 +1,13 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_POLY_H | |||
#define PQCLEAN_KYBER102490S_AVX2_POLY_H | |||
#include "align.h" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/* | |||
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial | |||
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1] | |||
*/ | |||
typedef union { | |||
__m256i dummy; | |||
int16_t coeffs[KYBER_N]; | |||
} poly; | |||
typedef ALIGNED_INT16(KYBER_N) poly; | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
@@ -22,7 +16,11 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POL | |||
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r); | |||
@@ -31,7 +29,6 @@ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, c | |||
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); | |||
@@ -3,8 +3,79 @@ | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
static void poly_compress11(uint8_t r[352 + 2], const poly *restrict a) { | |||
unsigned int i; | |||
__m256i f0, f1, f2; | |||
__m128i t0, t1; | |||
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XV / 16]); | |||
const __m256i v8 = _mm256_slli_epi16(v, 3); | |||
const __m256i off = _mm256_set1_epi16(36); | |||
const __m256i shift1 = _mm256_set1_epi16(1 << 13); | |||
const __m256i mask = _mm256_set1_epi16(2047); | |||
const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1); | |||
const __m256i sllvdidx = _mm256_set1_epi64x(10); | |||
const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10); | |||
const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, | |||
-1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f0 = _mm256_load_si256(&a->vec[i]); | |||
f1 = _mm256_mullo_epi16(f0, v8); | |||
f2 = _mm256_add_epi16(f0, off); | |||
f0 = _mm256_slli_epi16(f0, 3); | |||
f0 = _mm256_mulhi_epi16(f0, v); | |||
f2 = _mm256_sub_epi16(f1, f2); | |||
f1 = _mm256_andnot_si256(f1, f2); | |||
f1 = _mm256_srli_epi16(f1, 15); | |||
f0 = _mm256_sub_epi16(f0, f1); | |||
f0 = _mm256_mulhrs_epi16(f0, shift1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f0 = _mm256_madd_epi16(f0, shift2); | |||
f0 = _mm256_sllv_epi32(f0, sllvdidx); | |||
f1 = _mm256_bsrli_epi128(f0, 8); | |||
f0 = _mm256_srlv_epi64(f0, srlvqidx); | |||
f1 = _mm256_slli_epi64(f1, 34); | |||
f0 = _mm256_add_epi64(f0, f1); | |||
f0 = _mm256_shuffle_epi8(f0, shufbidx); | |||
t0 = _mm256_castsi256_si128(f0); | |||
t1 = _mm256_extracti128_si256(f0, 1); | |||
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); | |||
_mm_storeu_si128((__m128i *)&r[22 * i + 0], t0); | |||
_mm_storel_epi64((__m128i *)&r[22 * i + 16], t1); | |||
} | |||
} | |||
static void poly_decompress11(poly *restrict r, const uint8_t a[352 + 10]) { | |||
unsigned int i; | |||
__m256i f; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]); | |||
const __m256i shufbidx = _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, | |||
8, 7, 6, 5, 5, 4, 4, 3, | |||
10, 9, 9, 8, 7, 6, 6, 5, | |||
5, 4, 3, 2, 2, 1, 1, 0); | |||
const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0); | |||
const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0); | |||
const __m256i shift = _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32); | |||
const __m256i mask = _mm256_set1_epi16(32752); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f = _mm256_loadu_si256((__m256i *)&a[22 * i]); | |||
f = _mm256_permute4x64_epi64(f, 0x94); | |||
f = _mm256_shuffle_epi8(f, shufbidx); | |||
f = _mm256_srlv_epi32(f, srlvdidx); | |||
f = _mm256_srlv_epi64(f, srlvqidx); | |||
f = _mm256_mullo_epi16(f, shift); | |||
f = _mm256_srli_epi16(f, 1); | |||
f = _mm256_and_si256(f, mask); | |||
f = _mm256_mulhrs_epi16(f, q); | |||
_mm256_store_si256(&r->vec[i], f); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_compress | |||
* | |||
@@ -14,33 +85,11 @@ | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], | |||
polyvec *restrict a) { | |||
size_t i, j, k; | |||
PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { | |||
size_t i; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
} | |||
r[ 0] = (t[0] >> 0); | |||
r[ 1] = (t[0] >> 8) | (t[1] << 3); | |||
r[ 2] = (t[1] >> 5) | (t[2] << 6); | |||
r[ 3] = (t[2] >> 2); | |||
r[ 4] = (t[2] >> 10) | (t[3] << 1); | |||
r[ 5] = (t[3] >> 7) | (t[4] << 4); | |||
r[ 6] = (t[4] >> 4) | (t[5] << 7); | |||
r[ 7] = (t[5] >> 1); | |||
r[ 8] = (t[5] >> 9) | (t[6] << 2); | |||
r[ 9] = (t[6] >> 6) | (t[7] << 5); | |||
r[10] = (t[7] >> 3); | |||
r += 11; | |||
} | |||
poly_compress11(&r[352 * i], &a->vec[i]); | |||
} | |||
} | |||
@@ -50,31 +99,15 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSE | |||
* Description: De-serialize and decompress vector of polynomials; | |||
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_compress | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *restrict r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
size_t i, j, k; | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { | |||
size_t i; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); | |||
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); | |||
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); | |||
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); | |||
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); | |||
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); | |||
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); | |||
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); | |||
a += 11; | |||
for (k = 0; k < 8; k++) { | |||
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; | |||
} | |||
} | |||
poly_decompress11(&r->vec[i], &a[352 * i]); | |||
} | |||
} | |||
@@ -100,7 +133,7 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], po | |||
* Description: De-serialize vector of polynomials; | |||
* inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECBYTES) | |||
**************************************************/ | |||
@@ -141,29 +174,34 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r) { | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* Description: Multiply elements in a and b in NTT domain, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { | |||
size_t i; | |||
poly tmp; | |||
PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); | |||
for (i = 1; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); | |||
PQCLEAN_KYBER102490S_AVX2_poly_add(r, r, &tmp); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* of each element of a vector of polynomials; | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) { | |||
size_t i; | |||
@@ -172,23 +210,6 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) { | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) { | |||
size_t i; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_AVX2_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_add | |||
* | |||
@@ -8,9 +8,8 @@ typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
@@ -18,12 +17,9 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYB | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
@@ -1,10 +1,9 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_REDUCE_H | |||
#define PQCLEAN_KYBER102490S_AVX2_REDUCE_H | |||
#include "consts.h" | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include <immintrin.h> | |||
int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER102490S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
void PQCLEAN_KYBER102490S_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); | |||
#endif |
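For readers following along without the AVX2 sources, the per-coefficient arithmetic these kernels vectorize can be sketched in portable C as below (an illustrative reference following the usual Kyber reduce.c approach, assuming KYBER_Q = 3329 and two's-complement arithmetic right shifts; it is not the code path used by this implementation):

#include <stdint.h>

#define KYBER_Q 3329

/* Barrett reduction: returns a representative of a mod KYBER_Q,
 * centered roughly around 0. */
static int16_t barrett_reduce_ref(int16_t a) {
    const int16_t v = ((1 << 26) + KYBER_Q / 2) / KYBER_Q; /* = 20159 */
    int16_t t = (int16_t)(((int32_t)v * a + (1 << 25)) >> 26);
    return (int16_t)(a - t * KYBER_Q);
}

/* Conditional subtraction of q: maps a in [0, 2q) to [0, q) in constant time. */
static int16_t csubq_ref(int16_t a) {
    a = (int16_t)(a - KYBER_Q);
    a = (int16_t)(a + ((a >> 15) & KYBER_Q));
    return a;
}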
@@ -4,311 +4,68 @@ | |||
#include "rejsample.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
//#define BMI | |||
static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { | |||
{-1, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 2, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, -1, -1, -1, -1, -1, -1}, | |||
{ 4, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, -1, -1, -1, -1, -1}, | |||
{ 6, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, -1, -1, -1, -1, -1}, | |||
{ 4, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, -1, -1, -1, -1}, | |||
{ 8, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, -1, -1, -1, -1, -1}, | |||
{ 4, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, -1, -1, -1, -1}, | |||
{ 6, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, -1, -1, -1, -1}, | |||
{ 4, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, -1, -1, -1}, | |||
{10, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, -1, -1, -1, -1, -1}, | |||
{ 4, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, -1, -1, -1, -1}, | |||
{ 6, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, -1, -1, -1, -1}, | |||
{ 4, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, -1, -1, -1}, | |||
{ 8, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, -1, -1, -1, -1}, | |||
{ 4, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, -1, -1, -1}, | |||
{ 6, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, -1, -1, -1}, | |||
{ 4, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, -1, -1}, | |||
{12, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, -1, -1, -1, -1, -1}, | |||
{ 4, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, -1, -1, -1, -1}, | |||
{ 6, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, -1, -1, -1, -1}, | |||
{ 4, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, -1, -1, -1}, | |||
{ 8, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, -1, -1, -1, -1}, | |||
{ 4, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, -1, -1, -1}, | |||
{ 6, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, -1, -1, -1}, | |||
{ 4, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, -1, -1}, | |||
{10, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, -1, -1, -1, -1}, | |||
{ 4, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, -1, -1, -1}, | |||
{ 6, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, -1, -1, -1}, | |||
{ 4, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, -1, -1}, | |||
{ 8, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, -1, -1, -1}, | |||
{ 4, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, -1, -1}, | |||
{ 6, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, -1, -1}, | |||
{ 4, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, -1}, | |||
{14, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 14, -1, -1, -1, -1, -1}, | |||
{ 4, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 14, -1, -1, -1, -1}, | |||
{ 6, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 14, -1, -1, -1, -1}, | |||
{ 4, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 14, -1, -1, -1}, | |||
{ 8, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 14, -1, -1, -1, -1}, | |||
{ 4, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 14, -1, -1, -1}, | |||
{ 6, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 14, -1, -1, -1}, | |||
{ 4, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 14, -1, -1}, | |||
{10, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 14, -1, -1, -1, -1}, | |||
{ 4, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 14, -1, -1, -1}, | |||
{ 6, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 14, -1, -1, -1}, | |||
{ 4, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 14, -1, -1}, | |||
{ 8, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 14, -1, -1, -1}, | |||
{ 4, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 14, -1, -1}, | |||
{ 6, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 14, -1, -1}, | |||
{ 4, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 14, -1}, | |||
{12, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, 14, -1, -1, -1, -1}, | |||
{ 4, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, 14, -1, -1, -1}, | |||
{ 6, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, 14, -1, -1, -1}, | |||
{ 4, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, 14, -1, -1}, | |||
{ 8, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, 14, -1, -1, -1}, | |||
{ 4, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, 14, -1, -1}, | |||
{ 6, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, 14, -1, -1}, | |||
{ 4, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, 14, -1}, | |||
{10, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, 14, -1, -1, -1}, | |||
{ 4, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, 14, -1, -1}, | |||
{ 6, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, 14, -1, -1}, | |||
{ 4, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, 14, -1}, | |||
{ 8, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, 14, -1, -1}, | |||
{ 4, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, 14, -1}, | |||
{ 6, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, 14, -1}, | |||
{ 4, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 2, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 14} | |||
} | |||
}; | |||
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) | |||
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) | |||
#define REJ_UNIFORM_BUFLEN 576 | |||
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, | |||
const uint8_t *restrict buf) { | |||
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
uint16_t val0, val1; | |||
uint32_t good; | |||
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); | |||
uint64_t idx0, idx1, idx2, idx3; | |||
const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]); | |||
const __m256i ones = _mm256_set1_epi8(1); | |||
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XQ]); | |||
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XV]); | |||
const __m256i mask = _mm256_set1_epi16(0xFFF); | |||
const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, | |||
9, 8, 8, 7, 6, 5, 5, 4, | |||
11, 10, 10, 9, 8, 7, 7, 6, | |||
5, 4, 4, 3, 2, 1, 1, 0); | |||
__m256i f0, f1, g0, g1, g2, g3; | |||
__m128i f, t, pilo, pihi; | |||
ctr = 0; | |||
for (pos = 0; pos < 2 * KYBER_N; pos += 64) { | |||
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); | |||
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); | |||
g0 = _mm256_cmpge_epu16(bound, f0); | |||
g1 = _mm256_cmpge_epu16(bound, f1); | |||
ctr = pos = 0; | |||
while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { | |||
f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); | |||
f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); | |||
f0 = _mm256_permute4x64_epi64(f0, 0x94); | |||
f1 = _mm256_permute4x64_epi64(f1, 0x94); | |||
f0 = _mm256_shuffle_epi8(f0, idx8); | |||
f1 = _mm256_shuffle_epi8(f1, idx8); | |||
g0 = _mm256_srli_epi16(f0, 4); | |||
g1 = _mm256_srli_epi16(f1, 4); | |||
f0 = _mm256_blend_epi16(f0, g0, 0xAA); | |||
f1 = _mm256_blend_epi16(f1, g1, 0xAA); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f1 = _mm256_and_si256(f1, mask); | |||
pos += 48; | |||
g0 = _mm256_cmpgt_epi16(bound, f0); | |||
g1 = _mm256_cmpgt_epi16(bound, f1); | |||
g0 = _mm256_packs_epi16(g0, g1); | |||
good = _mm256_movemask_epi8(g0); | |||
g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); | |||
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); | |||
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); | |||
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); | |||
//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); | |||
//g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); | |||
/* Barrett reduction of (still unsigned) values */ | |||
g2 = _mm256_mulhi_epu16(f0, v); | |||
g3 = _mm256_mulhi_epu16(f1, v); | |||
g2 = _mm256_srli_epi16(g2, 10); | |||
g3 = _mm256_srli_epi16(g3, 10); | |||
g2 = _mm256_mullo_epi16(g2, kyberq); | |||
g3 = _mm256_mullo_epi16(g3, kyberq); | |||
f0 = _mm256_sub_epi16(f0, g2); | |||
f1 = _mm256_sub_epi16(f1, g3); | |||
idx0 = _pdep_u64(good >> 0, 0x0101010101010101); | |||
idx1 = _pdep_u64(good >> 8, 0x0101010101010101); | |||
idx2 = _pdep_u64(good >> 16, 0x0101010101010101); | |||
idx3 = _pdep_u64(good >> 24, 0x0101010101010101); | |||
idx0 = (idx0 << 8) - idx0; | |||
idx0 = _pext_u64(0x0E0C0A0806040200, idx0); | |||
idx1 = (idx1 << 8) - idx1; | |||
idx1 = _pext_u64(0x0E0C0A0806040200, idx1); | |||
idx2 = (idx2 << 8) - idx2; | |||
idx2 = _pext_u64(0x0E0C0A0806040200, idx2); | |||
idx3 = (idx3 << 8) - idx3; | |||
idx3 = _pext_u64(0x0E0C0A0806040200, idx3); | |||
g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); | |||
g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); | |||
g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); | |||
g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); | |||
g2 = _mm256_add_epi8(g0, ones); | |||
g3 = _mm256_add_epi8(g1, ones); | |||
@@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, | |||
ctr += _mm_popcnt_u32((good >> 24) & 0xFF); | |||
} | |||
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { | |||
f = _mm_load_si128((__m128i *)&buf[pos]); | |||
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); | |||
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { | |||
f = _mm_loadu_si128((__m128i *)&buf[pos]); | |||
f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); | |||
t = _mm_srli_epi16(f, 4); | |||
f = _mm_blend_epi16(f, t, 0xAA); | |||
f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); | |||
pos += 12; | |||
t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); | |||
good = _mm_movemask_epi8(t); | |||
good = _pext_u32(good, 0x5555); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); | |||
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); | |||
pilo = _mm_unpacklo_epi8(pilo, pihi); | |||
/* Barrett reduction */ | |||
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); | |||
t = _mm_srli_epi16(t, 10); | |||
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); | |||
f = _mm_sub_epi16(f, t); | |||
good &= 0x5555; | |||
idx0 = _pdep_u64(good, 0x1111111111111111); | |||
idx0 = (idx0 << 8) - idx0; | |||
idx0 = _pext_u64(0x0E0C0A0806040200, idx0); | |||
pilo = _mm_cvtsi64_si128(idx0); | |||
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); | |||
pilo = _mm_unpacklo_epi8(pilo, pihi); | |||
f = _mm_shuffle_epi8(f, pilo); | |||
_mm_storeu_si128((__m128i *)&r[ctr], f); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 16; | |||
} | |||
while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { | |||
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; | |||
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); | |||
pos += 3; | |||
if (val < 19 * KYBER_Q) { | |||
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; | |||
r[ctr++] = val; | |||
if (val0 < KYBER_Q) { | |||
r[ctr++] = val0; | |||
} | |||
if (val1 < KYBER_Q && ctr < KYBER_N) { | |||
r[ctr++] = val1; | |||
} | |||
} | |||
@@ -1,9 +1,12 @@ | |||
#ifndef PQCLEAN_KYBER102490S_AVX2_REJSAMPLE_H | |||
#define PQCLEAN_KYBER102490S_AVX2_REJSAMPLE_H | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, | |||
const unsigned char *buf); | |||
#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) | |||
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); | |||
#endif |
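As a quick sanity check on the new buffer macros, assuming the kyber1024-90s parameters KYBER_N = 256, KYBER_Q = 3329 and XOF_BLOCKBYTES = AES256CTR_BLOCKBYTES = 64, the expressions evaluate to 8 blocks and a 512-byte buffer; a hypothetical standalone check (not part of the build):

#include <assert.h>

/* Assumed parameters (see params.h and aes256ctr.h). */
#define KYBER_N 256
#define KYBER_Q 3329
#define XOF_BLOCKBYTES 64

#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES)

int main(void) {
    /* 12*256/8 = 384 bytes of packed 12-bit values; 384*4096/3329 = 472;
     * (472 + 64)/64 = 8 blocks, i.e. a 512-byte buffer. */
    assert(REJ_UNIFORM_AVX_NBLOCKS == 8);
    assert(REJ_UNIFORM_AVX_BUFLEN == 512);
    return 0;
}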
@@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 | |||
#csubq | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,1 | |||
csubq 6,13 | |||
csubq 7,13 | |||
csubq 8,13 | |||
csubq 9,13 | |||
csubq 10,14 | |||
csubq 11,15 | |||
csubq 12,1 | |||
csubq 10,13 | |||
csubq 11,13 | |||
csubq 12,13 | |||
#bitpack | |||
vpsllw $12,%ymm6,%ymm4 | |||
@@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle2 r0,r1,r2,r3 | |||
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 | |||
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 | |||
vpsllq $32,%ymm\r1,%ymm12 | |||
vpsrlq $32,%ymm\r0,%ymm13 | |||
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 | |||
#vpsllq $32,%ymm\r1,%ymm\r2 | |||
vmovsldup %ymm\r1,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrlq $32,%ymm\r0,%ymm\r0 | |||
#vmovshdup %ymm\r0,%ymm\r0 | |||
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle1 r0,r1,r2,r3 | |||
vpslld $16,%ymm\r1,%ymm12 | |||
vpsrld $16,%ymm\r0,%ymm13 | |||
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 | |||
vpslld $16,%ymm\r1,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrld $16,%ymm\r0,%ymm\r0 | |||
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm |
@@ -14,12 +14,10 @@ typedef aes256ctr_ctx xof_state; | |||
#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, SEED, X, Y) \ | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) \ | |||
PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
@@ -8,31 +8,31 @@ | |||
* | |||
* Description: Compare two arrays for equality in constant time. | |||
* | |||
* Arguments: const unsigned char *a: pointer to first byte array | |||
* const unsigned char *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* Arguments: const uint8_t *a: pointer to first byte array | |||
* const uint8_t *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* | |||
* Returns 0 if the byte arrays are equal, 1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t pos; | |||
size_t i; | |||
uint64_t r; | |||
__m256i avec, bvec, cvec; | |||
__m256i f, g, h; | |||
cvec = _mm256_setzero_si256(); | |||
for (pos = 0; pos + 32 <= len; pos += 32) { | |||
avec = _mm256_loadu_si256((__m256i *)&a[pos]); | |||
bvec = _mm256_loadu_si256((__m256i *)&b[pos]); | |||
avec = _mm256_xor_si256(avec, bvec); | |||
cvec = _mm256_or_si256(cvec, avec); | |||
h = _mm256_setzero_si256(); | |||
for (i = 0; i < len / 32; i++) { | |||
f = _mm256_loadu_si256((__m256i *)&a[32 * i]); | |||
g = _mm256_loadu_si256((__m256i *)&b[32 * i]); | |||
f = _mm256_xor_si256(f, g); | |||
h = _mm256_or_si256(h, f); | |||
} | |||
r = 1 - _mm256_testz_si256(cvec, cvec); | |||
r = 1 - _mm256_testz_si256(h, h); | |||
if (pos < len) { | |||
avec = _mm256_loadu_si256((__m256i *)&a[pos]); | |||
bvec = _mm256_loadu_si256((__m256i *)&b[pos]); | |||
cvec = _mm256_cmpeq_epi8(avec, bvec); | |||
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); | |||
a += 32 * i; | |||
b += 32 * i; | |||
len -= 32 * i; | |||
for (i = 0; i < len; i++) { | |||
r |= a[i] ^ b[i]; | |||
} | |||
r = (-r) >> 63; | |||
@@ -47,29 +47,27 @@ int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t | |||
* assumes two's complement representation of negative integers. | |||
* Runs in constant time. | |||
* | |||
* Arguments: unsigned char *r: pointer to output byte array | |||
* Arguments: unsigned char *r: pointer to output byte array | |||
* const unsigned char *x: pointer to input byte array | |||
* size_t len: Number of bytes to be copied | |||
* unsigned char b: Condition bit; has to be in {0,1} | |||
* size_t len: Number of bytes to be copied | |||
* unsigned char b: Condition bit; has to be in {0,1} | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { | |||
size_t pos; | |||
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { | |||
size_t i; | |||
__m256i xvec, rvec, bvec; | |||
b = -b; | |||
bvec = _mm256_set1_epi8(b); | |||
for (pos = 0; pos + 32 <= len; pos += 32) { | |||
rvec = _mm256_loadu_si256((__m256i *)&r[pos]); | |||
xvec = _mm256_loadu_si256((__m256i *)&x[pos]); | |||
xvec = _mm256_xor_si256(xvec, rvec); | |||
xvec = _mm256_and_si256(xvec, bvec); | |||
rvec = _mm256_xor_si256(rvec, xvec); | |||
_mm256_storeu_si256((__m256i *)&r[pos], rvec); | |||
bvec = _mm256_set1_epi64x(-(uint64_t)b); | |||
for (i = 0; i < len / 32; i++) { | |||
rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); | |||
xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); | |||
rvec = _mm256_blendv_epi8(rvec, xvec, bvec); | |||
_mm256_storeu_si256((__m256i *)&r[32 * i], rvec); | |||
} | |||
while (pos < len) { | |||
r[pos] ^= b & (x[pos] ^ r[pos]); | |||
pos += 1; | |||
r += 32 * i; | |||
x += 32 * i; | |||
len -= 32 * i; | |||
for (i = 0; i < len; i++) { | |||
r[i] ^= -b & (x[i] ^ r[i]); | |||
} | |||
} |
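The scalar tail loops above use the familiar portable constant-time idioms; a self-contained sketch of the same two primitives (illustrative only, not the AVX2 entry points) looks like this:

#include <stddef.h>
#include <stdint.h>

/* Returns 0 if the two byte arrays are equal, 1 otherwise, in constant time. */
static int verify_ref(const uint8_t *a, const uint8_t *b, size_t len) {
    uint64_t r = 0;
    for (size_t i = 0; i < len; i++) {
        r |= (uint64_t)(a[i] ^ b[i]);
    }
    return (int)((-r) >> 63); /* 0 iff r == 0 */
}

/* Copies x into r if b == 1, leaves r untouched if b == 0, in constant time. */
static void cmov_ref(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
    uint8_t mask = (uint8_t)(-b); /* 0x00 or 0xFF */
    for (size_t i = 0; i < len; i++) {
        r[i] ^= mask & (x[i] ^ r[i]);
    }
}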
@@ -1,8 +1,8 @@ | |||
# This Makefile can be used with GNU Make or BSD Make | |||
LIB=libkyber1024-90s_clean.a | |||
HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric-aes.h symmetric.h verify.h | |||
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o | |||
HEADERS=aes256ctr.h api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric.h verify.h | |||
OBJECTS=aes256ctr.o cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o | |||
CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -2,7 +2,7 @@ | |||
# nmake /f Makefile.Microsoft_nmake | |||
LIBRARY=libkyber1024-90s_clean.lib | |||
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj | |||
OBJECTS=aes256ctr.obj cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj | |||
# Warning C4146 is raised when a unary minus operator is applied to an | |||
# unsigned type; this has nonetheless been standard and portable for as | |||
@@ -0,0 +1,564 @@ | |||
#include "aes256ctr.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
/* | |||
* Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
* SOFTWARE. | |||
*/ | |||
static inline uint32_t br_dec32le(const uint8_t *src) { | |||
return (uint32_t)src[0] | |||
| ((uint32_t)src[1] << 8) | |||
| ((uint32_t)src[2] << 16) | |||
| ((uint32_t)src[3] << 24); | |||
} | |||
static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) { | |||
while (num-- > 0) { | |||
*v ++ = br_dec32le(src); | |||
src += 4; | |||
} | |||
} | |||
static inline uint32_t br_swap32(uint32_t x) { | |||
x = ((x & (uint32_t)0x00FF00FF) << 8) | |||
| ((x >> 8) & (uint32_t)0x00FF00FF); | |||
return (x << 16) | (x >> 16); | |||
} | |||
static inline void br_enc32le(uint8_t *dst, uint32_t x) { | |||
dst[0] = (uint8_t)x; | |||
dst[1] = (uint8_t)(x >> 8); | |||
dst[2] = (uint8_t)(x >> 16); | |||
dst[3] = (uint8_t)(x >> 24); | |||
} | |||
static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) { | |||
while (num-- > 0) { | |||
br_enc32le(dst, *v ++); | |||
dst += 4; | |||
} | |||
} | |||
static void br_aes_ct64_bitslice_Sbox(uint64_t *q) { | |||
/* | |||
* This S-box implementation is a straightforward translation of | |||
* the circuit described by Boyar and Peralta in "A new | |||
* combinational logic minimization technique with applications | |||
* to cryptology" (https://eprint.iacr.org/2009/191.pdf). | |||
* | |||
* Note that variables x* (input) and s* (output) are numbered | |||
* in "reverse" order (x0 is the high bit, x7 is the low bit). | |||
*/ | |||
uint64_t x0, x1, x2, x3, x4, x5, x6, x7; | |||
uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; | |||
uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; | |||
uint64_t y20, y21; | |||
uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; | |||
uint64_t z10, z11, z12, z13, z14, z15, z16, z17; | |||
uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; | |||
uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; | |||
uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; | |||
uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; | |||
uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; | |||
uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; | |||
uint64_t t60, t61, t62, t63, t64, t65, t66, t67; | |||
uint64_t s0, s1, s2, s3, s4, s5, s6, s7; | |||
x0 = q[7]; | |||
x1 = q[6]; | |||
x2 = q[5]; | |||
x3 = q[4]; | |||
x4 = q[3]; | |||
x5 = q[2]; | |||
x6 = q[1]; | |||
x7 = q[0]; | |||
/* | |||
* Top linear transformation. | |||
*/ | |||
y14 = x3 ^ x5; | |||
y13 = x0 ^ x6; | |||
y9 = x0 ^ x3; | |||
y8 = x0 ^ x5; | |||
t0 = x1 ^ x2; | |||
y1 = t0 ^ x7; | |||
y4 = y1 ^ x3; | |||
y12 = y13 ^ y14; | |||
y2 = y1 ^ x0; | |||
y5 = y1 ^ x6; | |||
y3 = y5 ^ y8; | |||
t1 = x4 ^ y12; | |||
y15 = t1 ^ x5; | |||
y20 = t1 ^ x1; | |||
y6 = y15 ^ x7; | |||
y10 = y15 ^ t0; | |||
y11 = y20 ^ y9; | |||
y7 = x7 ^ y11; | |||
y17 = y10 ^ y11; | |||
y19 = y10 ^ y8; | |||
y16 = t0 ^ y11; | |||
y21 = y13 ^ y16; | |||
y18 = x0 ^ y16; | |||
/* | |||
* Non-linear section. | |||
*/ | |||
t2 = y12 & y15; | |||
t3 = y3 & y6; | |||
t4 = t3 ^ t2; | |||
t5 = y4 & x7; | |||
t6 = t5 ^ t2; | |||
t7 = y13 & y16; | |||
t8 = y5 & y1; | |||
t9 = t8 ^ t7; | |||
t10 = y2 & y7; | |||
t11 = t10 ^ t7; | |||
t12 = y9 & y11; | |||
t13 = y14 & y17; | |||
t14 = t13 ^ t12; | |||
t15 = y8 & y10; | |||
t16 = t15 ^ t12; | |||
t17 = t4 ^ t14; | |||
t18 = t6 ^ t16; | |||
t19 = t9 ^ t14; | |||
t20 = t11 ^ t16; | |||
t21 = t17 ^ y20; | |||
t22 = t18 ^ y19; | |||
t23 = t19 ^ y21; | |||
t24 = t20 ^ y18; | |||
t25 = t21 ^ t22; | |||
t26 = t21 & t23; | |||
t27 = t24 ^ t26; | |||
t28 = t25 & t27; | |||
t29 = t28 ^ t22; | |||
t30 = t23 ^ t24; | |||
t31 = t22 ^ t26; | |||
t32 = t31 & t30; | |||
t33 = t32 ^ t24; | |||
t34 = t23 ^ t33; | |||
t35 = t27 ^ t33; | |||
t36 = t24 & t35; | |||
t37 = t36 ^ t34; | |||
t38 = t27 ^ t36; | |||
t39 = t29 & t38; | |||
t40 = t25 ^ t39; | |||
t41 = t40 ^ t37; | |||
t42 = t29 ^ t33; | |||
t43 = t29 ^ t40; | |||
t44 = t33 ^ t37; | |||
t45 = t42 ^ t41; | |||
z0 = t44 & y15; | |||
z1 = t37 & y6; | |||
z2 = t33 & x7; | |||
z3 = t43 & y16; | |||
z4 = t40 & y1; | |||
z5 = t29 & y7; | |||
z6 = t42 & y11; | |||
z7 = t45 & y17; | |||
z8 = t41 & y10; | |||
z9 = t44 & y12; | |||
z10 = t37 & y3; | |||
z11 = t33 & y4; | |||
z12 = t43 & y13; | |||
z13 = t40 & y5; | |||
z14 = t29 & y2; | |||
z15 = t42 & y9; | |||
z16 = t45 & y14; | |||
z17 = t41 & y8; | |||
/* | |||
* Bottom linear transformation. | |||
*/ | |||
t46 = z15 ^ z16; | |||
t47 = z10 ^ z11; | |||
t48 = z5 ^ z13; | |||
t49 = z9 ^ z10; | |||
t50 = z2 ^ z12; | |||
t51 = z2 ^ z5; | |||
t52 = z7 ^ z8; | |||
t53 = z0 ^ z3; | |||
t54 = z6 ^ z7; | |||
t55 = z16 ^ z17; | |||
t56 = z12 ^ t48; | |||
t57 = t50 ^ t53; | |||
t58 = z4 ^ t46; | |||
t59 = z3 ^ t54; | |||
t60 = t46 ^ t57; | |||
t61 = z14 ^ t57; | |||
t62 = t52 ^ t58; | |||
t63 = t49 ^ t58; | |||
t64 = z4 ^ t59; | |||
t65 = t61 ^ t62; | |||
t66 = z1 ^ t63; | |||
s0 = t59 ^ t63; | |||
s6 = t56 ^ ~t62; | |||
s7 = t48 ^ ~t60; | |||
t67 = t64 ^ t65; | |||
s3 = t53 ^ t66; | |||
s4 = t51 ^ t66; | |||
s5 = t47 ^ t65; | |||
s1 = t64 ^ ~s3; | |||
s2 = t55 ^ ~t67; | |||
q[7] = s0; | |||
q[6] = s1; | |||
q[5] = s2; | |||
q[4] = s3; | |||
q[3] = s4; | |||
q[2] = s5; | |||
q[1] = s6; | |||
q[0] = s7; | |||
} | |||
static void br_aes_ct64_ortho(uint64_t *q) { | |||
#define SWAPN(cl, ch, s, x, y) do { \ | |||
uint64_t a, b; \ | |||
a = (x); \ | |||
b = (y); \ | |||
(x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \ | |||
(y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \ | |||
} while (0) | |||
#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) | |||
#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) | |||
#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) | |||
SWAP2(q[0], q[1]); | |||
SWAP2(q[2], q[3]); | |||
SWAP2(q[4], q[5]); | |||
SWAP2(q[6], q[7]); | |||
SWAP4(q[0], q[2]); | |||
SWAP4(q[1], q[3]); | |||
SWAP4(q[4], q[6]); | |||
SWAP4(q[5], q[7]); | |||
SWAP8(q[0], q[4]); | |||
SWAP8(q[1], q[5]); | |||
SWAP8(q[2], q[6]); | |||
SWAP8(q[3], q[7]); | |||
} | |||
static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) { | |||
uint64_t x0, x1, x2, x3; | |||
x0 = w[0]; | |||
x1 = w[1]; | |||
x2 = w[2]; | |||
x3 = w[3]; | |||
x0 |= (x0 << 16); | |||
x1 |= (x1 << 16); | |||
x2 |= (x2 << 16); | |||
x3 |= (x3 << 16); | |||
x0 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x1 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x2 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x3 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x0 |= (x0 << 8); | |||
x1 |= (x1 << 8); | |||
x2 |= (x2 << 8); | |||
x3 |= (x3 << 8); | |||
x0 &= (uint64_t)0x00FF00FF00FF00FF; | |||
x1 &= (uint64_t)0x00FF00FF00FF00FF; | |||
x2 &= (uint64_t)0x00FF00FF00FF00FF; | |||
x3 &= (uint64_t)0x00FF00FF00FF00FF; | |||
*q0 = x0 | (x2 << 8); | |||
*q1 = x1 | (x3 << 8); | |||
} | |||
static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) { | |||
uint64_t x0, x1, x2, x3; | |||
x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; | |||
x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; | |||
x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; | |||
x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; | |||
x0 |= (x0 >> 8); | |||
x1 |= (x1 >> 8); | |||
x2 |= (x2 >> 8); | |||
x3 |= (x3 >> 8); | |||
x0 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x1 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x2 &= (uint64_t)0x0000FFFF0000FFFF; | |||
x3 &= (uint64_t)0x0000FFFF0000FFFF; | |||
w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); | |||
w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); | |||
w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); | |||
w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); | |||
} | |||
static const uint8_t Rcon[] = { | |||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 | |||
}; | |||
static uint32_t sub_word(uint32_t x) { | |||
uint64_t q[8]; | |||
memset(q, 0, sizeof q); | |||
q[0] = x; | |||
br_aes_ct64_ortho(q); | |||
br_aes_ct64_bitslice_Sbox(q); | |||
br_aes_ct64_ortho(q); | |||
return (uint32_t)q[0]; | |||
} | |||
static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) { | |||
int i, j, k, nk, nkf; | |||
uint32_t tmp; | |||
uint32_t skey[60]; | |||
int key_len = 32; | |||
nk = (int)(key_len >> 2); | |||
nkf = (int)((14 + 1) << 2); | |||
br_range_dec32le(skey, (key_len >> 2), key); | |||
tmp = skey[(key_len >> 2) - 1]; | |||
for (i = nk, j = 0, k = 0; i < nkf; i ++) { | |||
if (j == 0) { | |||
tmp = (tmp << 24) | (tmp >> 8); | |||
tmp = sub_word(tmp) ^ Rcon[k]; | |||
} else if (nk > 6 && j == 4) { | |||
tmp = sub_word(tmp); | |||
} | |||
tmp ^= skey[i - nk]; | |||
skey[i] = tmp; | |||
if (++ j == nk) { | |||
j = 0; | |||
k ++; | |||
} | |||
} | |||
for (i = 0, j = 0; i < nkf; i += 4, j += 2) { | |||
uint64_t q[8]; | |||
br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); | |||
q[1] = q[0]; | |||
q[2] = q[0]; | |||
q[3] = q[0]; | |||
q[5] = q[4]; | |||
q[6] = q[4]; | |||
q[7] = q[4]; | |||
br_aes_ct64_ortho(q); | |||
comp_skey[j + 0] = | |||
(q[0] & (uint64_t)0x1111111111111111) | |||
| (q[1] & (uint64_t)0x2222222222222222) | |||
| (q[2] & (uint64_t)0x4444444444444444) | |||
| (q[3] & (uint64_t)0x8888888888888888); | |||
comp_skey[j + 1] = | |||
(q[4] & (uint64_t)0x1111111111111111) | |||
| (q[5] & (uint64_t)0x2222222222222222) | |||
| (q[6] & (uint64_t)0x4444444444444444) | |||
| (q[7] & (uint64_t)0x8888888888888888); | |||
} | |||
} | |||
static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) { | |||
unsigned u, v, n; | |||
n = (14 + 1) << 1; | |||
for (u = 0, v = 0; u < n; u ++, v += 4) { | |||
uint64_t x0, x1, x2, x3; | |||
x0 = x1 = x2 = x3 = comp_skey[u]; | |||
x0 &= (uint64_t)0x1111111111111111; | |||
x1 &= (uint64_t)0x2222222222222222; | |||
x2 &= (uint64_t)0x4444444444444444; | |||
x3 &= (uint64_t)0x8888888888888888; | |||
x1 >>= 1; | |||
x2 >>= 2; | |||
x3 >>= 3; | |||
skey[v + 0] = (x0 << 4) - x0; | |||
skey[v + 1] = (x1 << 4) - x1; | |||
skey[v + 2] = (x2 << 4) - x2; | |||
skey[v + 3] = (x3 << 4) - x3; | |||
} | |||
} | |||
static inline void add_round_key(uint64_t *q, const uint64_t *sk) { | |||
q[0] ^= sk[0]; | |||
q[1] ^= sk[1]; | |||
q[2] ^= sk[2]; | |||
q[3] ^= sk[3]; | |||
q[4] ^= sk[4]; | |||
q[5] ^= sk[5]; | |||
q[6] ^= sk[6]; | |||
q[7] ^= sk[7]; | |||
} | |||
static inline void shift_rows(uint64_t *q) { | |||
int i; | |||
for (i = 0; i < 8; i ++) { | |||
uint64_t x; | |||
x = q[i]; | |||
q[i] = (x & (uint64_t)0x000000000000FFFF) | |||
| ((x & (uint64_t)0x00000000FFF00000) >> 4) | |||
| ((x & (uint64_t)0x00000000000F0000) << 12) | |||
| ((x & (uint64_t)0x0000FF0000000000) >> 8) | |||
| ((x & (uint64_t)0x000000FF00000000) << 8) | |||
| ((x & (uint64_t)0xF000000000000000) >> 12) | |||
| ((x & (uint64_t)0x0FFF000000000000) << 4); | |||
} | |||
} | |||
static inline uint64_t rotr32(uint64_t x) { | |||
return (x << 32) | (x >> 32); | |||
} | |||
static inline void mix_columns(uint64_t *q) { | |||
uint64_t q0, q1, q2, q3, q4, q5, q6, q7; | |||
uint64_t r0, r1, r2, r3, r4, r5, r6, r7; | |||
q0 = q[0]; | |||
q1 = q[1]; | |||
q2 = q[2]; | |||
q3 = q[3]; | |||
q4 = q[4]; | |||
q5 = q[5]; | |||
q6 = q[6]; | |||
q7 = q[7]; | |||
r0 = (q0 >> 16) | (q0 << 48); | |||
r1 = (q1 >> 16) | (q1 << 48); | |||
r2 = (q2 >> 16) | (q2 << 48); | |||
r3 = (q3 >> 16) | (q3 << 48); | |||
r4 = (q4 >> 16) | (q4 << 48); | |||
r5 = (q5 >> 16) | (q5 << 48); | |||
r6 = (q6 >> 16) | (q6 << 48); | |||
r7 = (q7 >> 16) | (q7 << 48); | |||
q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); | |||
q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); | |||
q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); | |||
q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); | |||
q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); | |||
q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); | |||
q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); | |||
q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); | |||
} | |||
static void inc4_be(uint32_t *x) { | |||
*x = br_swap32(*x) + 4; | |||
*x = br_swap32(*x); | |||
} | |||
static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], uint64_t sk_exp[64]) { | |||
uint32_t w[16]; | |||
uint64_t q[8]; | |||
int i; | |||
memcpy(w, ivw, sizeof(w)); | |||
for (i = 0; i < 4; i++) { | |||
br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2)); | |||
} | |||
br_aes_ct64_ortho(q); | |||
add_round_key(q, sk_exp); | |||
for (i = 1; i < 14; i++) { | |||
br_aes_ct64_bitslice_Sbox(q); | |||
shift_rows(q); | |||
mix_columns(q); | |||
add_round_key(q, sk_exp + (i << 3)); | |||
} | |||
br_aes_ct64_bitslice_Sbox(q); | |||
shift_rows(q); | |||
add_round_key(q, sk_exp + 112); | |||
br_aes_ct64_ortho(q); | |||
for (i = 0; i < 4; i ++) { | |||
br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]); | |||
} | |||
br_range_enc32le(out, w, 16); | |||
/* Increase counter for next 4 blocks */ | |||
inc4_be(ivw + 3); | |||
inc4_be(ivw + 7); | |||
inc4_be(ivw + 11); | |||
inc4_be(ivw + 15); | |||
} | |||
static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) { | |||
uint64_t skey[30]; | |||
br_aes_ct64_keysched(skey, key); | |||
br_aes_ct64_skey_expand(sk_exp, skey); | |||
} | |||
static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) { | |||
uint32_t ivw[16]; | |||
size_t i; | |||
br_range_dec32le(ivw, 3, iv); | |||
memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); | |||
memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); | |||
memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); | |||
ivw[ 3] = br_swap32(cc); | |||
ivw[ 7] = br_swap32(cc + 1); | |||
ivw[11] = br_swap32(cc + 2); | |||
ivw[15] = br_swap32(cc + 3); | |||
while (len > 64) { | |||
aes_ctr4x(data, ivw, sk_exp); | |||
data += 64; | |||
len -= 64; | |||
} | |||
if (len > 0) { | |||
uint8_t tmp[64]; | |||
aes_ctr4x(tmp, ivw, sk_exp); | |||
for (i = 0; i < len; i++) { | |||
data[i] = tmp[i]; | |||
} | |||
} | |||
} | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) { | |||
uint64_t sk_exp[120]; | |||
br_aes_ct64_ctr_init(sk_exp, key); | |||
br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen); | |||
} | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) { | |||
br_aes_ct64_ctr_init(s->sk_exp, key); | |||
br_range_dec32le(s->ivw, 3, nonce); | |||
memcpy(s->ivw + 4, s->ivw, 3 * sizeof(uint32_t)); | |||
memcpy(s->ivw + 8, s->ivw, 3 * sizeof(uint32_t)); | |||
memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t)); | |||
s->ivw[ 3] = br_swap32(0); | |||
s->ivw[ 7] = br_swap32(1); | |||
s->ivw[11] = br_swap32(2); | |||
s->ivw[15] = br_swap32(3); | |||
} | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) { | |||
while (nblocks > 0) { | |||
aes_ctr4x(out, s->ivw, s->sk_exp); | |||
out += 64; | |||
nblocks--; | |||
} | |||
} |
@@ -0,0 +1,28 @@ | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_AES256CTR_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_AES256CTR_H | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#define AES256CTR_BLOCKBYTES 64 | |||
typedef struct { | |||
uint64_t sk_exp[120]; | |||
uint32_t ivw[16]; | |||
} aes256ctr_ctx; | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[32], | |||
const uint8_t nonce[12]); | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(aes256ctr_ctx *state, | |||
const uint8_t key[32], | |||
const uint8_t nonce[12]); | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, | |||
size_t nblocks, | |||
aes256ctr_ctx *state); | |||
#endif |
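A minimal usage sketch of this AES-256-CTR API as a Kyber XOF replacement (hypothetical driver; the all-zero key and nonce are placeholders):

#include <stdint.h>
#include "aes256ctr.h"

int main(void) {
    uint8_t key[32] = {0};    /* placeholder key */
    uint8_t nonce[12] = {0};  /* placeholder nonce */
    uint8_t stream[4 * AES256CTR_BLOCKBYTES];
    aes256ctr_ctx state;

    /* One-shot PRF: fill the output with keystream for (key, nonce). */
    PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(stream, sizeof stream, key, nonce);

    /* Incremental, XOF-style use: init once, then squeeze 64-byte blocks. */
    PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(&state, key, nonce);
    PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(stream, 4, &state);
    return 0;
}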
@@ -5,7 +5,7 @@ | |||
/************************************************* | |||
* Name: load32_littleendian | |||
* | |||
* Description: load bytes into a 32-bit integer | |||
* Description: load 4 bytes into a 32-bit integer | |||
* in little-endian order | |||
* | |||
* Arguments: - const uint8_t *x: pointer to input byte array | |||
@@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_cbd | |||
* Name: load24_littleendian | |||
* | |||
* Description: load 3 bytes into a 32-bit integer | |||
* in little-endian order. | |||
* This function is only needed for Kyber-512 | |||
* | |||
* Arguments: - const uint8_t *x: pointer to input byte array | |||
* | |||
* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) | |||
**************************************************/ | |||
/************************************************* | |||
* Name: cbd2 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* a centered binomial distribution with parameter eta=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { | |||
static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { | |||
unsigned int i, j; | |||
uint32_t t, d; | |||
int16_t a, b; | |||
@@ -48,3 +61,23 @@ void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: cbd3 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter eta=3. | |||
* This function is only needed for Kyber-512 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { | |||
cbd2(r, buf); | |||
} | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { | |||
cbd2(r, buf); | |||
} |
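The eta = 2 sampler wrapped by both functions above follows the standard centered-binomial construction; a minimal scalar sketch of one 32-bit step producing 8 coefficients (illustrative only, with the little-endian load inlined so the snippet is self-contained):

#include <stdint.h>

/* cbd2 sketch: each coefficient is a - b with a, b sums of two uniform bits,
 * i.e. a centered binomial sample with eta = 2, so values lie in {-2,...,2}. */
static void cbd2_sketch(int16_t r[8], const uint8_t buf[4]) {
    uint32_t t = (uint32_t)buf[0] | ((uint32_t)buf[1] << 8)
                 | ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24);
    /* Add adjacent bits pairwise: each 2-bit field of d now holds 0, 1 or 2. */
    uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555);
    for (unsigned j = 0; j < 8; j++) {
        int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
        int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
        r[j] = (int16_t)(a - b);
    }
}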
@@ -4,6 +4,8 @@ | |||
#include "poly.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); | |||
#endif |
@@ -15,8 +15,8 @@ | |||
* serialized vector of polynomials pk | |||
* and the public seed used to generate the matrix A. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
@@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
* Description: De-serialize public key from a byte array; | |||
* approximate inverse of pack_pk | |||
* | |||
* Arguments: - polyvec *pk: pointer to output public-key | |||
* polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate | |||
* matrix A | |||
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* - const uint8_t *packedpk: pointer to input serialized public key | |||
**************************************************/ | |||
static void unpack_pk(polyvec *pk, | |||
@@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, | |||
* | |||
* Description: Serialize the secret key | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
@@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
/************************************************* | |||
* Name: unpack_sk | |||
* | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* Description: De-serialize the secret key; inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of | |||
* polynomials (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, | |||
* and the compressed and serialized polynomial v | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* polyvec *b: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
* polyvec *b: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(r, b); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
@@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
@@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
uint16_t val0, val1; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; | |||
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; | |||
pos += 3; | |||
if (val < 19 * KYBER_Q) { | |||
val -= (val >> 12) * KYBER_Q; // Barrett reduction | |||
r[ctr++] = (int16_t)val; | |||
if (val0 < KYBER_Q) { | |||
r[ctr++] = val0; | |||
} | |||
if (ctr < len && val1 < KYBER_Q) { | |||
r[ctr++] = val1; | |||
} | |||
} | |||
@@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, | |||
* uniformly random. Performs rejection sampling on output of | |||
* a XOF | |||
* | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T | |||
* is generated | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
**************************************************/ | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
// Not static for benchmarking | |||
void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { | |||
unsigned int ctr, i, j; | |||
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; | |||
unsigned int ctr, i, j, k; | |||
unsigned int buflen, off; | |||
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; | |||
xof_state state; | |||
for (i = 0; i < KYBER_K; i++) { | |||
@@ -182,12 +173,17 @@ void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_ | |||
} | |||
xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); | |||
buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); | |||
while (ctr < KYBER_N) { | |||
xof_squeezeblocks(buf, 1, &state); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, | |||
XOF_BLOCKBYTES); | |||
off = buflen % 3; | |||
for (k = 0; k < off; k++) { | |||
buf[k] = buf[buflen - off + k]; | |||
} | |||
xof_squeezeblocks(buf + off, 1, &state); | |||
buflen = off + XOF_BLOCKBYTES; | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); | |||
} | |||
xof_ctx_release(&state); | |||
} | |||
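The refill logic above exists so that no 12-bit value ever straddles a squeeze boundary: the buflen % 3 leftover bytes are moved to the front of the buffer before the next XOF block is appended. With the assumed 90s parameters (XOF_BLOCKBYTES = 64, so an initial buflen of 8 * 64 = 512), a hypothetical trace of that bookkeeping:

#include <stdio.h>

int main(void) {
    unsigned buflen = 8 * 64; /* assumed initial buffer: 8 blocks = 512 bytes */
    for (int refill = 1; refill <= 3; refill++) {
        unsigned off = buflen % 3;  /* bytes carried to the front */
        buflen = off + 64;          /* bytes available after the next squeeze */
        printf("refill %d: carry %u bytes, buflen %u\n", refill, off, buflen);
    }
    return 0; /* prints carries of 2, 0 and 1 bytes for the first three refills */
}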
@@ -220,10 +216,10 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY | |||
gen_a(a, publicseed); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); | |||
} | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&skpv); | |||
@@ -231,7 +227,7 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_tomont(&pkpv.vec[i]); | |||
} | |||
@@ -248,16 +244,15 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins used as seed | |||
* (of length KYBER_SYMBYTES) to deterministically | |||
* generate all randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
@@ -266,7 +261,7 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
unsigned int i; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
uint8_t nonce = 0; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
polyvec sp, pkpv, ep, at[KYBER_K], b; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed, pk); | |||
@@ -274,32 +269,32 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
gen_at(at, seed); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); | |||
} | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&epp, coins, nonce++); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&b); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&b, &b, &ep); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &epp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &k); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(&bp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(&b); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&v); | |||
pack_ciphertext(c, &bp, &v); | |||
pack_ciphertext(c, &b, &v); | |||
} | |||
/************************************************* | |||
@@ -308,24 +303,24 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
polyvec b, skpv; | |||
poly v, mp; | |||
unpack_ciphertext(&bp, &v, c); | |||
unpack_ciphertext(&b, &v, c); | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&b); | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_sub(&mp, &v, &mp); | |||
@@ -14,13 +14,14 @@ | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], | |||
unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -39,17 +40,17 @@ int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned ch | |||
* secret for given public key | |||
* | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char pk[KYBER_PUBLICKEYBYTES]) { | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
@@ -79,19 +80,19 @@ int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, | |||
* cipher text and private key | |||
* | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
const unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
int fail; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
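/* Editor's note: conceptual sketch (not this file's code) of the implicit
 * rejection promised above: on a re-encryption mismatch the decapsulation
 * keeps running and derives ss from a secret rejection value instead of
 * reporting an error, with the selection done in constant time.  The names
 * below are illustrative. */
static void implicit_reject_sketch(uint8_t out[KYBER_SYMBYTES],
                                   const uint8_t real[KYBER_SYMBYTES],
                                   const uint8_t reject[KYBER_SYMBYTES],
                                   uint8_t fail) {      /* fail is 0 or 1 */
    unsigned int i;
    uint8_t mask = (uint8_t)(-fail);                    /* 0x00 or 0xFF */
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        out[i] = (uint8_t)((real[i] & (uint8_t)~mask) | (reject[i] & mask));
    }
}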
@@ -3,11 +3,11 @@ | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and PQCLEAN_KYBER102490S_CLEAN_zetas_inv used in the number-theoretic transform: | |||
/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and zetas_inv used in the number-theoretic transform: | |||
#define KYBER_ROOT_OF_UNITY 17 | |||
static const uint16_t tree[128] = { | |||
static const uint8_t tree[128] = { | |||
0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, | |||
4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, | |||
2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, | |||
@@ -19,51 +19,41 @@ static const uint16_t tree[128] = { | |||
}; | |||
void init_ntt() { | |||
unsigned int i, j, k; | |||
unsigned int i; | |||
int16_t tmp[128]; | |||
tmp[0] = MONT; | |||
for(i = 1; i < 128; ++i) | |||
tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); | |||
for(i=1;i<128;i++) | |||
tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); | |||
for(i = 0; i < 128; ++i) | |||
for(i=0;i<128;i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_zetas[i] = tmp[tree[i]]; | |||
k = 0; | |||
for(i = 64; i >= 1; i >>= 1) | |||
for(j = i; j < 2*i; ++j) | |||
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; | |||
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; | |||
if(PQCLEAN_KYBER102490S_CLEAN_zetas[i] > KYBER_Q/2) | |||
PQCLEAN_KYBER102490S_CLEAN_zetas[i] -= KYBER_Q; | |||
if(PQCLEAN_KYBER102490S_CLEAN_zetas[i] < -KYBER_Q/2) | |||
PQCLEAN_KYBER102490S_CLEAN_zetas[i] += KYBER_Q; | |||
} | |||
} | |||
*/ | |||
const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128] = { | |||
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, | |||
2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, | |||
732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, | |||
1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, | |||
430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, | |||
1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, | |||
418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, | |||
1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, | |||
478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 | |||
}; | |||
const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128] = { | |||
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, | |||
1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, | |||
1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, | |||
1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, | |||
3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, | |||
1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, | |||
2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, | |||
829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, | |||
3127, 3042, 1907, 1836, 1517, 359, 758, 1441 | |||
}; | |||
-1044, -758, -359, -1517, 1493, 1422, 287, 202, | |||
-171, 622, 1577, 182, 962, -1202, -1474, 1468, | |||
573, -1325, 264, 383, -829, 1458, -1602, -130, | |||
-681, 1017, 732, 608, -1542, 411, -205, -1571, | |||
1223, 652, -552, 1015, -1293, 1491, -282, -1544, | |||
516, -8, -320, -666, -1618, -1162, 126, 1469, | |||
-853, -90, -271, 830, 107, -1421, -247, -951, | |||
-398, 961, -1508, -725, 448, -1065, 677, -1275, | |||
-1103, 430, 555, 843, -1251, 871, 1550, 105, | |||
422, 587, 177, -235, -291, -460, 1574, 1653, | |||
-246, 778, 1159, -147, -777, 1483, -602, 1119, | |||
-1590, 644, -872, 349, 418, 329, -156, -75, | |||
817, 1097, 603, 610, 1322, -1285, -1465, 384, | |||
-1215, -136, 1218, -1335, -874, 220, -1187, -1659, | |||
-1185, -1530, -1278, 794, -1510, -854, -870, 478, | |||
-108, -308, 996, 991, 958, -1460, 1522, 1628 | |||
}; | |||
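/* Editor's note: the regenerated table stores centered representatives in
 * roughly (-KYBER_Q/2, KYBER_Q/2] instead of values in [0, KYBER_Q).  For
 * example, the old zetas[0] = 2285 (the Montgomery factor 2^16 mod 3329)
 * becomes 2285 - 3329 = -1044, and the old zetas[1] = 2571 becomes
 * 2571 - 3329 = -758, exactly as in the centering step of the new init_ntt()
 * generation code above. */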
/************************************************* | |||
* Name: fqmul | |||
@@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_ntt | |||
* | |||
* Description: Inplace number-theoretic transform (NTT) in Rq | |||
* Description: Inplace number-theoretic transform (NTT) in Rq. | |||
* input is in standard order, output is in bitreversed order | |||
* | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements | |||
* of Zq | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { | |||
unsigned int len, start, j, k; | |||
@@ -96,7 +85,7 @@ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { | |||
for (len = 128; len >= 2; len >>= 1) { | |||
for (start = 0; start < 256; start = j + len) { | |||
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k++]; | |||
for (j = start; j < start + len; ++j) { | |||
for (j = start; j < start + len; j++) { | |||
t = fqmul(zeta, r[j + len]); | |||
r[j + len] = r[j] - t; | |||
r[j] = r[j] + t; | |||
@@ -112,28 +101,28 @@ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { | |||
* multiplication by Montgomery factor 2^16. | |||
* Input is in bitreversed order, output is in standard order | |||
* | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements | |||
* of Zq | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) { | |||
unsigned int start, len, j, k; | |||
int16_t t, zeta; | |||
const int16_t f = 1441; // mont^2/128 | |||
k = 0; | |||
k = 127; | |||
for (len = 2; len <= 128; len <<= 1) { | |||
for (start = 0; start < 256; start = j + len) { | |||
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++]; | |||
for (j = start; j < start + len; ++j) { | |||
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k--]; | |||
for (j = start; j < start + len; j++) { | |||
t = r[j]; | |||
r[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + r[j + len]); | |||
r[j + len] = t - r[j + len]; | |||
r[j + len] = r[j + len] - t; | |||
r[j + len] = fqmul(zeta, r[j + len]); | |||
} | |||
} | |||
} | |||
for (j = 0; j < 256; ++j) { | |||
r[j] = fqmul(r[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]); | |||
for (j = 0; j < 256; j++) { | |||
r[j] = fqmul(r[j], f); | |||
} | |||
} | |||
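/* Editor's note: quick check of the new constant f = 1441 = mont^2/128 mod q:
 * mont = 2^16 mod 3329 = 2285, mont^2 mod 3329 = 1353, and indeed
 * 1441 * 128 = 184448 = 55*3329 + 1353.  So fqmul(r[j], f) divides by 128
 * (the final inverse-NTT scaling) while leaving the result in the Montgomery
 * domain, which is what the _tomont suffix refers to; the old code reached
 * the same effect via zetas_inv[127], which was also 1441. */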
@@ -143,19 +132,15 @@ void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) { | |||
* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) | |||
* used for multiplication of elements in Rq in NTT domain | |||
* | |||
* Arguments: - int16_t r[2]: pointer to the output polynomial | |||
* Arguments: - int16_t r[2]: pointer to the output polynomial | |||
* - const int16_t a[2]: pointer to the first factor | |||
* - const int16_t b[2]: pointer to the second factor | |||
* - int16_t zeta: integer defining the reduction polynomial | |||
* - int16_t zeta: integer defining the reduction polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], | |||
const int16_t a[2], | |||
const int16_t b[2], | |||
int16_t zeta) { | |||
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { | |||
r[0] = fqmul(a[1], b[1]); | |||
r[0] = fqmul(r[0], zeta); | |||
r[0] += fqmul(a[0], b[0]); | |||
r[1] = fqmul(a[0], b[1]); | |||
r[1] += fqmul(a[1], b[0]); | |||
} |
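/* Editor's note: written out, the five fqmul calls above compute the product
 * (a[0] + a[1]X)(b[0] + b[1]X) mod (X^2 - zeta):
 *   r[0] = a[0]*b[0] + zeta*a[1]*b[1]
 *   r[1] = a[0]*b[1] + a[1]*b[0]
 * zeta is stored in Montgomery form, so both terms of r[0] end up carrying the
 * same single 2^-16 factor as the terms of r[1]. */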
@@ -5,15 +5,10 @@ | |||
extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128]; | |||
extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128]; | |||
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]); | |||
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]); | |||
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], | |||
const int16_t a[2], | |||
const int16_t b[2], | |||
int16_t zeta); | |||
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); | |||
#endif |
@@ -7,8 +7,6 @@ | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
#define KYBER_ETA 2 | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
@@ -16,20 +14,20 @@ | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_ETA1 2 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_ETA2 2 | |||
#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ | |||
+ KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) | |||
/* 32 bytes of additional space to save H(pk) */ | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ | |||
+ KYBER_INDCPA_PUBLICKEYBYTES \ | |||
+ 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) | |||
#endif |
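/* Editor's note: sanity check of the Kyber1024-90s sizes these macros produce,
 * assuming KYBER_POLYBYTES = 384 (defined earlier in params.h, outside this hunk):
 *   KYBER_INDCPA_PUBLICKEYBYTES = 4*384 + 32          = 1568
 *   KYBER_CIPHERTEXTBYTES       = 4*352 + 160         = 1568
 *   KYBER_SECRETKEYBYTES        = 4*384 + 1568 + 2*32 = 3168
 * matching the length-public-key / length-ciphertext / length-secret-key
 * entries in META.yml. */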
@@ -13,17 +13,19 @@ | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { | |||
size_t i, j; | |||
int16_t u; | |||
uint8_t t[8]; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
// map to positive standard representatives | |||
u = a->coeffs[8 * i + j]; | |||
u += (u >> 15) & KYBER_Q; | |||
t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
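/* Editor's note: the "u += (u >> 15) & KYBER_Q" idiom above is the branchless
 * replacement for the poly_csubq() call that this change removes: for an
 * int16_t coefficient in (-q, q), the arithmetic shift u >> 15 is all-ones
 * exactly when u is negative, so q is added only to negative values, mapping
 * every coefficient to a positive standard representative in constant time. */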
@@ -41,7 +43,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTE | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
@@ -74,20 +76,21 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_P | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { | |||
size_t i; | |||
uint16_t t0, t1; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 2; i++) { | |||
t0 = a->coeffs[2 * i]; | |||
// map to positive standard representatives | |||
t0 = a->coeffs[2 * i]; | |||
t0 += ((int16_t)t0 >> 15) & KYBER_Q; | |||
t1 = a->coeffs[2 * i + 1]; | |||
r[3 * i + 0] = (uint8_t)(t0 >> 0); | |||
r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); | |||
r[3 * i + 2] = (uint8_t)(t1 >> 4); | |||
t1 += ((int16_t)t1 >> 15) & KYBER_Q; | |||
r[3 * i + 0] = (t0 >> 0); | |||
r[3 * i + 1] = (t0 >> 8) | (t1 << 4); | |||
r[3 * i + 2] = (t1 >> 4); | |||
} | |||
} | |||
@@ -97,7 +100,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a | |||
* Description: De-serialization of a polynomial; | |||
* inverse of PQCLEAN_KYBER102490S_CLEAN_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
@@ -114,7 +117,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_PO | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
@@ -135,41 +138,60 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_IN | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { | |||
size_t i, j; | |||
uint16_t t; | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
msg[i] = 0; | |||
for (j = 0; j < 8; j++) { | |||
t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; | |||
t = a->coeffs[8 * i + j]; | |||
t += ((int16_t)t >> 15) & KYBER_Q; | |||
t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; | |||
msg[i] |= t << j; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA1 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; | |||
prf(buf, sizeof(buf), seed, nonce); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(r, buf); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* with parameter KYBER_ETA2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA * KYBER_N / 4]; | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; | |||
prf(buf, sizeof(buf), seed, nonce); | |||
PQCLEAN_KYBER102490S_CLEAN_cbd(r, buf); | |||
PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(r, buf); | |||
} | |||
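/* Editor's note: for this parameter set (KYBER_K = 4, Kyber1024) both
 * KYBER_ETA1 and KYBER_ETA2 are 2, so the eta1/eta2 split introduced here is
 * functionally a rename; the distinction only changes behaviour for Kyber512,
 * where eta1 = 3 and eta2 = 2. */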
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_ntt | |||
* | |||
@@ -202,7 +224,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r) { | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -210,8 +232,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, | |||
size_t i; | |||
for (i = 0; i < KYBER_N / 4; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], | |||
-PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); | |||
} | |||
} | |||
@@ -246,28 +267,12 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r) { | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) { | |||
size_t i; | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_csubq(r->coeffs[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* Description: Add two polynomials; no modular reduction is performed | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -281,7 +286,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* Description: Subtract two polynomials; no modular reduction is performed | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
@@ -11,16 +11,18 @@ typedef struct { | |||
int16_t coeffs[KYBER_N]; | |||
} poly; | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r); | |||
@@ -28,7 +30,6 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); | |||
@@ -10,19 +10,18 @@ | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { | |||
unsigned int i, j, k; | |||
PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(a); | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
t[k] = a->vec[i].coeffs[8 * j + k]; | |||
t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; | |||
t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; | |||
} | |||
r[ 0] = (uint8_t)(t[0] >> 0); | |||
@@ -51,8 +50,7 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESS | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
unsigned int i, j, k; | |||
uint16_t t[8]; | |||
@@ -82,9 +80,9 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYVECBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { | |||
unsigned int i; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); | |||
@@ -138,18 +136,16 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r) { | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* Description: Multiply elements of a and b in NTT domain, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { | |||
unsigned int i; | |||
poly t; | |||
@@ -166,10 +162,10 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* of each element of a vector of polynomials; | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) { | |||
unsigned int i; | |||
@@ -178,29 +174,12 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) { | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) { | |||
unsigned int i; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_add | |||
* | |||
* Description: Add vectors of polynomials | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
@@ -8,22 +8,18 @@ typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
@@ -6,8 +6,7 @@ | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce | |||
* | |||
* Description: Montgomery reduction; given a 32-bit integer a, computes | |||
* 16-bit integer congruent to a * R^-1 mod q, | |||
* where R=2^16 | |||
* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 | |||
* | |||
* Arguments: - int32_t a: input integer to be reduced; | |||
* has to be in {-q2^15,...,q2^15-1} | |||
@@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) { | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_barrett_reduce | |||
* | |||
* Description: Barrett reduction; given a 16-bit integer a, computes | |||
* 16-bit integer congruent to a mod q in {0,...,q} | |||
* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} | |||
* | |||
* Arguments: - int16_t a: input integer to be reduced | |||
* | |||
* Returns: integer in {0,...,q} congruent to a modulo q. | |||
* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. | |||
**************************************************/ | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a) { | |||
int16_t t; | |||
const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; | |||
t = (int32_t)v * a >> 26; | |||
t = ((int32_t)v * a + (1 << 25)) >> 26; | |||
t *= KYBER_Q; | |||
return a - t; | |||
} | |||
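/* Editor's note: worked example of the rounding change above, with
 * v = ((1<<26) + q/2)/q = 20159 and q = 3329.  For a = -1 the old floor
 * version gave t = -1, so it returned -1 + 3329 = 3328, a value in
 * {0,...,q-1}; the new rounded version gives t = (-20159 + 2^25) >> 26 = 0
 * and returns -1, the centered representative in {-(q-1)/2,...,(q-1)/2}. */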
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_csubq | |||
* | |||
* Description: Conditionally subtract q | |||
* | |||
* Arguments: - int16_t x: input integer | |||
* | |||
* Returns: a - q if a >= q, else a | |||
**************************************************/ | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a) { | |||
a -= KYBER_Q; | |||
a += (a >> 15) & KYBER_Q; | |||
return a; | |||
} |
@@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a); | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a); | |||
int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a); | |||
#endif |
@@ -1,100 +1,18 @@ | |||
#include "aes.h" | |||
#include "aes256ctr.h" | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
static inline void br_enc32be(unsigned char *dst, uint32_t x) { | |||
dst[3] = (unsigned char)x; | |||
dst[2] = (unsigned char)(x >> 8); | |||
dst[1] = (unsigned char)(x >> 16); | |||
dst[0] = (unsigned char)(x >> 24); | |||
void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y) { | |||
uint8_t expnonce[12] = {0}; | |||
expnonce[0] = x; | |||
expnonce[1] = y; | |||
PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(state, seed, expnonce); | |||
} | |||
static void aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { | |||
uint8_t ivw[16]; | |||
uint8_t buf[AES_BLOCKBYTES]; | |||
size_t i = 0; | |||
memcpy(ivw, iv, AESCTR_NONCEBYTES); | |||
br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); | |||
while (outlen > AES_BLOCKBYTES) { | |||
aes256_ecb(out, ivw, 1, ctx); | |||
br_enc32be(ivw + AESCTR_NONCEBYTES, ++ctr); | |||
out += AES_BLOCKBYTES; | |||
outlen -= AES_BLOCKBYTES; | |||
} | |||
if (outlen > 0) { | |||
aes256_ecb(buf, ivw, 1, ctx); | |||
for (i = 0; i < outlen; i++) { | |||
out[i] = buf[i]; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_aes256_prf | |||
* | |||
* Description: AES256 stream generation in CTR mode using 32-bit counter, | |||
* nonce is zero-padded to 12 bytes, counter starts at zero | |||
* | |||
* Arguments: - uint8_t *output: pointer to output | |||
* - size_t outlen: length of requested output in bytes | |||
* - const uint8_t *key: pointer to 32-byte key | |||
* - uint8_t nonce: 1-byte nonce (will be zero-padded to 12 bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { | |||
uint8_t iv[12]; | |||
for (int i = 1; i < 12; i++) { | |||
iv[i] = 0; | |||
} | |||
iv[0] = nonce; | |||
aes256ctx ctx; | |||
aes256_ctr_keyexp(&ctx, key); | |||
aes256_ctr(output, outlen, iv, &ctx); | |||
aes256_ctx_release(&ctx); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb | |||
* | |||
* Description: AES256 CTR used as a replacement for a XOF; this function | |||
* "absorbs" a 32-byte key and two additional bytes that are zero-padded | |||
* to a 12-byte nonce | |||
* | |||
* Arguments: - aes256xof_ctx *s: pointer to state to "absorb" key and IV into | |||
* - const uint8_t *key: pointer to 32-byte key | |||
* - uint8_t x: first additional byte to "absorb" | |||
* - uint8_t y: second additional byte to "absorb" | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y) { | |||
aes256_ecb_keyexp(&s->sk_exp, key); | |||
for (int i = 2; i < 12; i++) { | |||
s->iv[i] = 0; | |||
} | |||
s->iv[0] = x; | |||
s->iv[1] = y; | |||
s->ctr = 0; | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks | |||
* | |||
* Description: AES256 CTR used as a replacement for a XOF; this function | |||
* generates nblocks 64-byte blocks of AES256-CTR output | |||
* | |||
* Arguments: - uint8_t *out: pointer to output | |||
* - size_t nblocks: number of requested 64-byte output blocks | |||
* - aes256xof_ctx *s: AES "state", i.e. expanded key and IV | |||
**************************************************/ | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s) { | |||
aes256_ctr_xof(out, nblocks * 64, s->iv, s->ctr, &s->sk_exp); | |||
s->ctr += (uint32_t) (4 * nblocks); | |||
} | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { | |||
aes256_ctx_release(&s->sk_exp); | |||
void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce) { | |||
uint8_t expnonce[12] = {0}; | |||
expnonce[0] = nonce; | |||
PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(out, outlen, key, expnonce); | |||
} |
@@ -1,19 +0,0 @@ | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_AES_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_AES_H | |||
#include "aes.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
typedef struct { | |||
aes256ctx sk_exp; | |||
uint8_t iv[12]; | |||
uint32_t ctr; | |||
} aes256xof_ctx; | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y); | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s); | |||
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s); | |||
#endif |
@@ -1,23 +1,28 @@ | |||
#ifndef PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_H | |||
#define PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_H | |||
#include "aes256ctr.h" | |||
#include "params.h" | |||
#include "sha2.h" | |||
#include "symmetric-aes.h" | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
typedef aes256xof_ctx xof_state; | |||
#define XOF_BLOCKBYTES 64 | |||
typedef aes256ctr_ctx xof_state; | |||
void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y); | |||
void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce); | |||
#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES | |||
#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(STATE, SEED, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) | |||
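/* Editor's note: usage sketch (hypothetical helper, not part of the patch)
 * showing how the macros above are driven in the 90s variant: matrix expansion
 * keys AES-256 with the public seed, places the two matrix indices in the
 * first two nonce bytes, and squeezes 64-byte CTR blocks where the SHA-3
 * variant would squeeze SHAKE output. */
static void xof_demo(uint8_t out[2 * XOF_BLOCKBYTES],
                     const uint8_t seed[KYBER_SYMBYTES],
                     uint8_t x, uint8_t y) {
    xof_state state;
    xof_absorb(&state, seed, x, y);      /* expnonce = {x, y, 0, ..., 0} */
    xof_squeezeblocks(out, 2, &state);   /* 2 * 64 bytes of AES256-CTR keystream */
    xof_ctx_release(&state);             /* expands to nothing for the AES backend */
}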
@@ -6,7 +6,7 @@ length-public-key: 1568 | |||
length-ciphertext: 1568 | |||
length-secret-key: 3168 | |||
length-shared-secret: 32 | |||
nistkat-sha256: b4b4fc1c2cbbb182252d2822ccb8cb704bcfe876122635c5dfa48ddc09b6e73f | |||
nistkat-sha256: 5afcf2a568ad32d49b55105b032af1850f03f3888ff9e2a72f4059c58e968f60 | |||
principal-submitters: | |||
- Peter Schwabe | |||
auxiliary-submitters: | |||
@@ -21,9 +21,9 @@ auxiliary-submitters: | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber | |||
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber | |||
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
@@ -2,22 +2,18 @@ | |||
#define PQCLEAN_KYBER1024_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define ALIGN16_TYPE(t) \ | |||
union { \ | |||
__m128i vec; \ | |||
t orig; \ | |||
#define ALIGNED_UINT8(N) \ | |||
union { \ | |||
uint8_t coeffs[(N)]; \ | |||
__m256i vec[((N)+31)/32]; \ | |||
} | |||
#define ALIGN32_ARRAY(t, s) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(s)]; \ | |||
#define ALIGNED_INT16(N) \ | |||
union { \ | |||
int16_t coeffs[(N)]; \ | |||
__m256i vec[((N)+15)/16]; \ | |||
} | |||
#define ALIGN32_ARRAY_2D(t, n, m) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(n)][(m)]; \ | |||
} | |||
#endif |
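/* Editor's note: the new ALIGNED_UINT8/ALIGNED_INT16 unions replace the old
 * per-object ALIGN16/ALIGN32 wrappers; because one member of each union is an
 * array of __m256i, the whole union inherits 32-byte alignment, so the same
 * storage can be viewed as bytes or int16_t and still be passed to aligned
 * AVX2 loads and stores such as _mm256_load_si256. */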
@@ -1,248 +1,107 @@ | |||
#include "cdecl.h" | |||
#include "params.h" | |||
.macro schoolbook off,sign | |||
#load | |||
vmovdqa \off+32(%rsi),%ymm7 # b | |||
vmovdqa \off+32(%rdx),%ymm8 # d | |||
vmovdqa \off(%rsi),%ymm9 # a | |||
vmovdqa \off(%rdx),%ymm10 # c | |||
#mul | |||
vpmullw %ymm7,%ymm8,%ymm11 # bd.lo | |||
vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi | |||
vpmullw %ymm7,%ymm10,%ymm13 # bc.lo | |||
vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi | |||
vpmullw %ymm9,%ymm8,%ymm14 # ad.lo | |||
vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi | |||
vpmullw %ymm9,%ymm10,%ymm15 # ac.lo | |||
vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi | |||
#reduce | |||
vpmullw %ymm1,%ymm11,%ymm11 | |||
vpmulhw %ymm0,%ymm11,%ymm11 | |||
vpsubw %ymm11,%ymm12,%ymm11 # bd | |||
#mul | |||
vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo | |||
vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi | |||
#unpack | |||
vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 | |||
vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 | |||
vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 | |||
vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 | |||
vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 | |||
vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 | |||
vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 | |||
vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 | |||
#add | |||
.ifeq \sign | |||
vpaddd %ymm14,%ymm15,%ymm14 # x0 | |||
vpaddd %ymm9,%ymm10,%ymm9 # x1 | |||
.else | |||
vpsubd %ymm15,%ymm14,%ymm14 # x0 | |||
vpsubd %ymm10,%ymm9,%ymm9 # x1 | |||
.endif | |||
vpaddd %ymm12,%ymm13,%ymm12 # y0 | |||
vpaddd %ymm7,%ymm8,%ymm7 # y1 | |||
.endm | |||
.macro red a0,a1,b0,b1,x,y,z | |||
#pack | |||
vpxor %ymm\x,%ymm\x,%ymm\x | |||
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y | |||
vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z | |||
vpsrld $16,%ymm\a0,%ymm\a0 | |||
vpsrld $16,%ymm\a1,%ymm\a1 | |||
vpackusdw %ymm\z,%ymm\y,%ymm\z | |||
vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 | |||
vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y | |||
vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x | |||
vpsrld $16,%ymm\b0,%ymm\b0 | |||
vpsrld $16,%ymm\b1,%ymm\b1 | |||
vpackusdw %ymm\x,%ymm\y,%ymm\y | |||
vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 | |||
#reduce | |||
vpmullw %ymm1,%ymm\z,%ymm\z | |||
vpmullw %ymm1,%ymm\y,%ymm\y | |||
vpmulhw %ymm0,%ymm\z,%ymm\z | |||
vpmulhw %ymm0,%ymm\y,%ymm\y | |||
vpsubw %ymm\z,%ymm\a0,%ymm\a0 | |||
vpsubw %ymm\y,%ymm\b0,%ymm\b0 | |||
.macro schoolbook off | |||
vmovdqa _16XQINV*2(%rcx),%ymm0 | |||
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 | |||
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 | |||
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 | |||
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 | |||
vpmullw %ymm0,%ymm1,%ymm9 # a0.lo | |||
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo | |||
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo | |||
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo | |||
vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 | |||
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 | |||
vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi | |||
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi | |||
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi | |||
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi | |||
vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 | |||
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 | |||
vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi | |||
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi | |||
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi | |||
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi | |||
vmovdqa %ymm13,(%rsp) | |||
vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo | |||
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo | |||
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo | |||
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo | |||
vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo | |||
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo | |||
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo | |||
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo | |||
vmovdqa _16XQ*2(%rcx),%ymm8 | |||
vpmulhw %ymm8,%ymm13,%ymm13 | |||
vpmulhw %ymm8,%ymm9,%ymm9 | |||
vpmulhw %ymm8,%ymm5,%ymm5 | |||
vpmulhw %ymm8,%ymm10,%ymm10 | |||
vpmulhw %ymm8,%ymm6,%ymm6 | |||
vpmulhw %ymm8,%ymm11,%ymm11 | |||
vpmulhw %ymm8,%ymm7,%ymm7 | |||
vpmulhw %ymm8,%ymm12,%ymm12 | |||
vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 | |||
vpsubw %ymm9,%ymm1,%ymm9 # a0d0 | |||
vpsubw %ymm5,%ymm14,%ymm5 # b0c0 | |||
vpsubw %ymm10,%ymm2,%ymm10 # b0d0 | |||
vpsubw %ymm6,%ymm15,%ymm6 # a1c1 | |||
vpsubw %ymm11,%ymm3,%ymm11 # a1d1 | |||
vpsubw %ymm7,%ymm0,%ymm7 # b1c1 | |||
vpsubw %ymm12,%ymm4,%ymm12 # b1d1 | |||
vmovdqa (%r9),%ymm0 | |||
vmovdqa 32(%r9),%ymm1 | |||
vpmullw %ymm0,%ymm10,%ymm2 | |||
vpmullw %ymm0,%ymm12,%ymm3 | |||
vpmulhw %ymm1,%ymm10,%ymm10 | |||
vpmulhw %ymm1,%ymm12,%ymm12 | |||
vpmulhw %ymm8,%ymm2,%ymm2 | |||
vpmulhw %ymm8,%ymm3,%ymm3 | |||
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 | |||
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 | |||
vpaddw %ymm5,%ymm9,%ymm9 | |||
vpaddw %ymm7,%ymm11,%ymm11 | |||
vpsubw %ymm13,%ymm10,%ymm13 | |||
vpsubw %ymm12,%ymm6,%ymm6 | |||
vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+16)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+32)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+48)*2(%rdi) | |||
.endm | |||
.text | |||
basemul64_acc_avx: | |||
poly0.0: | |||
schoolbook 0,0 | |||
#mov | |||
vmovdqa %ymm14,%ymm3 | |||
vmovdqa %ymm9,%ymm4 | |||
vmovdqa %ymm12,%ymm5 | |||
vmovdqa %ymm7,%ymm6 | |||
poly1.0: | |||
schoolbook 512,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly2.0: | |||
schoolbook 1024,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly3.0: | |||
schoolbook 1536,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
poly0.1: | |||
schoolbook 64,1 | |||
#mov | |||
vmovdqa %ymm14,%ymm3 | |||
vmovdqa %ymm9,%ymm4 | |||
vmovdqa %ymm12,%ymm5 | |||
vmovdqa %ymm7,%ymm6 | |||
poly1.1: | |||
schoolbook 576,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly2.1: | |||
schoolbook 1088,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
poly3.1: | |||
schoolbook 1600,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,64(%rdi) | |||
vmovdqa %ymm5,96(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx) | |||
.global _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx): | |||
_cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
ret | |||
basemul64_avx: | |||
schoolbook 0,0 | |||
#reduce | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,(%rdi) | |||
vmovdqa %ymm12,32(%rdi) | |||
schoolbook 64,1 | |||
#reduce | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,64(%rdi) | |||
vmovdqa %ymm12,96(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx) | |||
.global _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx): | |||
_cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
mov %rsp,%r8 | |||
and $-32,%rsp | |||
sub $32,%rsp | |||
lea (_ZETAS_EXP+176)*2(%rcx),%r9 | |||
schoolbook 0 | |||
add $32*2,%r9 | |||
schoolbook 1 | |||
add $192*2,%r9 | |||
schoolbook 2 | |||
add $32*2,%r9 | |||
schoolbook 3 | |||
mov %r8,%rsp | |||
ret |
@@ -4,66 +4,64 @@ | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_cbd | |||
* Name: cbd2 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* a centered binomial distribution with parameter eta=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const unsigned char *buf: pointer to input byte array | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const __m256i *buf: pointer to aligned input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { | |||
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { | |||
unsigned int i; | |||
__m256i vec0, vec1, vec2, vec3, tmp; | |||
__m256i f0, f1, f2, f3; | |||
const __m256i mask55 = _mm256_set1_epi32(0x55555555); | |||
const __m256i mask33 = _mm256_set1_epi32(0x33333333); | |||
const __m256i mask03 = _mm256_set1_epi32(0x03030303); | |||
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); | |||
for (i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); | |||
f0 = _mm256_load_si256(&buf[i]); | |||
vec1 = _mm256_srli_epi32(vec0, 1); | |||
vec0 = _mm256_and_si256(mask55, vec0); | |||
vec1 = _mm256_and_si256(mask55, vec1); | |||
vec0 = _mm256_add_epi32(vec0, vec1); | |||
f1 = _mm256_srli_epi16(f0, 1); | |||
f0 = _mm256_and_si256(mask55, f0); | |||
f1 = _mm256_and_si256(mask55, f1); | |||
f0 = _mm256_add_epi8(f0, f1); | |||
vec1 = _mm256_srli_epi32(vec0, 2); | |||
vec0 = _mm256_and_si256(mask33, vec0); | |||
vec1 = _mm256_and_si256(mask33, vec1); | |||
f1 = _mm256_srli_epi16(f0, 2); | |||
f0 = _mm256_and_si256(mask33, f0); | |||
f1 = _mm256_and_si256(mask33, f1); | |||
f0 = _mm256_add_epi8(f0, mask33); | |||
f0 = _mm256_sub_epi8(f0, f1); | |||
vec2 = _mm256_srli_epi32(vec0, 4); | |||
vec3 = _mm256_srli_epi32(vec1, 4); | |||
vec0 = _mm256_and_si256(mask03, vec0); | |||
vec1 = _mm256_and_si256(mask03, vec1); | |||
vec2 = _mm256_and_si256(mask03, vec2); | |||
vec3 = _mm256_and_si256(mask03, vec3); | |||
f1 = _mm256_srli_epi16(f0, 4); | |||
f0 = _mm256_and_si256(mask0F, f0); | |||
f1 = _mm256_and_si256(mask0F, f1); | |||
f0 = _mm256_sub_epi8(f0, mask03); | |||
f1 = _mm256_sub_epi8(f1, mask03); | |||
vec1 = _mm256_sub_epi8(vec0, vec1); | |||
vec3 = _mm256_sub_epi8(vec2, vec3); | |||
f2 = _mm256_unpacklo_epi8(f0, f1); | |||
f3 = _mm256_unpackhi_epi8(f0, f1); | |||
vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); | |||
vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); | |||
vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); | |||
vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); | |||
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); | |||
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); | |||
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); | |||
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); | |||
tmp = _mm256_unpacklo_epi16(vec0, vec2); | |||
vec2 = _mm256_unpackhi_epi16(vec0, vec2); | |||
vec0 = tmp; | |||
tmp = _mm256_unpacklo_epi16(vec1, vec3); | |||
vec3 = _mm256_unpackhi_epi16(vec1, vec3); | |||
vec1 = tmp; | |||
_mm256_store_si256(&r->vec[4 * i + 0], f0); | |||
_mm256_store_si256(&r->vec[4 * i + 1], f2); | |||
_mm256_store_si256(&r->vec[4 * i + 2], f1); | |||
_mm256_store_si256(&r->vec[4 * i + 3], f3); | |||
} | |||
} | |||
tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); | |||
vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); | |||
vec0 = tmp; | |||
tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); | |||
vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); | |||
vec1 = tmp; | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); | |||
} | |||
/* buf 32 bytes longer for cbd3 */ | |||
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { | |||
cbd2(r, buf); | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { | |||
cbd2(r, buf); | |||
} |
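
For orientation, the vectorized cbd2 above produces the same distribution as the following scalar sketch (illustrative only, not part of the patch; the function name is ours): every four consecutive input bits a0,a1,b0,b1 yield one coefficient (a0+a1)-(b0+b1) in {-2,...,2}.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of centered-binomial sampling with eta = 2 (illustrative). */
static void cbd2_ref(int16_t r[256], const uint8_t buf[128]) {
    for (size_t i = 0; i < 256 / 8; i++) {
        uint32_t t = (uint32_t)buf[4 * i + 0] | ((uint32_t)buf[4 * i + 1] << 8) |
                     ((uint32_t)buf[4 * i + 2] << 16) | ((uint32_t)buf[4 * i + 3] << 24);
        uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555); /* pairwise bit sums */
        for (unsigned j = 0; j < 8; j++) {
            int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
            int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
            r[8 * i + j] = (int16_t)(a - b);
        }
    }
}

The AVX2 routine performs the same bit counting with byte-wise mask arithmetic, then sign-extends to 16 bits and permutes the lanes into the coefficient order the rest of the code expects.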
@@ -2,8 +2,11 @@ | |||
#define PQCLEAN_KYBER1024_AVX2_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); | |||
#endif |
@@ -1,6 +1,8 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_CDECL_H | |||
#define PQCLEAN_KYBER1024_AVX2_CDECL_H | |||
#define _16XQ 0 | |||
#define _16XQINV 16 | |||
#define _16XV 32 | |||
@@ -9,9 +11,10 @@ | |||
#define _16XMONTSQLO 80 | |||
#define _16XMONTSQHI 96 | |||
#define _16XMASK 112 | |||
#define _ZETAS_EXP 128 | |||
#define _ZETAS_INV_EXP 528 | |||
#define _REVIDXB 128 | |||
#define _REVIDXD 144 | |||
#define _ZETAS_EXP 160 | |||
#define _16XSHIFT 624 | |||
/* The C ABI on MacOS exports all symbols with a leading | |||
* underscore. This means that any symbols we refer to from | |||
@@ -23,4 +26,5 @@ | |||
#define _cdecl(s) _##s | |||
#define cdecl(s) s | |||
#endif |
@@ -1,155 +1,123 @@ | |||
#include "align.h" | |||
#include "consts.h" | |||
#include "params.h" | |||
#include <stdint.h> | |||
#define Q KYBER_Q | |||
#define MONT ((1U << 16) % Q) | |||
#define QINV 62209 // q^-1 mod 2^16 | |||
#define V (((1U << 26) + Q/2)/Q) | |||
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) | |||
#define FLO (FHI*QINV % 65536) | |||
#define MONTSQHI (MONT*MONT % Q) | |||
#define MONTSQLO (MONTSQHI*QINV % 65536) | |||
#define MONT (-1044) // 2^16 mod q | |||
#define QINV (-3327) // q^-1 mod 2^16 | |||
#define V 20159 // floor(2^26/q + 0.5) | |||
#define FHI 1441 // mont^2/128 | |||
#define FLO (-10079) // qinv*FHI | |||
#define MONTSQHI 1353 // mont^2 | |||
#define MONTSQLO 20553 // qinv*MONTSQHI | |||
#define MASK 4095 | |||
#define SHIFT 32 | |||
const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.as_arr = { | |||
#define _16XQ 0 | |||
const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.coeffs = { | |||
//#define _16XQ 0 | |||
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, | |||
#define _16XQINV 16 | |||
//#define _16XQINV 16 | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, | |||
#define _16XV 32 | |||
//#define _16XV 32 | |||
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, | |||
#define _16XFLO 48 | |||
//#define _16XFLO 48 | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, | |||
#define _16XFHI 64 | |||
//#define _16XFHI 64 | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, | |||
#define _16XMONTSQLO 80 | |||
//#define _16XMONTSQLO 80 | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, | |||
#define _16XMONTSQHI 96 | |||
//#define _16XMONTSQHI 96 | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, | |||
#define _16XMASK 112 | |||
//#define _16XMASK 112 | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, | |||
#define _ZETAS_EXP 128 | |||
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, | |||
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, | |||
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, | |||
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, | |||
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, | |||
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, | |||
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, | |||
3158, 3158, 3158, 3158, 622, 622, 622, 622, | |||
1577, 1577, 1577, 1577, 182, 182, 182, 182, | |||
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, | |||
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, | |||
573, 573, 2004, 2004, 264, 264, 383, 383, | |||
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, | |||
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, | |||
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, | |||
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, | |||
516, 3321, 3009, 2663, 1711, 2167, 126, 1469, | |||
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, | |||
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, | |||
2226, 555, 2078, 1550, 422, 177, 3038, 1574, | |||
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, | |||
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, | |||
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, | |||
430, 843, 871, 105, 587, 3094, 2869, 1653, | |||
778, 3182, 1483, 1119, 644, 349, 329, 3254, | |||
788, 788, 1812, 1812, 28191, 28191, 28191, 28191, | |||
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, | |||
48842, 48842, 48842, 48842, 287, 287, 287, 287, | |||
287, 287, 287, 287, 202, 202, 202, 202, | |||
202, 202, 202, 202, 10690, 10690, 10690, 10690, | |||
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, | |||
31164, 31164, 31164, 31164, 962, 962, 962, 962, | |||
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, | |||
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, | |||
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, | |||
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, | |||
732, 732, 608, 608, 1787, 1787, 411, 411, | |||
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, | |||
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, | |||
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, | |||
448, 2264, 677, 2054, 34353, 25435, 58154, 24392, | |||
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, | |||
31637, 28644, 23998, 48114, 817, 603, 1322, 1864, | |||
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, | |||
3221, 996, 958, 1522, 20297, 2146, 15356, 33152, | |||
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, | |||
41677, 45279, 7757, 23132, 1097, 610, 2044, 384, | |||
3193, 1994, 220, 1670, 1799, 794, 2475, 478, | |||
3021, 991, 1869, 1628, 0, 0, 0, 0, | |||
//#define _REVIDXB 128 | |||
3854, 3340, 2826, 2312, 1798, 1284, 770, 256, | |||
3854, 3340, 2826, 2312, 1798, 1284, 770, 256, | |||
//#define _REVIDXD 144 | |||
7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, | |||
//#define _ZETAS_EXP 160 | |||
31498, 31498, 31498, 31498, -758, -758, -758, -758, | |||
5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, | |||
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, | |||
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, | |||
-359, -359, -359, -359, -359, -359, -359, -359, | |||
-359, -359, -359, -359, -359, -359, -359, -359, | |||
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, | |||
-12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, | |||
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, | |||
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, | |||
-20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, | |||
-3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, | |||
-171, -171, -171, -171, 622, 622, 622, 622, | |||
1577, 1577, 1577, 1577, 182, 182, 182, 182, | |||
-5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, | |||
5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, | |||
573, 573, -1325, -1325, 264, 264, 383, 383, | |||
-829, -829, 1458, 1458, -1602, -1602, -130, -130, | |||
-5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, | |||
-12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, | |||
1223, 652, -552, 1015, -1293, 1491, -282, -1544, | |||
516, -8, -320, -666, -1618, -1162, 126, 1469, | |||
-335, -11477, -32227, 20494, -27738, 945, -14883, 6182, | |||
32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, | |||
-1103, 555, -1251, 1550, 422, 177, -291, 1574, | |||
-246, 1159, -777, -602, -1590, -872, 418, -156, | |||
11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, | |||
-32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, | |||
430, 843, 871, 105, 587, -235, -460, 1653, | |||
778, -147, 1483, 1119, 644, 349, 329, -75, | |||
787, 787, 787, 787, 787, 787, 787, 787, | |||
787, 787, 787, 787, 787, 787, 787, 787, | |||
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, | |||
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, | |||
28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, | |||
-16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, | |||
287, 287, 287, 287, 287, 287, 287, 287, | |||
202, 202, 202, 202, 202, 202, 202, 202, | |||
10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, | |||
-11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, | |||
962, 962, 962, 962, -1202, -1202, -1202, -1202, | |||
-1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, | |||
-28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, | |||
18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, | |||
-681, -681, 1017, 1017, 732, 732, 608, 608, | |||
-1542, -1542, 411, 411, -205, -205, -1571, -1571, | |||
19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, | |||
13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, | |||
-853, -90, -271, 830, 107, -1421, -247, -951, | |||
-398, 961, -1508, -725, 448, -1065, 677, -1275, | |||
-31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, | |||
10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, | |||
817, 603, 1322, -1465, -1215, 1218, -874, -1187, | |||
-1185, -1278, -1510, -870, -108, 996, 958, 1522, | |||
20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, | |||
-21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, | |||
1097, 610, -1285, 384, -136, -1335, 220, -1659, | |||
-1530, 794, -854, 478, -308, 991, -1460, 1628, | |||
#define _ZETAS_INV_EXP 528 | |||
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, | |||
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, | |||
1701, 1460, 2338, 308, 2851, 854, 2535, 1530, | |||
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, | |||
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, | |||
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, | |||
1807, 2371, 2333, 108, 870, 1510, 1278, 1185, | |||
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, | |||
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, | |||
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, | |||
1275, 2652, 1065, 2881, 725, 1508, 2368, 398, | |||
951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, | |||
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, | |||
1571, 1571, 205, 205, 2918, 2918, 1542, 1542, | |||
2721, 2721, 2597, 2597, 2312, 2312, 681, 681, | |||
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, | |||
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, | |||
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, | |||
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, | |||
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, | |||
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, | |||
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, | |||
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, | |||
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, | |||
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, | |||
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, | |||
2210, 1846, 147, 2551, 1676, 460, 235, 2742, | |||
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, | |||
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, | |||
45043, 32227, 11478, 335, 156, 2911, 872, 1590, | |||
602, 777, 2170, 246, 1755, 291, 3152, 2907, | |||
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, | |||
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, | |||
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, | |||
666, 320, 8, 2813, 1544, 282, 1838, 1293, | |||
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, | |||
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, | |||
48173, 48173, 5828, 5828, 130, 130, 1602, 1602, | |||
1871, 1871, 829, 829, 2946, 2946, 3065, 3065, | |||
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, | |||
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, | |||
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, | |||
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, | |||
171, 171, 171, 171, 12403, 12403, 12403, 12403, | |||
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, | |||
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, | |||
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, | |||
1836, 1836, 1836, 1836, 50791, 50791, 359, 359, | |||
60300, 60300, 1932, 1932, 0, 0, 0, 0 | |||
//#define _16XSHIFT 624 | |||
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, | |||
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT | |||
} | |||
}; |
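
The signed 16-bit constants in the new qdata table can be re-derived independently. A small standalone check (the variable names and printf output are ours, not part of the package) for MONT, QINV and V:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int32_t q = 3329;                  /* KYBER_Q */

    /* MONT = 2^16 mod q as a centered signed representative */
    int32_t mont = (1 << 16) % q;            /* 2285 */
    if (mont > (q - 1) / 2) {
        mont -= q;                           /* -1044 */
    }

    /* QINV = q^-1 mod 2^16 via Newton iteration, then sign-extended */
    uint32_t x = (uint32_t)q;                /* q = 1 mod 256, so correct to 8 bits */
    for (int i = 0; i < 4; i++) {
        x *= 2 - (uint32_t)q * x;            /* each step doubles the number of correct bits */
    }
    int32_t qinv = (int16_t)(x & 0xFFFF);    /* 62209 -> -3327 */

    /* V = floor(2^26/q + 0.5), the Barrett multiplier */
    int32_t v = ((1 << 26) + q / 2) / q;     /* 20159 */

    printf("MONT=%d QINV=%d V=%d\n", (int)mont, (int)qinv, (int)v);
    return 0;
}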
@@ -1,19 +1,10 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_CONSTS_H | |||
#define PQCLEAN_KYBER1024_AVX2_CONSTS_H | |||
#include "align.h" | |||
#include "cdecl.h" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define ALIGNED_UINT16_T(N) \ | |||
union { \ | |||
__m256i as_vec; \ | |||
uint16_t as_arr[(N)]; \ | |||
} | |||
typedef ALIGNED_UINT16_T(928) qdata_t; | |||
typedef ALIGNED_INT16(640) qdata_t; | |||
extern const qdata_t PQCLEAN_KYBER1024_AVX2_qdata; | |||
#endif |
@@ -9,22 +9,14 @@ | |||
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds | |||
extern void KeccakF1600_StatePermute4x(__m256i *s); | |||
static inline void store64(uint8_t x[8], uint64_t u) { | |||
unsigned int i; | |||
for (i = 0; i < 8; i++) { | |||
x[i] = u >> 8 * i; | |||
} | |||
} | |||
static void keccakx4_absorb(__m256i s[25], | |||
unsigned int r, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen, | |||
uint8_t p) { | |||
static void keccakx4_absorb_once(__m256i s[25], | |||
unsigned int r, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen, | |||
uint8_t p) { | |||
size_t i, pos = 0; | |||
__m256i t, idx; | |||
@@ -39,20 +31,17 @@ static void keccakx4_absorb(__m256i s[25], | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
pos += 8; | |||
} | |||
inlen -= r; | |||
KeccakF1600_StatePermute4x(s); | |||
inlen -= r; | |||
} | |||
i = 0; | |||
while (inlen >= 8) { | |||
for (i = 0; i < inlen / 8; ++i) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
s[i] = _mm256_xor_si256(s[i], t); | |||
i++; | |||
pos += 8; | |||
inlen -= 8; | |||
} | |||
inlen -= 8 * i; | |||
if (inlen) { | |||
t = _mm256_i64gather_epi64((long long *)pos, idx, 1); | |||
@@ -75,37 +64,34 @@ static void keccakx4_squeezeblocks(uint8_t *out0, | |||
unsigned int r, | |||
__m256i s[25]) { | |||
unsigned int i; | |||
uint64_t f0, f1, f2, f3; | |||
__m128d t; | |||
while (nblocks > 0) { | |||
KeccakF1600_StatePermute4x(s); | |||
for (i = 0; i < r / 8; ++i) { | |||
f0 = _mm256_extract_epi64(s[i], 0); | |||
f1 = _mm256_extract_epi64(s[i], 1); | |||
f2 = _mm256_extract_epi64(s[i], 2); | |||
f3 = _mm256_extract_epi64(s[i], 3); | |||
store64(out0, f0); | |||
store64(out1, f1); | |||
store64(out2, f2); | |||
store64(out3, f3); | |||
out0 += 8; | |||
out1 += 8; | |||
out2 += 8; | |||
out3 += 8; | |||
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); | |||
_mm_storel_pd((double *)&out0[8 * i], t); | |||
_mm_storeh_pd((double *)&out1[8 * i], t); | |||
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); | |||
_mm_storel_pd((double *)&out2[8 * i], t); | |||
_mm_storeh_pd((double *)&out3[8 * i], t); | |||
} | |||
out0 += r; | |||
out1 += r; | |||
out2 += r; | |||
out3 += r; | |||
--nblocks; | |||
} | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
@@ -114,17 +100,16 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, | |||
state->s); | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
const uint8_t *in3, | |||
size_t inlen) { | |||
keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
@@ -133,8 +118,7 @@ void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, | |||
uint8_t *out3, | |||
size_t nblocks, | |||
keccakx4_state *state) { | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, | |||
state->s); | |||
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, | |||
@@ -152,7 +136,7 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, | |||
uint8_t t[4][SHAKE128_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE128_RATE; | |||
@@ -187,7 +171,7 @@ void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, | |||
uint8_t t[4][SHAKE256_RATE]; | |||
keccakx4_state state; | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); | |||
out0 += nblocks * SHAKE256_RATE; | |||
@@ -9,7 +9,7 @@ typedef struct { | |||
__m256i s[25]; | |||
} keccakx4_state; | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, | |||
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
@@ -23,7 +23,7 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, | |||
size_t nblocks, | |||
keccakx4_state *state); | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, | |||
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(keccakx4_state *state, | |||
const uint8_t *in0, | |||
const uint8_t *in1, | |||
const uint8_t *in2, | |||
@@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 | |||
vmovdqa 192(%rdi),%ymm8 | |||
vmovdqa 224(%rdi),%ymm9 | |||
red16 2,10 | |||
red16 3,11 | |||
red16 4,12 | |||
red16 5,13 | |||
red16 6,14 | |||
red16 7,15 | |||
red16 8,10 | |||
red16 9,11 | |||
red16 2 | |||
red16 3 | |||
red16 4 | |||
red16 5 | |||
red16 6 | |||
red16 7 | |||
red16 8 | |||
red16 9 | |||
#store | |||
vmovdqa %ymm2,(%rdi) | |||
@@ -46,49 +46,6 @@ add $256,%rdi | |||
call reduce128_avx | |||
ret | |||
csubq128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm1 | |||
vmovdqa 32(%rdi),%ymm2 | |||
vmovdqa 64(%rdi),%ymm3 | |||
vmovdqa 96(%rdi),%ymm4 | |||
vmovdqa 128(%rdi),%ymm5 | |||
vmovdqa 160(%rdi),%ymm6 | |||
vmovdqa 192(%rdi),%ymm7 | |||
vmovdqa 224(%rdi),%ymm8 | |||
csubq 1,9 | |||
csubq 2,10 | |||
csubq 3,11 | |||
csubq 4,12 | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,9 | |||
#store | |||
vmovdqa %ymm1,(%rdi) | |||
vmovdqa %ymm2,32(%rdi) | |||
vmovdqa %ymm3,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm5,128(%rdi) | |||
vmovdqa %ymm6,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm8,224(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx) | |||
.global _cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx): | |||
_cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
call csubq128_avx | |||
add $256,%rdi | |||
call csubq128_avx | |||
ret | |||
tomont128_avx: | |||
#load | |||
vmovdqa (%rdi),%ymm3 | |||
@@ -1,6 +1,10 @@ | |||
.macro red16 r,x=12 | |||
.macro red16 r,rs=0,x=12 | |||
vpmulhw %ymm1,%ymm\r,%ymm\x | |||
.if \rs | |||
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x | |||
.else | |||
vpsraw $10,%ymm\x,%ymm\x | |||
.endif | |||
vpmullw %ymm0,%ymm\x,%ymm\x | |||
vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
@@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r | |||
vpsraw $15,%ymm\r,%ymm\x | |||
vpand %ymm0,%ymm\x,%ymm\x | |||
vpaddw %ymm\x,%ymm\r,%ymm\r | |||
#vpcmpgtw %ymm0,%ymm\r,%ymm\x | |||
#vpand %ymm0,%ymm\x,%ymm\x | |||
#vpsubw %ymm\x,%ymm\r,%ymm\r | |||
.endm | |||
.macro caddq r,x=12 | |||
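
Per 16-bit lane, the red16 macro above performs a Barrett reduction by q: ymm1 is expected to hold the multiplier V = 20159, the high product shifted ten further bits gives an approximate quotient, and one multiply-subtract removes that many copies of q. The new optional rs operand replaces the plain shift with a rounding vpmulhrsw. A scalar sketch of the default (truncating) path, with our own function name:

#include <stdint.h>

#define KYBER_Q 3329

/* Sketch of the per-lane reduction done by red16 (default path):
 * v = floor(2^26/q + 0.5); t = (v*a) >> 26 approximates a/q; subtract t*q.
 * Assumes arithmetic right shift on negative values, as the reference code does. */
static int16_t barrett_reduce_ref(int16_t a) {
    const int16_t v = (int16_t)(((1 << 26) + KYBER_Q / 2) / KYBER_Q); /* 20159 */
    int16_t t = (int16_t)(((int32_t)v * a) >> 26);
    return (int16_t)(a - t * KYBER_Q);
}

The result is congruent to a modulo q and small in magnitude; the conditional add/subtract macros that follow map such values to canonical representatives where needed.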
@@ -8,6 +8,7 @@ | |||
#include "randombytes.h" | |||
#include "rejsample.h" | |||
#include "symmetric.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
@@ -15,11 +16,14 @@ | |||
* Name: pack_pk | |||
* | |||
* Description: Serialize the public key as concatenation of the | |||
* serialized vector of polynomials pk | |||
* and the public seed used to generate the matrix A. | |||
* serialized vector of polynomials pk and the | |||
* public seed used to generate the matrix A. | |||
* The polynomial coefficients in pk are assumed to | |||
* lie in the interval [0,q], i.e. pk must be reduced | |||
* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce(). | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
@@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, | |||
/************************************************* | |||
* Name: pack_sk | |||
* | |||
* Description: Serialize the secret key | |||
* Description: Serialize the secret key. | |||
* The polynomial coefficients in sk are assumed to | |||
* lie in the interval [0,q], i.e. sk must be reduced | |||
* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce(). | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
@@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
/************************************************* | |||
* Name: unpack_sk | |||
* | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* Description: De-serialize the secret key; inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials | |||
* (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, | |||
* | |||
* Description: Serialize the ciphertext as concatenation of the | |||
* compressed and serialized vector of polynomials b | |||
* and the compressed and serialized polynomial v | |||
* and the compressed and serialized polynomial v. | |||
* The polynomial coefficients in b and v are assumed to | |||
* lie in the interval [0,q], i.e. b and v must be reduced | |||
* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce() and PQCLEAN_KYBER1024_AVX2_poly_reduce(), respectively. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* poly *pk: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
* poly *pk: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_compress(r, b); | |||
PQCLEAN_KYBER1024_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER1024_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
@@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* Arguments: - int16_t *r: pointer to output array | |||
* - unsigned int len: requested number of 16-bit integers (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
@@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
uint16_t val0, val1; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; | |||
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; | |||
pos += 3; | |||
if (val < 19 * KYBER_Q) { | |||
val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction | |||
r[ctr++] = (int16_t)val; | |||
if (val0 < KYBER_Q) { | |||
r[ctr++] = val0; | |||
} | |||
if (ctr < len && val1 < KYBER_Q) { | |||
r[ctr++] = val1; | |||
} | |||
} | |||
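
The rewritten loop consumes three bytes per iteration and splits them into two 12-bit candidates, each accepted only if it is below q = 3329, so the old Barrett correction of the candidate is no longer needed. A tiny standalone illustration of the unpacking (example bytes and names are ours):

#include <stdint.h>
#include <stdio.h>

/* Three bytes yield two candidates in [0, 4095]; each is kept iff < q = 3329. */
int main(void) {
    const uint8_t buf[3] = {0x34, 0xd2, 0xff};
    uint16_t val0 = ((buf[0] >> 0) | ((uint16_t)buf[1] << 8)) & 0xFFF; /* low 12 bits  */
    uint16_t val1 = ((buf[1] >> 4) | ((uint16_t)buf[2] << 4)) & 0xFFF; /* high 12 bits */
    printf("val0=%u accepted=%d, val1=%u accepted=%d\n",
           (unsigned)val0, val0 < 3329, (unsigned)val1, val1 < 3329);
    return 0;
}

With q/2^12 = 3329/4096, roughly 81% of the candidates are accepted.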
@@ -165,61 +169,54 @@ static unsigned int rej_uniform(int16_t *r, | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
**************************************************/ | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { | |||
unsigned int i, ctr0, ctr1, ctr2, ctr3; | |||
ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; | |||
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * SHAKE128_RATE) buf[4]; | |||
__m256i f; | |||
keccakx4_state state; | |||
for (i = 0; i < 4; i++) { | |||
f = _mm256_load_si256((__m256i *)seed); | |||
_mm256_store_si256((__m256i *)buf.arr[0], f); | |||
_mm256_store_si256((__m256i *)buf.arr[1], f); | |||
_mm256_store_si256((__m256i *)buf.arr[2], f); | |||
_mm256_store_si256((__m256i *)buf.arr[3], f); | |||
f = _mm256_loadu_si256((__m256i *)seed); | |||
_mm256_store_si256(buf[0].vec, f); | |||
_mm256_store_si256(buf[1].vec, f); | |||
_mm256_store_si256(buf[2].vec, f); | |||
_mm256_store_si256(buf[3].vec, f); | |||
if (transposed) { | |||
buf.arr[0][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[0][KYBER_SYMBYTES + 1] = 0; | |||
buf.arr[1][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[1][KYBER_SYMBYTES + 1] = 1; | |||
buf.arr[2][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[2][KYBER_SYMBYTES + 1] = 2; | |||
buf.arr[3][KYBER_SYMBYTES + 0] = i; | |||
buf.arr[3][KYBER_SYMBYTES + 1] = 3; | |||
buf[0].coeffs[32] = i; | |||
buf[0].coeffs[33] = 0; | |||
buf[1].coeffs[32] = i; | |||
buf[1].coeffs[33] = 1; | |||
buf[2].coeffs[32] = i; | |||
buf[2].coeffs[33] = 2; | |||
buf[3].coeffs[32] = i; | |||
buf[3].coeffs[33] = 3; | |||
} else { | |||
buf.arr[0][KYBER_SYMBYTES + 0] = 0; | |||
buf.arr[0][KYBER_SYMBYTES + 1] = i; | |||
buf.arr[1][KYBER_SYMBYTES + 0] = 1; | |||
buf.arr[1][KYBER_SYMBYTES + 1] = i; | |||
buf.arr[2][KYBER_SYMBYTES + 0] = 2; | |||
buf.arr[2][KYBER_SYMBYTES + 1] = i; | |||
buf.arr[3][KYBER_SYMBYTES + 0] = 3; | |||
buf.arr[3][KYBER_SYMBYTES + 1] = i; | |||
buf[0].coeffs[32] = 0; | |||
buf[0].coeffs[33] = i; | |||
buf[1].coeffs[32] = 1; | |||
buf[1].coeffs[33] = i; | |||
buf[2].coeffs[32] = 2; | |||
buf[2].coeffs[33] = i; | |||
buf[3].coeffs[32] = 3; | |||
buf[3].coeffs[33] = i; | |||
} | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], | |||
GEN_MATRIX_NBLOCKS, &state); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); | |||
ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf.arr[0]); | |||
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf.arr[1]); | |||
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf.arr[2]); | |||
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf.arr[3]); | |||
ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf[0].coeffs); | |||
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf[1].coeffs); | |||
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf[2].coeffs); | |||
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf[3].coeffs); | |||
while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); | |||
ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], | |||
XOF_BLOCKBYTES); | |||
ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], | |||
XOF_BLOCKBYTES); | |||
ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], | |||
XOF_BLOCKBYTES); | |||
ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], | |||
XOF_BLOCKBYTES); | |||
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); | |||
ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE); | |||
ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE); | |||
ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE); | |||
ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE); | |||
} | |||
PQCLEAN_KYBER1024_AVX2_poly_nttunpack(&a[i].vec[0]); | |||
@@ -243,27 +240,26 @@ void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int t | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
unsigned int i; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
const uint8_t *publicseed = buf.arr; | |||
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
const uint8_t *publicseed = buf; | |||
const uint8_t *noiseseed = buf + KYBER_SYMBYTES; | |||
polyvec a[KYBER_K], e, pkpv, skpv; | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
randombytes(buf, KYBER_SYMBYTES); | |||
hash_g(buf, buf, KYBER_SYMBYTES); | |||
gen_a(a, publicseed); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, | |||
0, 1, 2, 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, | |||
4, 5, 6, 7); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, 0, 1, 2, 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, 4, 5, 6, 7); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&skpv); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&skpv); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&e); | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_AVX2_poly_tomont(&pkpv.vec[i]); | |||
} | |||
@@ -280,55 +276,51 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins used as seed | |||
* (of length KYBER_SYMBYTES) to deterministically | |||
* generate all randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], | |||
const uint8_t coins[KYBER_SYMBYTES]) { | |||
unsigned int i; | |||
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
polyvec sp, pkpv, ep, at[KYBER_K], b; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed.arr, pk); | |||
unpack_pk(&pkpv, seed, pk); | |||
PQCLEAN_KYBER1024_AVX2_poly_frommsg(&k, m); | |||
gen_at(at, seed.arr); | |||
gen_at(at, seed); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, | |||
0, 1, 2, 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, | |||
4, 5, 6, 7); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, 8); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, 0, 1, 2, 3); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, 4, 5, 6, 7); | |||
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(&epp, coins, 8); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&b); | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_add(&b, &b, &ep); | |||
PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &epp); | |||
PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &k); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&bp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&b); | |||
PQCLEAN_KYBER1024_AVX2_poly_reduce(&v); | |||
pack_ciphertext(c, &bp, &v); | |||
pack_ciphertext(c, &b, &v); | |||
} | |||
/************************************************* | |||
@@ -337,24 +329,24 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
polyvec b, skpv; | |||
poly v, mp; | |||
unpack_ciphertext(&bp, &v, c); | |||
unpack_ciphertext(&b, &v, c); | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&b); | |||
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); | |||
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER1024_AVX2_poly_sub(&mp, &v, &mp); | |||
@@ -2,22 +2,21 @@ | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 | |||
#update & mul | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm12 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm13 | |||
vpsubw %ymm\rh2,%ymm\rl2,%ymm14 | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 | |||
vpsubw %ymm\rl0,%ymm\rh0,%ymm12 | |||
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\rl1,%ymm\rh1,%ymm13 | |||
vpmullw %ymm\zl0,%ymm12,%ymm\rh0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\rl2,%ymm\rh2,%ymm14 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 | |||
vpmullw %ymm\zl0,%ymm13,%ymm\rh1 | |||
vpsubw %ymm\rh3,%ymm\rl3,%ymm15 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm\rl3,%ymm\rh3,%ymm15 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 | |||
vpmullw %ymm\zl1,%ymm14,%ymm\rh2 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 | |||
vpmullw %ymm\zl1,%ymm15,%ymm\rh3 | |||
vpmulhw %ymm\zh0,%ymm12,%ymm12 | |||
@@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 | |||
vpmulhw %ymm\zh1,%ymm14,%ymm14 | |||
vpmulhw %ymm\zh1,%ymm15,%ymm15 | |||
#reduce | |||
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 | |||
vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 | |||
vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 | |||
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 | |||
# | |||
# | |||
vpsubw %ymm\rh0,%ymm12,%ymm\rh0 | |||
vpsubw %ymm\rh1,%ymm13,%ymm\rh1 | |||
vpsubw %ymm\rh2,%ymm14,%ymm\rh2 | |||
vpsubw %ymm\rh3,%ymm15,%ymm\rh3 | |||
.endm | |||
.text | |||
invntt_levels0t5_avx: | |||
level0: | |||
#zetas | |||
vmovdqu (%rsi),%ymm15 | |||
vmovdqu 64(%rsi),%ymm3 | |||
vmovdqu 32(%rsi),%ymm1 | |||
vmovdqu 96(%rsi),%ymm2 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly 4,5,8,9,6,7,10,11,15,3,1,2 | |||
level1: | |||
#zetas | |||
vmovdqu 128(%rsi),%ymm3 | |||
vmovdqu 160(%rsi),%ymm2 | |||
butterfly 4,5,6,7,8,9,10,11,3,3,2,2 | |||
.macro intt_levels0t5 off | |||
/* level 0 */ | |||
vmovdqa _16XFLO*2(%rsi),%ymm2 | |||
vmovdqa _16XFHI*2(%rsi),%ymm3 | |||
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 | |||
fqmulprecomp 2,3,4 | |||
fqmulprecomp 2,3,6 | |||
fqmulprecomp 2,3,5 | |||
fqmulprecomp 2,3,7 | |||
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 | |||
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 | |||
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 | |||
vmovdqa (128*\off+112)*2(%rdi),%ymm11 | |||
fqmulprecomp 2,3,8 | |||
fqmulprecomp 2,3,10 | |||
fqmulprecomp 2,3,9 | |||
fqmulprecomp 2,3,11 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 | |||
vmovdqa _REVIDXB*2(%rsi),%ymm12 | |||
vpshufb %ymm12,%ymm15,%ymm15 | |||
vpshufb %ymm12,%ymm1,%ymm1 | |||
vpshufb %ymm12,%ymm2,%ymm2 | |||
vpshufb %ymm12,%ymm3,%ymm3 | |||
butterfly 4,5,8,9,6,7,10,11,15,1,2,3 | |||
/* level 1 */ | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 | |||
vmovdqa _REVIDXB*2(%rsi),%ymm1 | |||
vpshufb %ymm1,%ymm2,%ymm2 | |||
vpshufb %ymm1,%ymm3,%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11,2,2,3,3 | |||
shuffle1 4,5,3,5 | |||
shuffle1 6,7,4,7 | |||
shuffle1 8,9,6,9 | |||
shuffle1 10,11,8,11 | |||
level2: | |||
#zetas | |||
vmovdqu 192(%rsi),%ymm10 | |||
vmovdqu 224(%rsi),%ymm2 | |||
#consts | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
/* level 2 */ | |||
vmovdqa _REVIDXD*2(%rsi),%ymm12 | |||
vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 | |||
vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 | |||
butterfly 3,4,6,8,5,7,9,11,10,10,2,2 | |||
butterfly 3,4,6,8,5,7,9,11,2,2,10,10 | |||
vmovdqa _16XV*2(%rsi),%ymm1 | |||
red16 3 | |||
shuffle2 3,4,10,4 | |||
@@ -87,26 +110,22 @@ shuffle2 6,8,3,8 | |||
shuffle2 5,7,6,7 | |||
shuffle2 9,11,5,11 | |||
level3: | |||
#zetas | |||
vmovdqu 256(%rsi),%ymm9 | |||
vmovdqu 288(%rsi),%ymm2 | |||
/* level 3 */ | |||
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 | |||
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 | |||
butterfly 10,3,6,5,4,8,7,11,9,9,2,2 | |||
red16 10 | |||
butterfly 10,3,6,5,4,8,7,11,2,2,9,9 | |||
shuffle4 10,3,9,3 | |||
shuffle4 6,5,10,5 | |||
shuffle4 4,8,6,8 | |||
shuffle4 7,11,4,11 | |||
level4: | |||
#zetas | |||
vmovdqu 320(%rsi),%ymm7 | |||
vmovdqu 352(%rsi),%ymm2 | |||
/* level 4 */ | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 | |||
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 | |||
butterfly 9,10,6,4,3,5,8,11,7,7,2,2 | |||
butterfly 9,10,6,4,3,5,8,11,2,2,7,7 | |||
red16 9 | |||
@@ -115,113 +134,62 @@ shuffle8 6,4,9,4 | |||
shuffle8 3,5,6,5 | |||
shuffle8 8,11,3,11 | |||
level5: | |||
#zetas | |||
vpbroadcastd 384(%rsi),%ymm8 | |||
vpbroadcastd 388(%rsi),%ymm2 | |||
butterfly 7,9,6,3,10,4,5,11,8,8,2,2 | |||
red16 7 | |||
#store | |||
vmovdqa %ymm7,(%rdi) | |||
vmovdqa %ymm9,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm3,96(%rdi) | |||
vmovdqa %ymm10,128(%rdi) | |||
vmovdqa %ymm4,160(%rdi) | |||
vmovdqa %ymm5,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
/* level5 */ | |||
vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 | |||
vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 | |||
ret | |||
butterfly 7,9,6,3,10,4,5,11,2,2,8,8 | |||
invntt_level6_avx: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm1 | |||
vpbroadcastd 4(%rsi),%ymm2 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 256(%rdi),%ymm8 | |||
vmovdqa 288(%rdi),%ymm9 | |||
vmovdqa 320(%rdi),%ymm10 | |||
vmovdqa 352(%rdi),%ymm11 | |||
vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) | |||
vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) | |||
vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) | |||
vmovdqa %ymm11,(128*\off+112)*2(%rdi) | |||
.endm | |||
butterfly 4,5,6,7,8,9,10,11 | |||
.macro intt_level6 off | |||
/* level 6 */ | |||
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (64*\off+128)*2(%rdi),%ymm8 | |||
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (64*\off+144)*2(%rdi),%ymm9 | |||
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 | |||
#consts | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,256(%rdi) | |||
vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
#load | |||
vmovdqa 128(%rdi),%ymm4 | |||
vmovdqa 160(%rdi),%ymm5 | |||
vmovdqa 192(%rdi),%ymm6 | |||
vmovdqa 224(%rdi),%ymm7 | |||
vmovdqa 384(%rdi),%ymm8 | |||
vmovdqa 416(%rdi),%ymm9 | |||
vmovdqa 448(%rdi),%ymm10 | |||
vmovdqa 480(%rdi),%ymm11 | |||
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (64*\off+160)*2(%rdi),%ymm10 | |||
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 | |||
vmovdqa (64*\off+176)*2(%rdi),%ymm11 | |||
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 | |||
butterfly 4,5,6,7,8,9,10,11 | |||
#consts | |||
vmovdqa _16XFLO*2(%rdx),%ymm12 | |||
vmovdqa _16XFHI*2(%rdx),%ymm13 | |||
#store | |||
vmovdqa %ymm8,384(%rdi) | |||
vmovdqa %ymm9,416(%rdi) | |||
vmovdqa %ymm10,448(%rdi) | |||
vmovdqa %ymm11,480(%rdi) | |||
fqmulprecomp 12,13,4,8 | |||
fqmulprecomp 12,13,5,9 | |||
fqmulprecomp 12,13,6,10 | |||
fqmulprecomp 12,13,7,11 | |||
#store | |||
vmovdqa %ymm4,128(%rdi) | |||
vmovdqa %ymm5,160(%rdi) | |||
vmovdqa %ymm6,192(%rdi) | |||
vmovdqa %ymm7,224(%rdi) | |||
ret | |||
.if \off == 0 | |||
red16 4 | |||
.endif | |||
vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm8,(64*\off+128)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+144)*2(%rdi) | |||
vmovdqa %ymm10,(64*\off+160)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+176)*2(%rdi) | |||
.endm | |||
.text | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx) | |||
.global _cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx): | |||
_cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
mov %rsi,%rdx | |||
add $_ZETAS_INV_EXP*2,%rsi | |||
call invntt_levels0t5_avx | |||
add $256,%rdi | |||
add $392,%rsi | |||
call invntt_levels0t5_avx | |||
sub $256,%rdi | |||
add $392,%rsi | |||
call invntt_level6_avx | |||
intt_levels0t5 0 | |||
intt_levels0t5 1 | |||
intt_level6 0 | |||
intt_level6 1 | |||
ret |
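
Each butterfly invocation above runs sixteen Gentleman-Sande butterflies in parallel, with the twiddle multiplication done as a signed Montgomery multiplication (vpmullw against the precomputed zeta*qinv halves, vpmulhw against the zetas, then one vpmulhw/vpsubw pair against q), and the red16 calls between levels keep the accumulated sums within 16-bit range. In scalar form one butterfly looks roughly like the sketch below (our own names, in the spirit of the reference implementation; inputs are assumed bounded so the 16-bit additions cannot overflow):

#include <stdint.h>

#define KYBER_Q 3329
#define QINV (-3327)   /* q^-1 mod 2^16, sign-extended */

/* Signed Montgomery reduction: for |a| < q*2^15 returns t = a * 2^-16 mod q
 * with |t| < q. Assumes arithmetic right shift, as the reference code does. */
static int16_t montgomery_reduce(int32_t a) {
    int16_t t = (int16_t)((int16_t)a * (int16_t)QINV);
    t = (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);
    return t;
}

/* One Gentleman-Sande (inverse-NTT) butterfly; the vectorized `butterfly`
 * macro performs sixteen of these per register pair. */
static void gs_butterfly(int16_t *lo, int16_t *hi, int16_t zeta) {
    int16_t t = (int16_t)(*hi - *lo);               /* difference goes to the twiddle   */
    *lo = (int16_t)(*lo + *hi);                     /* sum stays in place               */
    *hi = montgomery_reduce((int32_t)zeta * t);     /* multiply by zeta, Montgomery form */
}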
@@ -1,4 +1,3 @@ | |||
#include "align.h" | |||
#include "indcpa.h" | |||
#include "kem.h" | |||
#include "params.h" | |||
@@ -15,13 +14,14 @@ | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], | |||
unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
PQCLEAN_KYBER1024_AVX2_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -40,36 +40,36 @@ int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char * | |||
* secret for given public key | |||
* | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char pk[KYBER_PUBLICKEYBYTES]) { | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
randombytes(buf.arr, KYBER_SYMBYTES); | |||
randombytes(buf, KYBER_SYMBYTES); | |||
/* Don't release system RNG output */ | |||
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); | |||
hash_h(buf, buf, KYBER_SYMBYTES); | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} | |||
@@ -80,47 +80,47 @@ int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, | |||
* cipher text and private key | |||
* | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
const unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
int fail; | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; | |||
uint8_t cmp[KYBER_CIPHERTEXTBYTES]; | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; | |||
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; | |||
PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf.arr, ct, sk); | |||
PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf, ct, sk); | |||
/* Multitarget countermeasure for coins + contributory KEM */ | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; | |||
} | |||
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); | |||
hash_g(kr, buf, 2 * KYBER_SYMBYTES); | |||
/* coins are in kr+KYBER_SYMBYTES */ | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); | |||
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); | |||
fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); | |||
fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); | |||
/* overwrite coins in kr with H(c) */ | |||
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); | |||
/* Overwrite pre-k with z on re-encryption failure */ | |||
PQCLEAN_KYBER1024_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); | |||
PQCLEAN_KYBER1024_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); | |||
/* hash concatenation of pre-k and H(c) to k */ | |||
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); | |||
kdf(ss, kr, 2 * KYBER_SYMBYTES); | |||
return 0; | |||
} |
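For orientation, the following editor's sketch (not part of the diff) shows the intended round trip through the KEM calls above, assuming the usual PQClean keypair entry point PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair and the length macros from params.h.

#include <string.h>

/* Minimal round-trip sketch: encapsulate against a fresh key pair and check that
 * both sides derive the same 32-byte shared secret. The keypair function is an
 * assumption here (standard PQClean signature), not shown in this diff. */
static int kem_roundtrip(void) {
    unsigned char pk[KYBER_PUBLICKEYBYTES], sk[KYBER_SECRETKEYBYTES];
    unsigned char ct[KYBER_CIPHERTEXTBYTES];
    unsigned char ss1[KYBER_SSBYTES], ss2[KYBER_SSBYTES];

    PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(pk, sk);   /* assumed entry point */
    PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(ct, ss1, pk);  /* sender side */
    PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(ss2, ct, sk);  /* receiver side */
    return memcmp(ss1, ss2, KYBER_SSBYTES) == 0;         /* 1 on success */
}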
@@ -1,222 +1,191 @@ | |||
#include "cdecl.h" | |||
.include "shuffle.inc" | |||
.include "fq.inc" | |||
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
vpmullw %ymm\zl1,%ymm\rh2,%ymm14 | |||
vpmullw %ymm\zl1,%ymm\rh3,%ymm15 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 | |||
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 | |||
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 | |||
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 | |||
.endm | |||
#reduce | |||
.macro reduce | |||
vpmulhw %ymm0,%ymm12,%ymm12 | |||
vpmulhw %ymm0,%ymm13,%ymm13 | |||
vpmulhw %ymm0,%ymm14,%ymm14 | |||
vpmulhw %ymm0,%ymm15,%ymm15 | |||
vpsubw %ymm12,%ymm\rh0,%ymm12 | |||
vpsubw %ymm13,%ymm\rh1,%ymm13 | |||
vpsubw %ymm14,%ymm\rh2,%ymm14 | |||
vpsubw %ymm15,%ymm\rh3,%ymm15 | |||
#update | |||
vpsubw %ymm12,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm13,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm14,%ymm\rl2,%ymm\rh2 | |||
vpaddw %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rh3 | |||
vpaddw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
# We break the dependency chains at the cost of slightly more additions. | |||
# But they can be run in parallel to the multiplications on execution port 5 | |||
# (multiplications only go to ports 0 and 1) | |||
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 | |||
#mul | |||
vpmullw %ymm\zl0,%ymm\rh0,%ymm12 | |||
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x | |||
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 | |||
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 | |||
vpmullw %ymm\zl1,%ymm\rh2,%ymm14 | |||
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y | |||
vpmullw %ymm\zl1,%ymm\rh3,%ymm15 | |||
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 | |||
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 | |||
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln | |||
vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 | |||
#reduce | |||
vpmulhw %ymm0,%ymm12,%ymm12 | |||
vpmulhw %ymm0,%ymm13,%ymm13 | |||
vpmulhw %ymm0,%ymm14,%ymm14 | |||
vpmulhw %ymm0,%ymm15,%ymm15 | |||
vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 | |||
vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 | |||
vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 | |||
vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 | |||
vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 | |||
vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 | |||
vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 | |||
vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 | |||
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 | |||
vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 | |||
#update | |||
vpsubw %ymm12,%ymm\rln,%ymm\rln | |||
vpaddw %ymm12,%ymm\rh0,%ymm\rh0 | |||
vpsubw %ymm12,%ymm\rl0,%ymm\rl0 | |||
vpsubw %ymm13,%ymm\rl0,%ymm\rl0 | |||
vpaddw %ymm13,%ymm\rh1,%ymm\rh1 | |||
vpsubw %ymm13,%ymm\rl1,%ymm\rl1 | |||
vpsubw %ymm14,%ymm\rl1,%ymm\rl1 | |||
vpaddw %ymm14,%ymm\rh2,%ymm\rh2 | |||
vpsubw %ymm14,%ymm\rl2,%ymm\rl2 | |||
vpsubw %ymm15,%ymm\rl2,%ymm\rl2 | |||
vpaddw %ymm15,%ymm\rh3,%ymm\rh3 | |||
vpsubw %ymm15,%ymm\rl3,%ymm\rl3 | |||
.endm | |||
.text | |||
ntt_level0_avx: | |||
level0: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
vpbroadcastd 4(%rsi),%ymm1 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 256(%rdi),%ymm8 | |||
vmovdqa 288(%rdi),%ymm9 | |||
vmovdqa 320(%rdi),%ymm10 | |||
vmovdqa 352(%rdi),%ymm11 | |||
butterfly2 4,5,6,7,8,9,10,11 | |||
#store | |||
vmovdqa %ymm4,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm6,64(%rdi) | |||
vmovdqa %ymm7,96(%rdi) | |||
vmovdqa %ymm8,256(%rdi) | |||
vmovdqa %ymm9,288(%rdi) | |||
vmovdqa %ymm10,320(%rdi) | |||
vmovdqa %ymm11,352(%rdi) | |||
.macro level0 off | |||
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 | |||
vmovdqa (64*\off+128)*2(%rdi),%ymm8 | |||
vmovdqa (64*\off+144)*2(%rdi),%ymm9 | |||
vmovdqa (64*\off+160)*2(%rdi),%ymm10 | |||
vmovdqa (64*\off+176)*2(%rdi),%ymm11 | |||
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 | |||
mul 8,9,10,11 | |||
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 | |||
reduce | |||
update 3,4,5,6,7,8,9,10,11 | |||
vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm8,(64*\off+128)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+144)*2(%rdi) | |||
vmovdqa %ymm10,(64*\off+160)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+176)*2(%rdi) | |||
.endm | |||
ret | |||
.macro levels1t6 off | |||
/* level 1 */ | |||
vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 | |||
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 | |||
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 | |||
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 | |||
vmovdqa (128*\off+112)*2(%rdi),%ymm11 | |||
vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 | |||
ntt_levels1t6_avx: | |||
level1: | |||
#zetas | |||
vpbroadcastd (%rsi),%ymm15 | |||
vpbroadcastd 4(%rsi),%ymm1 | |||
#load | |||
vmovdqa (%rdi),%ymm4 | |||
vmovdqa 32(%rdi),%ymm5 | |||
vmovdqa 64(%rdi),%ymm6 | |||
vmovdqa 96(%rdi),%ymm7 | |||
vmovdqa 128(%rdi),%ymm8 | |||
vmovdqa 160(%rdi),%ymm9 | |||
vmovdqa 192(%rdi),%ymm10 | |||
vmovdqa 224(%rdi),%ymm11 | |||
butterfly2 4,5,6,7,8,9,10,11,3 | |||
level2: | |||
#zetas | |||
vmovdqu 8(%rsi),%ymm15 | |||
vmovdqu 40(%rsi),%ymm1 | |||
shuffle8 4,8,3,8 | |||
shuffle8 5,9,4,9 | |||
shuffle8 6,10,5,10 | |||
shuffle8 7,11,6,11 | |||
butterfly2 3,8,4,9,5,10,6,11,7 | |||
level3: | |||
#zetas | |||
vmovdqu 72(%rsi),%ymm15 | |||
vmovdqu 104(%rsi),%ymm1 | |||
shuffle4 3,5,7,5 | |||
shuffle4 8,10,3,10 | |||
shuffle4 4,6,8,6 | |||
shuffle4 9,11,4,11 | |||
butterfly2 7,5,3,10,8,6,4,11,9 | |||
level4: | |||
#zetas | |||
vmovdqu 136(%rsi),%ymm15 | |||
vmovdqu 168(%rsi),%ymm1 | |||
shuffle2 7,8,9,8 | |||
shuffle2 5,6,7,6 | |||
shuffle2 3,4,5,4 | |||
shuffle2 10,11,3,11 | |||
butterfly2 9,8,7,6,5,4,3,11,10 | |||
level5: | |||
#zetas | |||
vmovdqu 200(%rsi),%ymm15 | |||
vmovdqu 232(%rsi),%ymm1 | |||
shuffle1 9,5,10,5 | |||
shuffle1 8,4,9,4 | |||
shuffle1 7,3,8,3 | |||
shuffle1 6,11,7,11 | |||
butterfly2 10,5,9,4,8,3,7,11,6 | |||
level6: | |||
#zetas | |||
vmovdqu 264(%rsi),%ymm14 | |||
vmovdqu 328(%rsi),%ymm15 | |||
vmovdqu 296(%rsi),%ymm1 | |||
vmovdqu 360(%rsi),%ymm2 | |||
butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 | |||
vmovdqa _16XV*2(%rdx),%ymm1 | |||
red16 10,12 | |||
red16 5,13 | |||
red16 9,14 | |||
red16 4,15 | |||
red16 8,2 | |||
red16 3,6 | |||
red16 7,12 | |||
red16 11,13 | |||
#store | |||
vmovdqa %ymm10,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
vmovdqa %ymm9,64(%rdi) | |||
vmovdqa %ymm4,96(%rdi) | |||
vmovdqa %ymm8,128(%rdi) | |||
vmovdqa %ymm3,160(%rdi) | |||
vmovdqa %ymm7,192(%rdi) | |||
vmovdqa %ymm11,224(%rdi) | |||
mul 8,9,10,11 | |||
ret | |||
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 | |||
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 | |||
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 | |||
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 | |||
reduce | |||
update 3,4,5,6,7,8,9,10,11 | |||
/* level 2 */ | |||
shuffle8 5,10,7,10 | |||
shuffle8 6,11,5,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 | |||
mul 7,10,5,11 | |||
shuffle8 3,8,6,8 | |||
shuffle8 4,9,3,9 | |||
reduce | |||
update 4,6,8,3,9,7,10,5,11 | |||
/* level 3 */ | |||
shuffle4 8,5,9,5 | |||
shuffle4 3,11,8,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 | |||
mul 9,5,8,11 | |||
shuffle4 4,7,3,7 | |||
shuffle4 6,10,4,10 | |||
reduce | |||
update 6,3,7,4,10,9,5,8,11 | |||
/* level 4 */ | |||
shuffle2 7,8,10,8 | |||
shuffle2 4,11,7,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 | |||
mul 10,8,7,11 | |||
shuffle2 6,9,4,9 | |||
shuffle2 3,5,6,5 | |||
reduce | |||
update 3,4,9,6,5,10,8,7,11 | |||
/* level 5 */ | |||
shuffle1 9,7,5,7 | |||
shuffle1 6,11,9,11 | |||
vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 | |||
mul 5,7,9,11 | |||
shuffle1 3,10,6,10 | |||
shuffle1 4,8,3,8 | |||
reduce | |||
update 4,6,10,3,8,5,7,9,11 | |||
/* level 6 */ | |||
vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 | |||
vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 | |||
vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 | |||
vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 | |||
mul 10,3,9,11,14,15,8,2 | |||
reduce | |||
update 8,4,6,5,7,10,3,9,11 | |||
vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) | |||
vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) | |||
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) | |||
vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) | |||
vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) | |||
vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) | |||
vmovdqa %ymm11,(128*\off+112)*2(%rdi) | |||
.endm | |||
.text | |||
.global cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx) | |||
.global _cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx) | |||
cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx): | |||
_cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rsi),%ymm0 | |||
mov %rsi,%rdx | |||
add $_ZETAS_EXP*2,%rsi | |||
call ntt_level0_avx | |||
add $128,%rdi | |||
call ntt_level0_avx | |||
sub $128,%rdi | |||
add $8,%rsi | |||
call ntt_levels1t6_avx | |||
add $256,%rdi | |||
add $392,%rsi | |||
call ntt_levels1t6_avx | |||
level0 0 | |||
level0 1 | |||
levels1t6 0 | |||
levels1t6 1 | |||
ret |
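The mul/reduce/update macro triple above is the vectorized form of the usual Cooley-Tukey butterfly over Z_q with Montgomery multiplication. As a reading aid, here is a hedged scalar model of one butterfly; the QINV constant and the signed Montgomery reduction follow the reference C code and are assumptions of this sketch, not part of the diff.

#include <stdint.h>

#define Q    3329
#define QINV (-3327)  /* q^-1 mod 2^16 for q = 3329, as in the reference reduce.c (assumed) */

/* Montgomery multiplication: returns a*b*2^-16 mod q as a signed representative. */
static int16_t fqmul(int16_t a, int16_t b) {
    int32_t p = (int32_t)a * b;
    int16_t u = (int16_t)((int16_t)p * QINV);
    return (int16_t)((p - (int32_t)u * Q) >> 16);
}

/* One butterfly: what one lane of the mul + reduce + update macros computes. */
static void butterfly(int16_t *lo, int16_t *hi, int16_t zeta) {
    int16_t t = fqmul(zeta, *hi);  /* vpmullw/vpmulhw pair followed by the reduce macro */
    *hi = (int16_t)(*lo - t);      /* difference half, written back to the high registers */
    *lo = (int16_t)(*lo + t);      /* sum half, written back to the low registers */
}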
@@ -1,24 +1,21 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_NTT_H | |||
#define PQCLEAN_KYBER1024_AVX2_NTT_H | |||
#include "consts.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, | |||
const int16_t *a, | |||
const int16_t *b, | |||
const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, | |||
const int16_t *a, | |||
const int16_t *b, | |||
const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_basemul_avx(__m256i *r, | |||
const __m256i *a, | |||
const __m256i *b, | |||
const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
#endif |
@@ -7,8 +7,6 @@ | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
#define KYBER_ETA 2 | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
@@ -16,9 +14,12 @@ | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_ETA1 2 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) | |||
#define KYBER_ETA2 2 | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
@@ -12,76 +12,99 @@ | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_compress | |||
* | |||
* Description: Compression and subsequent serialization of a polynomial | |||
* Description: Compression and subsequent serialization of a polynomial. | |||
* The coefficients of the input polynomial are assumed to | |||
* lie in the interval [0,q], i.e. the polynomial must be reduced | |||
* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { | |||
unsigned int i, j; | |||
uint8_t t[8]; | |||
PQCLEAN_KYBER1024_AVX2_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[160], const poly *restrict a) { | |||
size_t i; | |||
uint32_t low; | |||
__m256i f0, f1; | |||
__m128i t0, t1; | |||
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XV / 16]); | |||
const __m256i shift1 = _mm256_set1_epi16(1 << 10); | |||
const __m256i mask = _mm256_set1_epi16(31); | |||
const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1); | |||
const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1); | |||
const __m256i sllvdidx = _mm256_set1_epi64x(12); | |||
const __m256i shufbidx = _mm256_set_epi8( 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9, | |||
-1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0); | |||
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); | |||
r[2] = (t[3] >> 1) | (t[4] << 4); | |||
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); | |||
r[4] = (t[6] >> 2) | (t[7] << 3); | |||
r += 5; | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_load_si256(&a->vec[2 * i + 0]); | |||
f1 = _mm256_load_si256(&a->vec[2 * i + 1]); | |||
f0 = _mm256_mulhi_epi16(f0, v); | |||
f1 = _mm256_mulhi_epi16(f1, v); | |||
f0 = _mm256_mulhrs_epi16(f0, shift1); | |||
f1 = _mm256_mulhrs_epi16(f1, shift1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f1 = _mm256_and_si256(f1, mask); | |||
f0 = _mm256_packus_epi16(f0, f1); | |||
f0 = _mm256_maddubs_epi16(f0, shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 | |||
f0 = _mm256_madd_epi16(f0, shift3); // a0 a1 b0 b1 a2 a3 b2 b3 | |||
f0 = _mm256_sllv_epi32(f0, sllvdidx); | |||
f0 = _mm256_srlv_epi64(f0, sllvdidx); | |||
f0 = _mm256_shuffle_epi8(f0, shufbidx); | |||
t0 = _mm256_castsi256_si128(f0); | |||
t1 = _mm256_extracti128_si256(f0, 1); | |||
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); | |||
_mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); | |||
_mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); | |||
r[20 * i + 16] = (uint8_t)low; | |||
r[20 * i + 17] = (uint8_t)(low >> 0x08); | |||
r[20 * i + 18] = (uint8_t)(low >> 0x10); | |||
r[20 * i + 19] = (uint8_t)(low >> 0x18); | |||
} | |||
} | |||
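The scalar path this vectorized routine replaces (visible as the removed lines interleaved above) maps each coefficient to round(32*x/q) mod 32 and packs eight 5-bit values into 5 bytes, so 256 coefficients yield the 160-byte output. A cleaned-up sketch of that removed reference path, for comparison:

/* Scalar reference for the 5-bit compression; coefficients assumed reduced to [0,q). */
static void poly_compress5_ref(uint8_t r[160], const int16_t coeffs[KYBER_N]) {
    uint8_t t[8];
    for (unsigned int i = 0; i < KYBER_N / 8; i++) {
        for (unsigned int j = 0; j < 8; j++) {
            t[j] = (uint8_t)(((((uint32_t)coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31);
        }
        r[5 * i + 0] = (uint8_t)((t[0] >> 0) | (t[1] << 5));
        r[5 * i + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
        r[5 * i + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4));
        r[5 * i + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
        r[5 * i + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3));
    }
}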
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_decompress | |||
* | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of PQCLEAN_KYBER1024_AVX2_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, | |||
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, const uint8_t a[160]) { | |||
unsigned int i; | |||
int16_t h; | |||
__m128i t; | |||
__m256i f; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]); | |||
const __m256i shufbidx = _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, | |||
4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0); | |||
const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31, | |||
248, 1984, 62, 496, 3968, 124, 992, 31); | |||
const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024, | |||
128, 16, 512, 64, 8, 256, 32, 1024); | |||
unsigned int j; | |||
uint8_t t[8]; | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
t[0] = (a[0] >> 0); | |||
t[1] = (a[0] >> 5) | (a[1] << 3); | |||
t[2] = (a[1] >> 2); | |||
t[3] = (a[1] >> 7) | (a[2] << 1); | |||
t[4] = (a[2] >> 4) | (a[3] << 4); | |||
t[5] = (a[3] >> 1); | |||
t[6] = (a[3] >> 6) | (a[4] << 2); | |||
t[7] = (a[4] >> 3); | |||
a += 5; | |||
for (j = 0; j < 8; j++) { | |||
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; | |||
} | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]); | |||
h = (a[10 * i + 9] << 8) + a[10 * i + 8]; | |||
t = _mm_insert_epi16(t, h, 4); | |||
f = _mm256_broadcastsi128_si256(t); | |||
f = _mm256_shuffle_epi8(f, shufbidx); | |||
f = _mm256_and_si256(f, mask); | |||
f = _mm256_mullo_epi16(f, shift); | |||
f = _mm256_mulhrs_epi16(f, q); | |||
_mm256_store_si256(&r->vec[i], f); | |||
} | |||
} | |||
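Likewise, the removed scalar decompression above recovers the approximation x ≈ round(t*q/32) from each 5-bit value t; a cleaned-up sketch of that removed path:

/* Scalar reference for the 5-bit decompression (approximate inverse of the compression above). */
static void poly_decompress5_ref(int16_t coeffs[KYBER_N], const uint8_t a[160]) {
    uint8_t t[8];
    for (unsigned int i = 0; i < KYBER_N / 8; i++) {
        t[0] = (uint8_t)(a[0] >> 0);
        t[1] = (uint8_t)((a[0] >> 5) | (a[1] << 3));
        t[2] = (uint8_t)(a[1] >> 2);
        t[3] = (uint8_t)((a[1] >> 7) | (a[2] << 1));
        t[4] = (uint8_t)((a[2] >> 4) | (a[3] << 4));
        t[5] = (uint8_t)(a[3] >> 1);
        t[6] = (uint8_t)((a[3] >> 6) | (a[4] << 2));
        t[7] = (uint8_t)(a[4] >> 3);
        a += 5;
        for (unsigned int j = 0; j < 8; j++) {
            coeffs[8 * i + j] = (int16_t)(((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5);
        }
    }
}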
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_tobytes | |||
* | |||
* Description: Serialization of a polynomial | |||
* Description: Serialization of a polynomial in NTT representation. | |||
* The coefficients of the input polynomial are assumed to | |||
* lie in the interval [0,q], i.e. the polynomial must be reduced | |||
* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). The coefficients are ordered as output by | |||
* PQCLEAN_KYBER1024_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed | |||
* order. | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -90,12 +113,12 @@ void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
* Description: De-serialization of a polynomial; | |||
* inverse of PQCLEAN_KYBER1024_AVX2_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { | |||
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -103,11 +126,10 @@ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, | |||
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3; | |||
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); | |||
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); | |||
@@ -136,12 +158,12 @@ void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, | |||
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ | |||
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ | |||
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ | |||
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) | |||
_mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ | |||
_mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ | |||
_mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ | |||
_mm256_store_si256(&r->vec[8+2*(i)+1],g3) | |||
f = _mm256_load_si256((__m256i *)msg); | |||
f = _mm256_loadu_si256((__m256i *)msg); | |||
FROMMSG64(0); | |||
FROMMSG64(1); | |||
FROMMSG64(2); | |||
@@ -151,32 +173,34 @@ void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_tomsg | |||
* | |||
* Description: Convert polynomial to 32-byte message | |||
* Description: Convert polynomial to 32-byte message. | |||
* The coefficients of the input polynomial are assumed to | |||
* lie in the interval [0,q], i.e. the polynomial must be reduced | |||
* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
* - poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { | |||
unsigned int i; | |||
uint32_t small; | |||
__m256i f0, f1, g0, g1; | |||
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); | |||
const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); | |||
const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); | |||
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); | |||
f0 = _mm256_sub_epi16(hqs, f0); | |||
f1 = _mm256_sub_epi16(hqs, f1); | |||
f0 = _mm256_load_si256(&a->vec[2 * i + 0]); | |||
f1 = _mm256_load_si256(&a->vec[2 * i + 1]); | |||
f0 = _mm256_sub_epi16(hq, f0); | |||
f1 = _mm256_sub_epi16(hq, f1); | |||
g0 = _mm256_srai_epi16(f0, 15); | |||
g1 = _mm256_srai_epi16(f1, 15); | |||
f0 = _mm256_xor_si256(f0, g0); | |||
f1 = _mm256_xor_si256(f1, g1); | |||
f0 = _mm256_sub_epi16(hhqs, f0); | |||
f1 = _mm256_sub_epi16(hhqs, f1); | |||
f0 = _mm256_sub_epi16(f0, hhq); | |||
f1 = _mm256_sub_epi16(f1, hhq); | |||
f0 = _mm256_packs_epi16(f0, f1); | |||
small = _mm256_movemask_epi8(f0); | |||
small = ~small; | |||
msg[4 * i + 0] = small; | |||
msg[4 * i + 1] = small >> 16; | |||
msg[4 * i + 2] = small >> 8; | |||
@@ -185,24 +209,43 @@ void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly | |||
} | |||
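A scalar model of the 1-bit compression performed here may help: a coefficient x in [0,q) maps to the message bit round(2*x/q) mod 2, which is what the subtract/absolute-value/movemask sequence above computes sixteen lanes at a time. The layout below (bit j of msg[i] taken from coefficient 8*i+j) follows the reference code and is an assumption of this sketch.

/* Scalar sketch of poly_tomsg: one bit per coefficient, eight coefficients per byte. */
static void poly_tomsg_ref(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t coeffs[KYBER_N]) {
    for (unsigned int i = 0; i < KYBER_N / 8; i++) {
        msg[i] = 0;
        for (unsigned int j = 0; j < 8; j++) {
            uint8_t t = (uint8_t)(((((uint32_t)coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1);
            msg[i] = (uint8_t)(msg[i] | (t << j));
        }
    }
}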
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA1 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1 | |||
prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); | |||
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r, buf.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* with parameter KYBER_ETA2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; | |||
prf(buf.arr, sizeof(buf.arr), seed, nonce); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r, buf.arr); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; | |||
prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); | |||
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(r, buf.vec); | |||
} | |||
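Both getnoise variants here use eta = 2 (KYBER_ETA1 = KYBER_ETA2 = 2 in params.h), i.e. the centered binomial distribution CBD_2: each coefficient is (b0 + b1) - (b2 + b3) for four consecutive PRF output bits. The following scalar sketch of what poly_cbd_eta1/poly_cbd_eta2 extract from the buffer follows the reference cbd code and is an assumption, not part of this diff.

/* Scalar CBD_2 sampler: buf holds KYBER_ETA1*KYBER_N/4 = 128 bytes of PRF output. */
static void cbd2_ref(int16_t coeffs[KYBER_N], const uint8_t buf[KYBER_N / 2]) {
    for (unsigned int i = 0; i < KYBER_N / 8; i++) {
        uint32_t t = (uint32_t)buf[4 * i + 0]
                   | ((uint32_t)buf[4 * i + 1] << 8)
                   | ((uint32_t)buf[4 * i + 2] << 16)
                   | ((uint32_t)buf[4 * i + 3] << 24);
        uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555); /* sum adjacent bit pairs */
        for (unsigned int j = 0; j < 8; j++) {
            int16_t a = (int16_t)((d >> (4 * j + 0)) & 0x3);
            int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
            coeffs[8 * i + j] = (int16_t)(a - b);
        }
    }
}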
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE) | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(poly *r0, | |||
poly *r1, | |||
poly *r2, | |||
poly *r3, | |||
@@ -211,41 +254,46 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
uint8_t nonce1, | |||
uint8_t nonce2, | |||
uint8_t nonce3) { | |||
ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; | |||
ALIGNED_UINT8(NOISE_NBLOCKS * SHAKE256_RATE) buf[4]; | |||
__m256i f; | |||
keccakx4_state state; | |||
f = _mm256_load_si256((__m256i *)seed); | |||
_mm256_store_si256((__m256i *)buf.arr[0], f); | |||
_mm256_store_si256((__m256i *)buf.arr[1], f); | |||
_mm256_store_si256((__m256i *)buf.arr[2], f); | |||
_mm256_store_si256((__m256i *)buf.arr[3], f); | |||
f = _mm256_loadu_si256((__m256i *)seed); | |||
_mm256_store_si256(buf[0].vec, f); | |||
_mm256_store_si256(buf[1].vec, f); | |||
_mm256_store_si256(buf[2].vec, f); | |||
_mm256_store_si256(buf[3].vec, f); | |||
buf.arr[0][32] = nonce0; | |||
buf.arr[1][32] = nonce1; | |||
buf.arr[2][32] = nonce2; | |||
buf.arr[3][32] = nonce3; | |||
buf[0].coeffs[32] = nonce0; | |||
buf[1].coeffs[32] = nonce1; | |||
buf[2].coeffs[32] = nonce2; | |||
buf[3].coeffs[32] = nonce3; | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); | |||
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r0, buf.arr[0]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r1, buf.arr[1]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r2, buf.arr[2]); | |||
PQCLEAN_KYBER1024_AVX2_cbd(r3, buf.arr[3]); | |||
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r0, buf[0].vec); | |||
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r1, buf[1].vec); | |||
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r2, buf[2].vec); | |||
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r3, buf[3].vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_ntt | |||
* | |||
* Description: Computes negacyclic number-theoretic transform (NTT) of | |||
* a polynomial in place; | |||
* inputs assumed to be in normal order, output in bitreversed order | |||
* a polynomial in place. | |||
* Input coefficients assumed to be in normal order, | |||
* output coefficients are in a special order that is natural | |||
* for the vectorization. Input coefficients are assumed to be | |||
* bounded by q in absolute value, output coefficients are bounded | |||
* by 16118 in absolute value. | |||
* | |||
* Arguments: - uint16_t *r: pointer to in/output polynomial | |||
* Arguments: - poly *r: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -253,29 +301,35 @@ void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) { | |||
* | |||
* Description: Computes inverse of negacyclic number-theoretic transform (NTT) | |||
* of a polynomial in place; | |||
* inputs assumed to be in bitreversed order, output in normal order | |||
* Input coefficients assumed to be in the special order from the vectorized | |||
* forward ntt, output in normal order. Input coefficients can be | |||
* arbitrary 16-bit integers, output coefficients are bounded by 14870 | |||
* in absolute value. | |||
* | |||
* Arguments: - uint16_t *a: pointer to in/output polynomial | |||
* Arguments: - poly *a: pointer to in/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* Description: Multiplication of two polynomials in NTT domain. | |||
* One of the input polynomials needs to have coefficients | |||
* bounded by q, the other polynomial can have arbitrary | |||
* coefficients. Output coefficients are bounded by 6656. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
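After the forward NTT the ring splits into 128 quadratic factors, so basemul operates on coefficient pairs modulo (X^2 - zeta): (a0 + a1*X)(b0 + b1*X) = (a0*b0 + zeta*a1*b1) + (a0*b1 + a1*b0)*X. A scalar sketch of one such pair, reusing the fqmul sketch given after ntt.S above (an assumption, not part of this diff):

/* One degree-1 base multiplication modulo (X^2 - zeta); fqmul as in the Montgomery sketch. */
static void basemul_pair(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
    r[0] = fqmul(a[1], b[1]);
    r[0] = fqmul(r[0], zeta);
    r[0] = (int16_t)(r[0] + fqmul(a[0], b[0])); /* a0*b0 + zeta*a1*b1 */
    r[1] = fqmul(a[0], b[1]);
    r[1] = (int16_t)(r[1] + fqmul(a[1], b[0])); /* a0*b1 + a1*b0 */
}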
/************************************************* | |||
@@ -287,7 +341,7 @@ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, cons | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
@@ -299,28 +353,16 @@ void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) { | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) { | |||
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); | |||
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* Description: Add two polynomials. No modular reduction | |||
* is performed. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -328,20 +370,21 @@ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) { | |||
unsigned int i; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f0 = _mm256_load_si256(&a->vec[i]); | |||
f1 = _mm256_load_si256(&b->vec[i]); | |||
f0 = _mm256_add_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
_mm256_store_si256(&r->vec[i], f0); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* Description: Subtract two polynomials. No modular reduction | |||
* is performed. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -349,10 +392,10 @@ void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { | |||
unsigned int i; | |||
__m256i f0, f1; | |||
for (i = 0; i < KYBER_N; i += 16) { | |||
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); | |||
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f0 = _mm256_load_si256(&a->vec[i]); | |||
f1 = _mm256_load_si256(&b->vec[i]); | |||
f0 = _mm256_sub_epi16(f0, f1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[i], f0); | |||
_mm256_store_si256(&r->vec[i], f0); | |||
} | |||
} |
@@ -1,19 +1,13 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_POLY_H | |||
#define PQCLEAN_KYBER1024_AVX2_POLY_H | |||
#include "align.h" | |||
#include "params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
/* | |||
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial | |||
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1] | |||
*/ | |||
typedef union { | |||
__m256i dummy; | |||
int16_t coeffs[KYBER_N]; | |||
} poly; | |||
typedef ALIGNED_INT16(KYBER_N) poly; | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
@@ -22,8 +16,11 @@ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY | |||
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(poly *r0, | |||
poly *r1, | |||
poly *r2, | |||
poly *r3, | |||
@@ -33,6 +30,8 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, | |||
uint8_t nonce2, | |||
uint8_t nonce3); | |||
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r); | |||
@@ -40,7 +39,6 @@ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, cons | |||
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b); | |||
@@ -3,8 +3,79 @@ | |||
#include "params.h" | |||
#include "poly.h" | |||
#include "polyvec.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
static void poly_compress11(uint8_t r[352 + 2], const poly *restrict a) { | |||
unsigned int i; | |||
__m256i f0, f1, f2; | |||
__m128i t0, t1; | |||
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XV / 16]); | |||
const __m256i v8 = _mm256_slli_epi16(v, 3); | |||
const __m256i off = _mm256_set1_epi16(36); | |||
const __m256i shift1 = _mm256_set1_epi16(1 << 13); | |||
const __m256i mask = _mm256_set1_epi16(2047); | |||
const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1); | |||
const __m256i sllvdidx = _mm256_set1_epi64x(10); | |||
const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10); | |||
const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, | |||
-1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f0 = _mm256_load_si256(&a->vec[i]); | |||
f1 = _mm256_mullo_epi16(f0, v8); | |||
f2 = _mm256_add_epi16(f0, off); | |||
f0 = _mm256_slli_epi16(f0, 3); | |||
f0 = _mm256_mulhi_epi16(f0, v); | |||
f2 = _mm256_sub_epi16(f1, f2); | |||
f1 = _mm256_andnot_si256(f1, f2); | |||
f1 = _mm256_srli_epi16(f1, 15); | |||
f0 = _mm256_sub_epi16(f0, f1); | |||
f0 = _mm256_mulhrs_epi16(f0, shift1); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f0 = _mm256_madd_epi16(f0, shift2); | |||
f0 = _mm256_sllv_epi32(f0, sllvdidx); | |||
f1 = _mm256_bsrli_epi128(f0, 8); | |||
f0 = _mm256_srlv_epi64(f0, srlvqidx); | |||
f1 = _mm256_slli_epi64(f1, 34); | |||
f0 = _mm256_add_epi64(f0, f1); | |||
f0 = _mm256_shuffle_epi8(f0, shufbidx); | |||
t0 = _mm256_castsi256_si128(f0); | |||
t1 = _mm256_extracti128_si256(f0, 1); | |||
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); | |||
_mm_storeu_si128((__m128i *)&r[22 * i + 0], t0); | |||
_mm_storel_epi64((__m128i *)&r[22 * i + 16], t1); | |||
} | |||
} | |||
static void poly_decompress11(poly *restrict r, const uint8_t a[352 + 10]) { | |||
unsigned int i; | |||
__m256i f; | |||
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]); | |||
const __m256i shufbidx = _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, | |||
8, 7, 6, 5, 5, 4, 4, 3, | |||
10, 9, 9, 8, 7, 6, 6, 5, | |||
5, 4, 3, 2, 2, 1, 1, 0); | |||
const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0); | |||
const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0); | |||
const __m256i shift = _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32); | |||
const __m256i mask = _mm256_set1_epi16(32752); | |||
for (i = 0; i < KYBER_N / 16; i++) { | |||
f = _mm256_loadu_si256((__m256i *)&a[22 * i]); | |||
f = _mm256_permute4x64_epi64(f, 0x94); | |||
f = _mm256_shuffle_epi8(f, shufbidx); | |||
f = _mm256_srlv_epi32(f, srlvdidx); | |||
f = _mm256_srlv_epi64(f, srlvqidx); | |||
f = _mm256_mullo_epi16(f, shift); | |||
f = _mm256_srli_epi16(f, 1); | |||
f = _mm256_and_si256(f, mask); | |||
f = _mm256_mulhrs_epi16(f, q); | |||
_mm256_store_si256(&r->vec[i], f); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_compress | |||
* | |||
@@ -14,33 +85,11 @@ | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], | |||
polyvec *restrict a) { | |||
size_t i, j, k; | |||
PQCLEAN_KYBER1024_AVX2_polyvec_csubq(a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { | |||
size_t i; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
} | |||
r[ 0] = (t[0] >> 0); | |||
r[ 1] = (t[0] >> 8) | (t[1] << 3); | |||
r[ 2] = (t[1] >> 5) | (t[2] << 6); | |||
r[ 3] = (t[2] >> 2); | |||
r[ 4] = (t[2] >> 10) | (t[3] << 1); | |||
r[ 5] = (t[3] >> 7) | (t[4] << 4); | |||
r[ 6] = (t[4] >> 4) | (t[5] << 7); | |||
r[ 7] = (t[5] >> 1); | |||
r[ 8] = (t[5] >> 9) | (t[6] << 2); | |||
r[ 9] = (t[6] >> 6) | (t[7] << 5); | |||
r[10] = (t[7] >> 3); | |||
r += 11; | |||
} | |||
poly_compress11(&r[352 * i], &a->vec[i]); | |||
} | |||
} | |||
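As a size check on the new layout: poly_compress11 turns 16 coefficients into 16*11 = 176 bits = 22 bytes per loop iteration, 16 iterations give 352 bytes per polynomial, and KYBER_K = 4 polynomials give KYBER_POLYVECCOMPRESSEDBYTES = 1408 bytes; the small +2 and +10/+12 slack in the array bounds appears to absorb the over-wide 16/32-byte vector stores and loads at the tail of each polynomial. A compile-time sketch of this bookkeeping (C11 _Static_assert, and assuming the usual KYBER_CIPHERTEXTBYTES definition in params.h):

/* Editor's sketch: size bookkeeping for the compressed vector and ciphertext. */
_Static_assert(KYBER_POLYVECCOMPRESSEDBYTES == KYBER_K * 352,
               "4 * 352 = 1408 bytes for the compressed polynomial vector");
_Static_assert(KYBER_CIPHERTEXTBYTES == KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES,
               "ciphertext = compressed vector (1408) + compressed poly (160) = 1568 bytes");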
@@ -50,31 +99,15 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBY | |||
* Description: De-serialize and decompress vector of polynomials; | |||
* approximate inverse of PQCLEAN_KYBER1024_AVX2_polyvec_compress | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *restrict r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
size_t i, j, k; | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { | |||
size_t i; | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); | |||
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); | |||
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); | |||
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); | |||
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); | |||
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); | |||
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); | |||
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); | |||
a += 11; | |||
for (k = 0; k < 8; k++) { | |||
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; | |||
} | |||
} | |||
poly_decompress11(&r->vec[i], &a[352 * i]); | |||
} | |||
} | |||
@@ -100,7 +133,7 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyv | |||
* Description: De-serialize vector of polynomials; | |||
* inverse of PQCLEAN_KYBER1024_AVX2_polyvec_tobytes | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
* (of length KYBER_POLYVECBYTES) | |||
**************************************************/ | |||
@@ -141,29 +174,34 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r) { | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* Description: Multiply elements in a and b in NTT domain, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { | |||
size_t i; | |||
poly tmp; | |||
PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); | |||
for (i = 1; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); | |||
PQCLEAN_KYBER1024_AVX2_poly_add(r, r, &tmp); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* of each element of a vector of polynomials; | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - polyvec *r: pointer to input/output vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) { | |||
size_t i; | |||
@@ -172,23 +210,6 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) { | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) { | |||
size_t i; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_AVX2_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_add | |||
* | |||
@@ -8,9 +8,8 @@ typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
@@ -18,12 +17,9 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_ | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
@@ -1,10 +1,9 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_REDUCE_H | |||
#define PQCLEAN_KYBER1024_AVX2_REDUCE_H | |||
#include "consts.h" | |||
#include <stdint.h> | |||
#include "params.h" | |||
#include <immintrin.h> | |||
int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
int16_t PQCLEAN_KYBER1024_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
void PQCLEAN_KYBER1024_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); | |||
#endif |
@@ -4,311 +4,68 @@ | |||
#include "rejsample.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#include <string.h> | |||
//#define BMI | |||
static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { | |||
{-1, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 2, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, -1, -1, -1, -1, -1, -1}, | |||
{ 4, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 4, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, -1, -1, -1, -1, -1}, | |||
{ 6, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, -1, -1, -1, -1, -1}, | |||
{ 4, 6, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 6, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, -1, -1, -1, -1}, | |||
{ 8, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, -1, -1, -1, -1, -1}, | |||
{ 4, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, -1, -1, -1, -1}, | |||
{ 6, 8, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, -1, -1, -1, -1}, | |||
{ 4, 6, 8, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 8, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, -1, -1, -1}, | |||
{10, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, -1, -1, -1, -1, -1}, | |||
{ 4, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, -1, -1, -1, -1}, | |||
{ 6, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, -1, -1, -1, -1}, | |||
{ 4, 6, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, -1, -1, -1}, | |||
{ 8, 10, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, -1, -1, -1, -1}, | |||
{ 4, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, -1, -1, -1}, | |||
{ 6, 8, 10, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, -1, -1, -1}, | |||
{ 4, 6, 8, 10, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 10, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, -1, -1}, | |||
{12, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, -1, -1, -1, -1, -1}, | |||
{ 4, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, -1, -1, -1, -1}, | |||
{ 6, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, -1, -1, -1, -1}, | |||
{ 4, 6, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, -1, -1, -1}, | |||
{ 8, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, -1, -1, -1, -1}, | |||
{ 4, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, -1, -1, -1}, | |||
{ 6, 8, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, -1, -1, -1}, | |||
{ 4, 6, 8, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, -1, -1}, | |||
{10, 12, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, -1, -1, -1, -1}, | |||
{ 4, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, -1, -1, -1}, | |||
{ 6, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, -1, -1, -1}, | |||
{ 4, 6, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, -1, -1}, | |||
{ 8, 10, 12, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, -1, -1, -1}, | |||
{ 4, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, -1, -1}, | |||
{ 6, 8, 10, 12, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, -1, -1}, | |||
{ 4, 6, 8, 10, 12, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 12, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, -1}, | |||
{14, -1, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 2, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 14, -1, -1, -1, -1, -1}, | |||
{ 4, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 4, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 14, -1, -1, -1, -1}, | |||
{ 6, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 14, -1, -1, -1, -1}, | |||
{ 4, 6, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 6, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 14, -1, -1, -1}, | |||
{ 8, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 14, -1, -1, -1, -1}, | |||
{ 4, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 14, -1, -1, -1}, | |||
{ 6, 8, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 14, -1, -1, -1}, | |||
{ 4, 6, 8, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 8, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 14, -1, -1}, | |||
{10, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 14, -1, -1, -1, -1}, | |||
{ 4, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 14, -1, -1, -1}, | |||
{ 6, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 14, -1, -1, -1}, | |||
{ 4, 6, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 14, -1, -1}, | |||
{ 8, 10, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 14, -1, -1, -1}, | |||
{ 4, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 14, -1, -1}, | |||
{ 6, 8, 10, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 14, -1, -1}, | |||
{ 4, 6, 8, 10, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 10, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 14, -1}, | |||
{12, 14, -1, -1, -1, -1, -1, -1}, | |||
{ 0, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 2, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 2, 12, 14, -1, -1, -1, -1}, | |||
{ 4, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 4, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 4, 12, 14, -1, -1, -1}, | |||
{ 6, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 6, 12, 14, -1, -1, -1}, | |||
{ 4, 6, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 6, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 6, 12, 14, -1, -1}, | |||
{ 8, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 8, 12, 14, -1, -1, -1}, | |||
{ 4, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 8, 12, 14, -1, -1}, | |||
{ 6, 8, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 8, 12, 14, -1, -1}, | |||
{ 4, 6, 8, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 8, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 8, 12, 14, -1}, | |||
{10, 12, 14, -1, -1, -1, -1, -1}, | |||
{ 0, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 2, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 2, 10, 12, 14, -1, -1, -1}, | |||
{ 4, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 4, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 4, 10, 12, 14, -1, -1}, | |||
{ 6, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 6, 10, 12, 14, -1, -1}, | |||
{ 4, 6, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 6, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 6, 10, 12, 14, -1}, | |||
{ 8, 10, 12, 14, -1, -1, -1, -1}, | |||
{ 0, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 2, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 2, 8, 10, 12, 14, -1, -1}, | |||
{ 4, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 4, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 4, 8, 10, 12, 14, -1}, | |||
{ 6, 8, 10, 12, 14, -1, -1, -1}, | |||
{ 0, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 2, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 2, 6, 8, 10, 12, 14, -1}, | |||
{ 4, 6, 8, 10, 12, 14, -1, -1}, | |||
{ 0, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 2, 4, 6, 8, 10, 12, 14, -1}, | |||
{ 0, 2, 4, 6, 8, 10, 12, 14} | |||
} | |||
}; | |||
#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) | |||
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) | |||
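/* Illustrative note, not part of the diff: AVX2 has no unsigned 16-bit ">="
 * compare, so the macros above use max(a,b) == a  <=>  a >= b.  A scalar
 * model of a single lane: */
static inline uint16_t cmpge_u16_model(uint16_t a, uint16_t b) {
    uint16_t m = (a > b) ? a : b;                  /* _mm_max_epu16, one lane */
    return (uint16_t)((m == a) ? 0xFFFF : 0x0000); /* _mm_cmpeq_epi16 result  */
}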
#define REJ_UNIFORM_BUFLEN 672 | |||
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, | |||
const uint8_t *restrict buf) { | |||
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
uint16_t val0, val1; | |||
uint32_t good; | |||
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); | |||
uint64_t idx0, idx1, idx2, idx3; | |||
const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]); | |||
const __m256i ones = _mm256_set1_epi8(1); | |||
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XQ]); | |||
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XV]); | |||
const __m256i mask = _mm256_set1_epi16(0xFFF); | |||
const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, | |||
9, 8, 8, 7, 6, 5, 5, 4, | |||
11, 10, 10, 9, 8, 7, 7, 6, | |||
5, 4, 4, 3, 2, 1, 1, 0); | |||
__m256i f0, f1, g0, g1, g2, g3; | |||
__m128i f, t, pilo, pihi; | |||
ctr = 0; | |||
for (pos = 0; pos < 2 * KYBER_N; pos += 64) { | |||
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); | |||
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); | |||
g0 = _mm256_cmpge_epu16(bound, f0); | |||
g1 = _mm256_cmpge_epu16(bound, f1); | |||
ctr = pos = 0; | |||
while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { | |||
f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); | |||
f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); | |||
f0 = _mm256_permute4x64_epi64(f0, 0x94); | |||
f1 = _mm256_permute4x64_epi64(f1, 0x94); | |||
f0 = _mm256_shuffle_epi8(f0, idx8); | |||
f1 = _mm256_shuffle_epi8(f1, idx8); | |||
g0 = _mm256_srli_epi16(f0, 4); | |||
g1 = _mm256_srli_epi16(f1, 4); | |||
f0 = _mm256_blend_epi16(f0, g0, 0xAA); | |||
f1 = _mm256_blend_epi16(f1, g1, 0xAA); | |||
f0 = _mm256_and_si256(f0, mask); | |||
f1 = _mm256_and_si256(f1, mask); | |||
pos += 48; | |||
g0 = _mm256_cmpgt_epi16(bound, f0); | |||
g1 = _mm256_cmpgt_epi16(bound, f1); | |||
g0 = _mm256_packs_epi16(g0, g1); | |||
good = _mm256_movemask_epi8(g0); | |||
g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); | |||
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); | |||
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); | |||
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); | |||
//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); | |||
//g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); | |||
/* Barrett reduction of (still unsigned) values */ | |||
g2 = _mm256_mulhi_epu16(f0, v); | |||
g3 = _mm256_mulhi_epu16(f1, v); | |||
g2 = _mm256_srli_epi16(g2, 10); | |||
g3 = _mm256_srli_epi16(g3, 10); | |||
g2 = _mm256_mullo_epi16(g2, kyberq); | |||
g3 = _mm256_mullo_epi16(g3, kyberq); | |||
f0 = _mm256_sub_epi16(f0, g2); | |||
f1 = _mm256_sub_epi16(f1, g3); | |||
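/* Illustrative scalar model of the Barrett step above, not part of the diff.
 * It assumes the qdata constant v is the usual factor 20159 ~ 2^26/KYBER_Q,
 * the same constant used by the scalar tail further below:
 *     t  = ((uint32_t)x * 20159) >> 26;   // mulhi_epu16 followed by srli 10
 *     x -= t * KYBER_Q;                   // mullo_epi16 and sub
 * e.g. x = 50000 gives t = 15 and 50000 - 15*3329 = 65 = 50000 mod 3329. */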
idx0 = _pdep_u64(good >> 0, 0x0101010101010101); | |||
idx1 = _pdep_u64(good >> 8, 0x0101010101010101); | |||
idx2 = _pdep_u64(good >> 16, 0x0101010101010101); | |||
idx3 = _pdep_u64(good >> 24, 0x0101010101010101); | |||
idx0 = (idx0 << 8) - idx0; | |||
idx0 = _pext_u64(0x0E0C0A0806040200, idx0); | |||
idx1 = (idx1 << 8) - idx1; | |||
idx1 = _pext_u64(0x0E0C0A0806040200, idx1); | |||
idx2 = (idx2 << 8) - idx2; | |||
idx2 = _pext_u64(0x0E0C0A0806040200, idx2); | |||
idx3 = (idx3 << 8) - idx3; | |||
idx3 = _pext_u64(0x0E0C0A0806040200, idx3); | |||
g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); | |||
g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); | |||
g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); | |||
g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); | |||
g2 = _mm256_add_epi8(g0, ones); | |||
g3 = _mm256_add_epi8(g1, ones); | |||
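/* Illustrative walk-through of the BMI2 index construction above, not part of
 * the diff, for one 8-bit slice of `good`:
 *   1. _pdep_u64(good, 0x0101...01) places bit i of `good` into the lowest
 *      bit of byte i.
 *   2. (x << 8) - x multiplies by 255, turning each such bit into a full
 *      0xFF byte mask (the bytes never carry into each other).
 *   3. _pext_u64(0x0E0C0A0806040200, mask) compacts the byte offsets
 *      0, 2, 4, ..., 14 of the accepted 16-bit lanes into the low bytes.
 * The result is the low-byte half of a vpshufb control; adding `ones`
 * (g2/g3 above) supplies the matching high-byte offsets. */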
@@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, | |||
ctr += _mm_popcnt_u32((good >> 24) & 0xFF); | |||
} | |||
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { | |||
f = _mm_load_si128((__m128i *)&buf[pos]); | |||
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); | |||
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { | |||
f = _mm_loadu_si128((__m128i *)&buf[pos]); | |||
f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); | |||
t = _mm_srli_epi16(f, 4); | |||
f = _mm_blend_epi16(f, t, 0xAA); | |||
f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); | |||
pos += 12; | |||
t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); | |||
good = _mm_movemask_epi8(t); | |||
good = _pext_u32(good, 0x5555); | |||
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); | |||
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); | |||
pilo = _mm_unpacklo_epi8(pilo, pihi); | |||
/* Barrett reduction */ | |||
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); | |||
t = _mm_srli_epi16(t, 10); | |||
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); | |||
f = _mm_sub_epi16(f, t); | |||
good &= 0x5555; | |||
idx0 = _pdep_u64(good, 0x1111111111111111); | |||
idx0 = (idx0 << 8) - idx0; | |||
idx0 = _pext_u64(0x0E0C0A0806040200, idx0); | |||
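/* Illustration, not part of the diff: here `good` comes from vpmovmskb, i.e.
 * two identical bits per 16-bit lane.  After `good &= 0x5555` only even bit
 * positions can be set, so depositing with mask 0x1111... still lands lane
 * i's bit in the low bit of byte i (the odd source bits are all zero); the
 * byte-mask and _pext compaction then work exactly as in the 256-bit loop. */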
pilo = _mm_cvtsi64_si128(idx0); | |||
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); | |||
pilo = _mm_unpacklo_epi8(pilo, pihi); | |||
f = _mm_shuffle_epi8(f, pilo); | |||
_mm_storeu_si128((__m128i *)&r[ctr], f); | |||
ctr += _mm_popcnt_u32(good); | |||
pos += 16; | |||
} | |||
while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { | |||
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; | |||
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); | |||
pos += 3; | |||
if (val < 19 * KYBER_Q) { | |||
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; | |||
r[ctr++] = val; | |||
if (val0 < KYBER_Q) { | |||
r[ctr++] = val0; | |||
} | |||
if (val1 < KYBER_Q && ctr < KYBER_N) { | |||
r[ctr++] = val1; | |||
} | |||
} | |||
@@ -1,9 +1,12 @@ | |||
#ifndef PQCLEAN_KYBER1024_AVX2_REJSAMPLE_H | |||
#define PQCLEAN_KYBER1024_AVX2_REJSAMPLE_H | |||
#include "params.h" | |||
#include "symmetric.h" | |||
#include <stdint.h> | |||
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, | |||
const unsigned char *buf); | |||
#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) | |||
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); | |||
#endif |
@@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 | |||
#csubq | |||
csubq 5,13 | |||
csubq 6,14 | |||
csubq 7,15 | |||
csubq 8,1 | |||
csubq 6,13 | |||
csubq 7,13 | |||
csubq 8,13 | |||
csubq 9,13 | |||
csubq 10,14 | |||
csubq 11,15 | |||
csubq 12,1 | |||
csubq 10,13 | |||
csubq 11,13 | |||
csubq 12,13 | |||
#bitpack | |||
vpsllw $12,%ymm6,%ymm4 | |||
@@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle2 r0,r1,r2,r3 | |||
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 | |||
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 | |||
vpsllq $32,%ymm\r1,%ymm12 | |||
vpsrlq $32,%ymm\r0,%ymm13 | |||
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 | |||
#vpsllq $32,%ymm\r1,%ymm\r2 | |||
vmovsldup %ymm\r1,%ymm\r2 | |||
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrlq $32,%ymm\r0,%ymm\r0 | |||
#vmovshdup %ymm\r0,%ymm\r0 | |||
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm | |||
.macro shuffle1 r0,r1,r2,r3 | |||
vpslld $16,%ymm\r1,%ymm12 | |||
vpsrld $16,%ymm\r0,%ymm13 | |||
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 | |||
vpslld $16,%ymm\r1,%ymm\r2 | |||
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 | |||
vpsrld $16,%ymm\r0,%ymm\r0 | |||
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 | |||
.endm |
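# Note (illustration, not part of the diff): the rewritten shuffle2/shuffle1
# macros produce the same interleaving as before but no longer clobber the
# ymm12/ymm13 temporaries: vmovsldup duplicates the even 32-bit lanes of \r1
# (replacing the old vpsllq), and \r0 is shifted in place only after its
# original value has been consumed by the first blend.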
@@ -9,12 +9,10 @@ | |||
* | |||
* Description: Absorb step of the SHAKE128 specialized for the Kyber context. | |||
* | |||
* Arguments: - xof_state *state: pointer to (uninitialized) output | |||
* Keccak state | |||
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input | |||
* to be absorbed into state | |||
* - uint8_t i additional byte of input | |||
* - uint8_t j additional byte of input | |||
* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state | |||
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state | |||
* - uint8_t i: additional byte of input | |||
* - uint8_t j: additional byte of input | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, | |||
const uint8_t seed[KYBER_SYMBYTES], | |||
@@ -26,8 +24,8 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extseed[i] = seed[i]; | |||
} | |||
extseed[i++] = x; | |||
extseed[i] = y; | |||
extseed[KYBER_SYMBYTES + 0] = x; | |||
extseed[KYBER_SYMBYTES + 1] = y; | |||
shake128_absorb(state, extseed, sizeof(extseed)); | |||
} | |||
@@ -38,23 +36,19 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, | |||
* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input | |||
* and then generates outlen bytes of SHAKE256 output | |||
* | |||
* Arguments: - uint8_t *out: pointer to output | |||
* - size_t outlen: number of requested output bytes | |||
* - const uint8_t *key: pointer to the key | |||
* (of length KYBER_SYMBYTES) | |||
* - uint8_t nonce: single-byte nonce (public PRF input) | |||
* Arguments: - uint8_t *out: pointer to output | |||
* - size_t outlen: number of requested output bytes | |||
* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) | |||
* - uint8_t nonce: single-byte nonce (public PRF input) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[KYBER_SYMBYTES], | |||
uint8_t nonce) { | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { | |||
unsigned int i; | |||
uint8_t extkey[KYBER_SYMBYTES + 1]; | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extkey[i] = key[i]; | |||
} | |||
extkey[i] = nonce; | |||
extkey[KYBER_SYMBYTES] = nonce; | |||
shake256(out, outlen, extkey, sizeof(extkey)); | |||
} |
@@ -15,21 +15,16 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *s, | |||
uint8_t x, | |||
uint8_t y); | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[KYBER_SYMBYTES], | |||
uint8_t nonce); | |||
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); | |||
#define XOF_BLOCKBYTES SHAKE128_RATE | |||
#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) shake128_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) \ | |||
PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) | |||
@@ -8,31 +8,31 @@ | |||
* | |||
* Description: Compare two arrays for equality in constant time. | |||
* | |||
* Arguments: const unsigned char *a: pointer to first byte array | |||
* const unsigned char *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* Arguments: const uint8_t *a: pointer to first byte array | |||
* const uint8_t *b: pointer to second byte array | |||
* size_t len: length of the byte arrays | |||
* | |||
* Returns 0 if the byte arrays are equal, 1 otherwise | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { | |||
size_t pos; | |||
size_t i; | |||
uint64_t r; | |||
__m256i avec, bvec, cvec; | |||
__m256i f, g, h; | |||
cvec = _mm256_setzero_si256(); | |||
for (pos = 0; pos + 32 <= len; pos += 32) { | |||
avec = _mm256_loadu_si256((__m256i *)&a[pos]); | |||
bvec = _mm256_loadu_si256((__m256i *)&b[pos]); | |||
avec = _mm256_xor_si256(avec, bvec); | |||
cvec = _mm256_or_si256(cvec, avec); | |||
h = _mm256_setzero_si256(); | |||
for (i = 0; i < len / 32; i++) { | |||
f = _mm256_loadu_si256((__m256i *)&a[32 * i]); | |||
g = _mm256_loadu_si256((__m256i *)&b[32 * i]); | |||
f = _mm256_xor_si256(f, g); | |||
h = _mm256_or_si256(h, f); | |||
} | |||
r = 1 - _mm256_testz_si256(cvec, cvec); | |||
r = 1 - _mm256_testz_si256(h, h); | |||
if (pos < len) { | |||
avec = _mm256_loadu_si256((__m256i *)&a[pos]); | |||
bvec = _mm256_loadu_si256((__m256i *)&b[pos]); | |||
cvec = _mm256_cmpeq_epi8(avec, bvec); | |||
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); | |||
a += 32 * i; | |||
b += 32 * i; | |||
len -= 32 * i; | |||
for (i = 0; i < len; i++) { | |||
r |= a[i] ^ b[i]; | |||
} | |||
r = (-r) >> 63; | |||
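/* Illustration, not part of the diff: r accumulates the OR of all byte
 * differences, so r == 0 iff a == b.  Since r < 256 here, (-r) >> 63 on the
 * uint64_t maps every nonzero r to 1 and leaves 0 unchanged, giving the
 * branch-free return value. */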
@@ -47,29 +47,27 @@ int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len | |||
* assumes two's complement representation of negative integers. | |||
* Runs in constant time. | |||
* | |||
* Arguments: unsigned char *r: pointer to output byte array | |||
* Arguments: unsigned char *r: pointer to output byte array | |||
* const unsigned char *x: pointer to input byte array | |||
* size_t len: Number of bytes to be copied | |||
* unsigned char b: Condition bit; has to be in {0,1} | |||
* size_t len: Number of bytes to be copied | |||
* unsigned char b: Condition bit; has to be in {0,1} | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { | |||
size_t pos; | |||
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { | |||
size_t i; | |||
__m256i xvec, rvec, bvec; | |||
b = -b; | |||
bvec = _mm256_set1_epi8(b); | |||
for (pos = 0; pos + 32 <= len; pos += 32) { | |||
rvec = _mm256_loadu_si256((__m256i *)&r[pos]); | |||
xvec = _mm256_loadu_si256((__m256i *)&x[pos]); | |||
xvec = _mm256_xor_si256(xvec, rvec); | |||
xvec = _mm256_and_si256(xvec, bvec); | |||
rvec = _mm256_xor_si256(rvec, xvec); | |||
_mm256_storeu_si256((__m256i *)&r[pos], rvec); | |||
bvec = _mm256_set1_epi64x(-(uint64_t)b); | |||
for (i = 0; i < len / 32; i++) { | |||
rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); | |||
xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); | |||
rvec = _mm256_blendv_epi8(rvec, xvec, bvec); | |||
_mm256_storeu_si256((__m256i *)&r[32 * i], rvec); | |||
} | |||
while (pos < len) { | |||
r[pos] ^= b & (x[pos] ^ r[pos]); | |||
pos += 1; | |||
r += 32 * i; | |||
x += 32 * i; | |||
len -= 32 * i; | |||
for (i = 0; i < len; i++) { | |||
r[i] ^= -b & (x[i] ^ r[i]); | |||
} | |||
} |
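/* Illustration, not part of the diff: with b in {0,1}, -(uint64_t)b is either
 * 0 or all-ones, so vpblendvb (which keys on the top bit of every byte)
 * copies x into r exactly when b == 1.  The scalar tail uses the same mask
 * idea: r[i] ^= -b & (x[i] ^ r[i]) is a branch-free conditional copy. */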
@@ -5,7 +5,7 @@ | |||
/************************************************* | |||
* Name: load32_littleendian | |||
* | |||
* Description: load bytes into a 32-bit integer | |||
* Description: load 4 bytes into a 32-bit integer | |||
* in little-endian order | |||
* | |||
* Arguments: - const uint8_t *x: pointer to input byte array | |||
@@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_cbd | |||
* Name: load24_littleendian | |||
* | |||
* Description: load 3 bytes into a 32-bit integer | |||
* in little-endian order. | |||
* This function is only needed for Kyber-512 | |||
* | |||
* Arguments: - const uint8_t *x: pointer to input byte array | |||
* | |||
* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) | |||
**************************************************/ | |||
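/* Sketch consistent with the description above (the function body is elided
 * in this hunk); illustration only, not part of the diff: */
static uint32_t load24_littleendian_sketch(const uint8_t x[3]) {
    uint32_t r;
    r  = (uint32_t)x[0];
    r |= (uint32_t)x[1] << 8;
    r |= (uint32_t)x[2] << 16;   /* most significant byte stays zero */
    return r;
}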
/************************************************* | |||
* Name: cbd2 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* a centered binomial distribution with parameter eta=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { | |||
static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { | |||
unsigned int i, j; | |||
uint32_t t, d; | |||
int16_t a, b; | |||
@@ -48,3 +61,23 @@ void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N | |||
} | |||
} | |||
} | |||
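/* Sketch of the eta = 2 sampler consistent with the description above (the
 * loop body is elided in this hunk); illustration only, not part of the diff: */
static void cbd2_sketch(poly *r, const uint8_t buf[2 * KYBER_N / 4]) {
    unsigned int i, j;
    uint32_t t, d;
    int16_t a, b;
    for (i = 0; i < KYBER_N / 8; i++) {
        t = load32_littleendian(buf + 4 * i);
        d = t & 0x55555555;
        d += (t >> 1) & 0x55555555; /* each 2-bit field now holds a bit count */
        for (j = 0; j < 8; j++) {
            a = (d >> (4 * j + 0)) & 0x3;
            b = (d >> (4 * j + 2)) & 0x3;
            r->coeffs[8 * i + j] = a - b; /* centered binomial in [-2, 2] */
        }
    }
}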
/************************************************* | |||
* Name: cbd3 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter eta=3. | |||
* This function is only needed for Kyber-512 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { | |||
cbd2(r, buf); | |||
} | |||
void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { | |||
cbd2(r, buf); | |||
} |
@@ -4,6 +4,8 @@ | |||
#include "poly.h" | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); | |||
#endif |
@@ -15,8 +15,8 @@ | |||
* serialized vector of polynomials pk | |||
* and the public seed used to generate the matrix A. | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* Arguments: uint8_t *r: pointer to the output serialized public key | |||
* polyvec *pk: pointer to the input public-key polyvec | |||
* const uint8_t *seed: pointer to the input public seed | |||
**************************************************/ | |||
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
@@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], | |||
* Description: De-serialize public key from a byte array; | |||
* approximate inverse of pack_pk | |||
* | |||
* Arguments: - polyvec *pk: pointer to output public-key | |||
* polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate | |||
* matrix A | |||
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector | |||
* - uint8_t *seed: pointer to output seed to generate matrix A | |||
* - const uint8_t *packedpk: pointer to input serialized public key | |||
**************************************************/ | |||
static void unpack_pk(polyvec *pk, | |||
@@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, | |||
* | |||
* Description: Serialize the secret key | |||
* | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* Arguments: - uint8_t *r: pointer to output serialized secret key | |||
* - polyvec *sk: pointer to input vector of polynomials (secret key) | |||
**************************************************/ | |||
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
@@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { | |||
/************************************************* | |||
* Name: unpack_sk | |||
* | |||
* Description: De-serialize the secret key; | |||
* inverse of pack_sk | |||
* Description: De-serialize the secret key; inverse of pack_sk | |||
* | |||
* Arguments: - polyvec *sk: pointer to output vector of | |||
* polynomials (secret key) | |||
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) | |||
* - const uint8_t *packedsk: pointer to input serialized secret key | |||
**************************************************/ | |||
static void unpack_sk(polyvec *sk, | |||
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(sk, packedsk); | |||
} | |||
@@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, | |||
* and the compressed and serialized polynomial v | |||
* | |||
* Arguments: uint8_t *r: pointer to the output serialized ciphertext | |||
* poly *pk: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
* poly *pk: pointer to the input vector of polynomials b | |||
* poly *v: pointer to the input polynomial v | |||
**************************************************/ | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
polyvec *b, | |||
poly *v) { | |||
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_compress(r, b); | |||
PQCLEAN_KYBER1024_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); | |||
} | |||
@@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], | |||
* Description: De-serialize and decompress ciphertext from a byte array; | |||
* approximate inverse of pack_ciphertext | |||
* | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* Arguments: - polyvec *b: pointer to the output vector of polynomials b | |||
* - poly *v: pointer to the output polynomial v | |||
* - const uint8_t *c: pointer to the input serialized ciphertext | |||
**************************************************/ | |||
static void unpack_ciphertext(polyvec *b, | |||
poly *v, | |||
const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(b, c); | |||
PQCLEAN_KYBER1024_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); | |||
} | |||
@@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, | |||
* Description: Run rejection sampling on uniform random bytes to generate | |||
* uniform random integers mod q | |||
* | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers | |||
* (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer | |||
* (assumed to be uniform random bytes) | |||
* Arguments: - int16_t *r: pointer to output buffer | |||
* - unsigned int len: requested number of 16-bit integers (uniform mod q) | |||
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) | |||
* - unsigned int buflen: length of input buffer in bytes | |||
* | |||
* Returns number of sampled 16-bit integers (at most len) | |||
@@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, | |||
const uint8_t *buf, | |||
unsigned int buflen) { | |||
unsigned int ctr, pos; | |||
uint16_t val; | |||
uint16_t val0, val1; | |||
ctr = pos = 0; | |||
while (ctr < len && pos + 2 <= buflen) { | |||
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); | |||
pos += 2; | |||
while (ctr < len && pos + 3 <= buflen) { | |||
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; | |||
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; | |||
pos += 3; | |||
if (val < 19 * KYBER_Q) { | |||
val -= (val >> 12) * KYBER_Q; // Barrett reduction | |||
r[ctr++] = (int16_t)val; | |||
if (val0 < KYBER_Q) { | |||
r[ctr++] = val0; | |||
} | |||
if (ctr < len && val1 < KYBER_Q) { | |||
r[ctr++] = val1; | |||
} | |||
} | |||
@@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, | |||
* uniformly random. Performs rejection sampling on output of | |||
* a XOF | |||
* | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* Arguments: - polyvec *a: pointer to output matrix A | |||
* - const uint8_t *seed: pointer to input seed | |||
* - int transposed: boolean deciding whether A or A^T | |||
* is generated | |||
* - int transposed: boolean deciding whether A or A^T is generated | |||
**************************************************/ | |||
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ | |||
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) | |||
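/* Worked out, assuming XOF_BLOCKBYTES = SHAKE128_RATE = 168 (illustration):
 * 12*KYBER_N/8 = 384 bytes would suffice without rejections; scaling by
 * 2^12/KYBER_Q = 4096/3329 gives 472 expected bytes, and
 * (472 + 168)/168 = 3 blocks, i.e. a 504-byte buffer (buf below reserves two
 * extra bytes for the 3-byte leftover handling). */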
// Not static for benchmarking | |||
void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { | |||
unsigned int ctr, i, j; | |||
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; | |||
unsigned int ctr, i, j, k; | |||
unsigned int buflen, off; | |||
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; | |||
xof_state state; | |||
for (i = 0; i < KYBER_K; i++) { | |||
@@ -182,12 +173,17 @@ void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYM | |||
} | |||
xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); | |||
buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; | |||
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); | |||
while (ctr < KYBER_N) { | |||
xof_squeezeblocks(buf, 1, &state); | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, | |||
XOF_BLOCKBYTES); | |||
off = buflen % 3; | |||
for (k = 0; k < off; k++) { | |||
buf[k] = buf[buflen - off + k]; | |||
} | |||
xof_squeezeblocks(buf + off, 1, &state); | |||
buflen = off + XOF_BLOCKBYTES; | |||
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); | |||
} | |||
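/* Illustration, not part of the diff: rej_uniform consumes bytes in groups of
 * three, so up to buflen % 3 = 2 bytes can be left unconsumed after a pass.
 * They are moved to the front and prepended to the next squeezed block, which
 * is why buf is declared two bytes larger than GEN_MATRIX_NBLOCKS blocks. */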
xof_ctx_release(&state); | |||
} | |||
@@ -220,10 +216,10 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT | |||
gen_a(a, publicseed); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); | |||
} | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&skpv); | |||
@@ -231,7 +227,7 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); | |||
PQCLEAN_KYBER1024_CLEAN_poly_tomont(&pkpv.vec[i]); | |||
} | |||
@@ -248,16 +244,15 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT | |||
* Description: Encryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins | |||
* used as seed (of length KYBER_SYMBYTES) | |||
* to deterministically generate all | |||
* randomness | |||
* Arguments: - uint8_t *c: pointer to output ciphertext | |||
* (of length KYBER_INDCPA_BYTES bytes) | |||
* - const uint8_t *m: pointer to input message | |||
* (of length KYBER_INDCPA_MSGBYTES bytes) | |||
* - const uint8_t *pk: pointer to input public key | |||
* (of length KYBER_INDCPA_PUBLICKEYBYTES) | |||
* - const uint8_t *coins: pointer to input random coins used as seed | |||
* (of length KYBER_SYMBYTES) to deterministically | |||
* generate all randomness | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
@@ -266,7 +261,7 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
unsigned int i; | |||
uint8_t seed[KYBER_SYMBYTES]; | |||
uint8_t nonce = 0; | |||
polyvec sp, pkpv, ep, at[KYBER_K], bp; | |||
polyvec sp, pkpv, ep, at[KYBER_K], b; | |||
poly v, k, epp; | |||
unpack_pk(&pkpv, seed, pk); | |||
@@ -274,32 +269,32 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
gen_at(at, seed); | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); | |||
} | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&epp, coins, nonce++); | |||
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&sp); | |||
// matrix-vector multiplication | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); | |||
} | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&bp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&b); | |||
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&v); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_add(&bp, &bp, &ep); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_add(&b, &b, &ep); | |||
PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &epp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &k); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(&bp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(&b); | |||
PQCLEAN_KYBER1024_CLEAN_poly_reduce(&v); | |||
pack_ciphertext(c, &bp, &v); | |||
pack_ciphertext(c, &b, &v); | |||
} | |||
/************************************************* | |||
@@ -308,24 +303,24 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], | |||
* Description: Decryption function of the CPA-secure | |||
* public-key encryption scheme underlying Kyber. | |||
* | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* Arguments: - uint8_t *m: pointer to output decrypted message | |||
* (of length KYBER_INDCPA_MSGBYTES) | |||
* - const uint8_t *c: pointer to input ciphertext | |||
* (of length KYBER_INDCPA_BYTES) | |||
* - const uint8_t *sk: pointer to input secret key | |||
* (of length KYBER_INDCPA_SECRETKEYBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], | |||
const uint8_t c[KYBER_INDCPA_BYTES], | |||
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { | |||
polyvec bp, skpv; | |||
polyvec b, skpv; | |||
poly v, mp; | |||
unpack_ciphertext(&bp, &v, c); | |||
unpack_ciphertext(&b, &v, c); | |||
unpack_sk(&skpv, sk); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&bp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&b); | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); | |||
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&mp); | |||
PQCLEAN_KYBER1024_CLEAN_poly_sub(&mp, &v, &mp); | |||
@@ -14,13 +14,14 @@ | |||
* for CCA-secure Kyber key encapsulation mechanism | |||
* | |||
* Arguments: - unsigned char *pk: pointer to output public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* - unsigned char *sk: pointer to output private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { | |||
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], | |||
unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(pk, sk); | |||
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -39,17 +40,17 @@ int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char | |||
* secret for given public key | |||
* | |||
* Arguments: - unsigned char *ct: pointer to output cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *pk: pointer to input public key | |||
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) | |||
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) | |||
* | |||
* Returns 0 (success) | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct, | |||
unsigned char *ss, | |||
const unsigned char *pk) { | |||
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char pk[KYBER_PUBLICKEYBYTES]) { | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
/* Will contain key, coins */ | |||
uint8_t kr[2 * KYBER_SYMBYTES]; | |||
@@ -79,19 +80,19 @@ int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct, | |||
* cipher text and private key | |||
* | |||
* Arguments: - unsigned char *ss: pointer to output shared secret | |||
* (an already allocated array of CRYPTO_BYTES bytes) | |||
* (an already allocated array of KYBER_SSBYTES bytes) | |||
* - const unsigned char *ct: pointer to input cipher text | |||
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) | |||
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) | |||
* - const unsigned char *sk: pointer to input private key | |||
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) | |||
* (an already allocated array of KYBER_SECRETKEYBYTES bytes) | |||
* | |||
* Returns 0. | |||
* | |||
* On failure, ss will contain a pseudo-random value. | |||
**************************************************/ | |||
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char *ss, | |||
const unsigned char *ct, | |||
const unsigned char *sk) { | |||
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], | |||
const unsigned char ct[KYBER_CIPHERTEXTBYTES], | |||
const unsigned char sk[KYBER_SECRETKEYBYTES]) { | |||
size_t i; | |||
int fail; | |||
uint8_t buf[2 * KYBER_SYMBYTES]; | |||
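/* Usage sketch, not part of the diff: a self-contained round trip through the
 * three KEM entry points above; on success the two shared secrets agree. */
static int kem_roundtrip_demo(void) {
    unsigned char pk[KYBER_PUBLICKEYBYTES], sk[KYBER_SECRETKEYBYTES];
    unsigned char ct[KYBER_CIPHERTEXTBYTES];
    unsigned char ss1[KYBER_SSBYTES], ss2[KYBER_SSBYTES];
    int d = 0;
    size_t i;
    PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(pk, sk);
    PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(ct, ss1, pk);
    PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(ss2, ct, sk);
    for (i = 0; i < KYBER_SSBYTES; i++) {
        d |= ss1[i] ^ ss2[i];
    }
    return d; /* 0 iff encapsulation and decapsulation produced the same key */
}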
@@ -3,11 +3,11 @@ | |||
#include "reduce.h" | |||
#include <stdint.h> | |||
/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas and PQCLEAN_KYBER1024_CLEAN_zetas_inv used in the number-theoretic transform: | |||
/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas and zetas_inv used in the number-theoretic transform: | |||
#define KYBER_ROOT_OF_UNITY 17 | |||
static const uint16_t tree[128] = { | |||
static const uint8_t tree[128] = { | |||
0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, | |||
4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, | |||
2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, | |||
@@ -19,51 +19,41 @@ static const uint16_t tree[128] = { | |||
}; | |||
void init_ntt() { | |||
unsigned int i, j, k; | |||
unsigned int i; | |||
int16_t tmp[128]; | |||
tmp[0] = MONT; | |||
for(i = 1; i < 128; ++i) | |||
tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); | |||
for(i=1;i<128;i++) | |||
tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); | |||
for(i = 0; i < 128; ++i) | |||
for(i=0;i<128;i++) { | |||
PQCLEAN_KYBER1024_CLEAN_zetas[i] = tmp[tree[i]]; | |||
k = 0; | |||
for(i = 64; i >= 1; i >>= 1) | |||
for(j = i; j < 2*i; ++j) | |||
PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; | |||
PQCLEAN_KYBER1024_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; | |||
if(PQCLEAN_KYBER1024_CLEAN_zetas[i] > KYBER_Q/2) | |||
PQCLEAN_KYBER1024_CLEAN_zetas[i] -= KYBER_Q; | |||
if(PQCLEAN_KYBER1024_CLEAN_zetas[i] < -KYBER_Q/2) | |||
PQCLEAN_KYBER1024_CLEAN_zetas[i] += KYBER_Q; | |||
} | |||
} | |||
*/ | |||
const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128] = { | |||
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, | |||
2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, | |||
732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, | |||
1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, | |||
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, | |||
430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, | |||
1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, | |||
418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, | |||
1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, | |||
478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 | |||
}; | |||
const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128] = { | |||
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, | |||
1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, | |||
1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, | |||
1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, | |||
3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, | |||
1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, | |||
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, | |||
2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, | |||
829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, | |||
3127, 3042, 1907, 1836, 1517, 359, 758, 1441 | |||
}; | |||
-1044, -758, -359, -1517, 1493, 1422, 287, 202, | |||
-171, 622, 1577, 182, 962, -1202, -1474, 1468, | |||
573, -1325, 264, 383, -829, 1458, -1602, -130, | |||
-681, 1017, 732, 608, -1542, 411, -205, -1571, | |||
1223, 652, -552, 1015, -1293, 1491, -282, -1544, | |||
516, -8, -320, -666, -1618, -1162, 126, 1469, | |||
-853, -90, -271, 830, 107, -1421, -247, -951, | |||
-398, 961, -1508, -725, 448, -1065, 677, -1275, | |||
-1103, 430, 555, 843, -1251, 871, 1550, 105, | |||
422, 587, 177, -235, -291, -460, 1574, 1653, | |||
-246, 778, 1159, -147, -777, 1483, -602, 1119, | |||
-1590, 644, -872, 349, 418, 329, -156, -75, | |||
817, 1097, 603, 610, 1322, -1285, -1465, 384, | |||
-1215, -136, 1218, -1335, -874, 220, -1187, -1659, | |||
-1185, -1530, -1278, 794, -1510, -854, -870, 478, | |||
-108, -308, 996, 991, 958, -1460, 1522, 1628 | |||
}; | |||
/************************************************* | |||
* Name: fqmul | |||
@@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_ntt | |||
* | |||
* Description: Inplace number-theoretic transform (NTT) in Rq | |||
* Description: Inplace number-theoretic transform (NTT) in Rq. | |||
* input is in standard order, output is in bitreversed order | |||
* | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements | |||
* of Zq | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { | |||
unsigned int len, start, j, k; | |||
@@ -96,7 +85,7 @@ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { | |||
for (len = 128; len >= 2; len >>= 1) { | |||
for (start = 0; start < 256; start = j + len) { | |||
zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k++]; | |||
for (j = start; j < start + len; ++j) { | |||
for (j = start; j < start + len; j++) { | |||
t = fqmul(zeta, r[j + len]); | |||
r[j + len] = r[j] - t; | |||
r[j] = r[j] + t; | |||
@@ -112,28 +101,28 @@ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { | |||
* multiplication by Montgomery factor 2^16. | |||
* Input is in bitreversed order, output is in standard order | |||
* | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements | |||
* of Zq | |||
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) { | |||
unsigned int start, len, j, k; | |||
int16_t t, zeta; | |||
const int16_t f = 1441; // mont^2/128 | |||
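/* Check (illustration): MONT = 2^16 mod KYBER_Q = 2285, and
 * 1441 * 128 mod 3329 = 1353 = 2285^2 mod 3329, so f = mont^2/128 as claimed;
 * the final fqmul by f removes the factor 128 picked up by the
 * Gentleman-Sande butterflies and leaves the result in Montgomery form. */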
k = 0; | |||
k = 127; | |||
for (len = 2; len <= 128; len <<= 1) { | |||
for (start = 0; start < 256; start = j + len) { | |||
zeta = PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++]; | |||
for (j = start; j < start + len; ++j) { | |||
zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k--]; | |||
for (j = start; j < start + len; j++) { | |||
t = r[j]; | |||
r[j] = PQCLEAN_KYBER1024_CLEAN_barrett_reduce(t + r[j + len]); | |||
r[j + len] = t - r[j + len]; | |||
r[j + len] = r[j + len] - t; | |||
r[j + len] = fqmul(zeta, r[j + len]); | |||
} | |||
} | |||
} | |||
for (j = 0; j < 256; ++j) { | |||
r[j] = fqmul(r[j], PQCLEAN_KYBER1024_CLEAN_zetas_inv[127]); | |||
for (j = 0; j < 256; j++) { | |||
r[j] = fqmul(r[j], f); | |||
} | |||
} | |||
@@ -143,19 +132,15 @@ void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) { | |||
* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) | |||
* used for multiplication of elements in Rq in NTT domain | |||
* | |||
* Arguments: - int16_t r[2]: pointer to the output polynomial | |||
* Arguments: - int16_t r[2]: pointer to the output polynomial | |||
* - const int16_t a[2]: pointer to the first factor | |||
* - const int16_t b[2]: pointer to the second factor | |||
* - int16_t zeta: integer defining the reduction polynomial | |||
* - int16_t zeta: integer defining the reduction polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], | |||
const int16_t a[2], | |||
const int16_t b[2], | |||
int16_t zeta) { | |||
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { | |||
r[0] = fqmul(a[1], b[1]); | |||
r[0] = fqmul(r[0], zeta); | |||
r[0] += fqmul(a[0], b[0]); | |||
r[1] = fqmul(a[0], b[1]); | |||
r[1] += fqmul(a[1], b[0]); | |||
} |
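/* Worked algebra behind basemul above (illustration): in Zq[X]/(X^2 - zeta),
 *   (a0 + a1*X) * (b0 + b1*X) = (a0*b0 + zeta*a1*b1) + (a0*b1 + a1*b0)*X,
 * which is exactly what the five fqmul calls compute; each fqmul is a
 * Montgomery multiplication, so the result carries an extra factor 2^-16. */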
@@ -5,15 +5,10 @@ | |||
extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128]; | |||
extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128]; | |||
void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]); | |||
void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]); | |||
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], | |||
const int16_t a[2], | |||
const int16_t b[2], | |||
int16_t zeta); | |||
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); | |||
#endif |
@@ -7,8 +7,6 @@ | |||
#define KYBER_N 256 | |||
#define KYBER_Q 3329 | |||
#define KYBER_ETA 2 | |||
#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ | |||
#define KYBER_SSBYTES 32 /* size in bytes of shared key */ | |||
@@ -16,20 +14,20 @@ | |||
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) | |||
#define KYBER_K 4 | |||
#define KYBER_ETA1 2 | |||
#define KYBER_POLYCOMPRESSEDBYTES 160 | |||
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) | |||
#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES | |||
#define KYBER_ETA2 2 | |||
#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) | |||
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ | |||
+ KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) | |||
#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) | |||
/* 32 bytes of additional space to save H(pk) */ | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ | |||
+ KYBER_INDCPA_PUBLICKEYBYTES \ | |||
+ 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES | |||
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) | |||
#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) | |||
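/* Worked sizes for Kyber1024 (illustration, assuming KYBER_POLYBYTES = 384):
 *   KYBER_POLYVECBYTES            = 4 * 384            = 1536
 *   KYBER_INDCPA_PUBLICKEYBYTES   = 1536 + 32          = 1568
 *   KYBER_INDCPA_BYTES            = 4*352 + 160        = 1568
 *   KYBER_SECRETKEYBYTES          = 1536 + 1568 + 2*32 = 3168
 * (352 = 11*256/8 and 160 = 5*256/8 come from 11-bit and 5-bit compression). */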
#endif |
@@ -13,17 +13,19 @@ | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES) | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { | |||
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { | |||
size_t i, j; | |||
int16_t u; | |||
uint8_t t[8]; | |||
PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
for (j = 0; j < 8; j++) { | |||
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
// map to positive standard representatives | |||
u = a->coeffs[8 * i + j]; | |||
u += (u >> 15) & KYBER_Q; | |||
t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; | |||
} | |||
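/* Illustration, not part of the diff: u >> 15 is all-ones exactly when the
 * coefficient is negative, so the masked add maps a centered representative
 * in (-q, q) to [0, q) without a branch; t[j] = round(32*u/q) mod 32 then
 * compresses the coefficient to 5 bits. */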
r[0] = (t[0] >> 0) | (t[1] << 5); | |||
@@ -41,7 +43,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], | |||
* Description: De-serialization and subsequent decompression of a polynomial; | |||
* approximate inverse of PQCLEAN_KYBER1024_CLEAN_poly_compress | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYCOMPRESSEDBYTES bytes) | |||
**************************************************/ | |||
@@ -74,20 +76,21 @@ void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLY | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYBYTES bytes) | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { | |||
size_t i; | |||
uint16_t t0, t1; | |||
PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 2; i++) { | |||
t0 = a->coeffs[2 * i]; | |||
// map to positive standard representatives | |||
t0 = a->coeffs[2 * i]; | |||
t0 += ((int16_t)t0 >> 15) & KYBER_Q; | |||
t1 = a->coeffs[2 * i + 1]; | |||
r[3 * i + 0] = (uint8_t)(t0 >> 0); | |||
r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); | |||
r[3 * i + 2] = (uint8_t)(t1 >> 4); | |||
t1 += ((int16_t)t1 >> 15) & KYBER_Q; | |||
r[3 * i + 0] = (t0 >> 0); | |||
r[3 * i + 1] = (t0 >> 8) | (t1 << 4); | |||
r[3 * i + 2] = (t1 >> 4); | |||
} | |||
} | |||
@@ -97,7 +100,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { | |||
* Description: De-serialization of a polynomial; | |||
* inverse of PQCLEAN_KYBER1024_CLEAN_poly_tobytes | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of KYBER_POLYBYTES bytes) | |||
**************************************************/ | |||
@@ -114,7 +117,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYB | |||
* | |||
* Description: Convert 32-byte message to polynomial | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *msg: pointer to input message | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { | |||
@@ -135,41 +138,60 @@ void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCP | |||
* Description: Convert polynomial to 32-byte message | |||
* | |||
* Arguments: - uint8_t *msg: pointer to output message | |||
* - poly *a: pointer to input polynomial | |||
* - const poly *a: pointer to input polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { | |||
size_t i, j; | |||
uint16_t t; | |||
PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); | |||
for (i = 0; i < KYBER_N / 8; i++) { | |||
msg[i] = 0; | |||
for (j = 0; j < 8; j++) { | |||
t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; | |||
t = a->coeffs[8 * i + j]; | |||
t += ((int16_t)t >> 15) & KYBER_Q; | |||
t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; | |||
msg[i] |= t << j; | |||
} | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA1 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; | |||
prf(buf, sizeof(buf), seed, nonce); | |||
PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(r, buf); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2 | |||
* | |||
* Description: Sample a polynomial deterministically from a seed and a nonce, | |||
* with output polynomial close to centered binomial distribution | |||
* with parameter KYBER_ETA | |||
* with parameter KYBER_ETA2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *seed: pointer to input seed | |||
* (of length KYBER_SYMBYTES bytes) | |||
* - uint8_t nonce: one-byte input nonce | |||
* - uint8_t nonce: one-byte input nonce | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA * KYBER_N / 4]; | |||
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { | |||
uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; | |||
prf(buf, sizeof(buf), seed, nonce); | |||
PQCLEAN_KYBER1024_CLEAN_cbd(r, buf); | |||
PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(r, buf); | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_ntt | |||
* | |||
@@ -202,7 +224,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r) { | |||
* | |||
* Description: Multiplication of two polynomials in NTT domain | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -210,8 +232,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, con | |||
size_t i; | |||
for (i = 0; i < KYBER_N / 4; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); | |||
PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], | |||
-PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); | |||
PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); | |||
} | |||
} | |||
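/* Editor's note (reference sketch, not the diff's code): in the NTT domain a
 * Kyber polynomial is a vector of 128 degree-one residues, so "basemul"
 * multiplies pairs (a0 + a1*X)(b0 + b1*X) modulo (X^2 - zeta); the loop above
 * passes +zeta for the residue starting at index 4*i and -zeta for the one at
 * 4*i + 2. One residue product, using a Montgomery multiplier fqmul that
 * returns a value congruent to a*b*2^-16 mod q (the zeta constants are stored
 * in Montgomery form, so the scaling stays consistent): */
#include <stdint.h>
static int16_t fqmul(int16_t a, int16_t b) {
    /* Montgomery multiplication; assumes two's-complement narrowing and an
     * arithmetic right shift, as the reference code does. */
    const int32_t q = 3329;
    const int16_t qinv = -3327;                    /* q^-1 mod 2^16 */
    int32_t x = (int32_t)a * b;
    int16_t m = (int16_t)((int16_t)x * qinv);      /* x*q^-1 mod 2^16 */
    return (int16_t)((x - (int32_t)m * q) >> 16);
}
static void basemul_sketch(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
    r[0] = fqmul(a[1], b[1]);                      /* a1*b1       */
    r[0] = fqmul(r[0], zeta);                      /* ... * zeta  */
    r[0] = (int16_t)(r[0] + fqmul(a[0], b[0]));    /* + a0*b0     */
    r[1] = fqmul(a[0], b[1]);                      /* a0*b1       */
    r[1] = (int16_t)(r[1] + fqmul(a[1], b[0]));    /* + a1*b0     */
}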
@@ -246,28 +267,12 @@ void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r) { | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of a polynomial. For details of conditional subtraction | |||
* of q see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r) { | |||
size_t i; | |||
for (i = 0; i < KYBER_N; i++) { | |||
r->coeffs[i] = PQCLEAN_KYBER1024_CLEAN_csubq(r->coeffs[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_add | |||
* | |||
* Description: Add two polynomials | |||
* Description: Add two polynomials; no modular reduction is performed | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
* - const poly *b: pointer to second input polynomial | |||
**************************************************/ | |||
@@ -281,7 +286,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_poly_sub | |||
* | |||
* Description: Subtract two polynomials | |||
* Description: Subtract two polynomials; no modular reduction is performed | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const poly *a: pointer to first input polynomial | |||
@@ -11,16 +11,18 @@ typedef struct { | |||
int16_t coeffs[KYBER_N]; | |||
} poly; | |||
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_ntt(poly *r); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r); | |||
@@ -28,7 +30,6 @@ void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, con | |||
void PQCLEAN_KYBER1024_CLEAN_poly_tomont(poly *r); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b); | |||
void PQCLEAN_KYBER1024_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); | |||
@@ -10,19 +10,18 @@ | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { | |||
unsigned int i, j, k; | |||
PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(a); | |||
uint16_t t[8]; | |||
for (i = 0; i < KYBER_K; i++) { | |||
for (j = 0; j < KYBER_N / 8; j++) { | |||
for (k = 0; k < 8; k++) { | |||
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) | |||
/ KYBER_Q) & 0x7ff; | |||
t[k] = a->vec[i].coeffs[8 * j + k]; | |||
t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; | |||
t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; | |||
} | |||
r[ 0] = (uint8_t)(t[0] >> 0); | |||
@@ -51,8 +50,7 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDB | |||
* - const uint8_t *a: pointer to input byte array | |||
* (of length KYBER_POLYVECCOMPRESSEDBYTES) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { | |||
unsigned int i, j, k; | |||
uint16_t t[8]; | |||
@@ -82,9 +80,9 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, | |||
* | |||
* Arguments: - uint8_t *r: pointer to output byte array | |||
* (needs space for KYBER_POLYVECBYTES) | |||
* - polyvec *a: pointer to input vector of polynomials | |||
* - const polyvec *a: pointer to input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { | |||
unsigned int i; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); | |||
@@ -138,18 +136,16 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r) { | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery | |||
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery | |||
* | |||
* Description: Pointwise multiply elements of a and b, accumulate into r, | |||
* Description: Multiply elements of a and b in NTT domain, accumulate into r, | |||
* and multiply by 2^-16. | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b) { | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { | |||
unsigned int i; | |||
poly t; | |||
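/* Editor's note (sketch; the hunk is truncated here): the renamed function
 * computes the inner product of two length-KYBER_K vectors of NTT-domain
 * polynomials. The usual shape of the remainder, in terms of the poly helpers
 * shown earlier in this diff, is roughly: */
    PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
    for (i = 1; i < KYBER_K; i++) {
        PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]);
        PQCLEAN_KYBER1024_CLEAN_poly_add(r, r, &t);
    }
    PQCLEAN_KYBER1024_CLEAN_poly_reduce(r);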
@@ -166,10 +162,10 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_reduce | |||
* | |||
* Description: Applies Barrett reduction to each coefficient | |||
* of each element of a vector of polynomials | |||
* of each element of a vector of polynomials; | |||
* for details of the Barrett reduction see comments in reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
* Arguments: - polyvec *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) { | |||
unsigned int i; | |||
@@ -178,29 +174,12 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) { | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_csubq | |||
* | |||
* Description: Applies conditional subtraction of q to each coefficient | |||
* of each element of a vector of polynomials | |||
* for details of conditional subtraction of q see comments in | |||
* reduce.c | |||
* | |||
* Arguments: - poly *r: pointer to input/output polynomial | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r) { | |||
unsigned int i; | |||
for (i = 0; i < KYBER_K; i++) { | |||
PQCLEAN_KYBER1024_CLEAN_poly_csubq(&r->vec[i]); | |||
} | |||
} | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_add | |||
* | |||
* Description: Add vectors of polynomials | |||
* | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* Arguments: - polyvec *r: pointer to output vector of polynomials | |||
* - const polyvec *a: pointer to first input vector of polynomials | |||
* - const polyvec *b: pointer to second input vector of polynomials | |||
**************************************************/ | |||
@@ -8,22 +8,18 @@ typedef struct { | |||
poly vec[KYBER_K]; | |||
} polyvec; | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, | |||
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(polyvec *r); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, | |||
const polyvec *a, | |||
const polyvec *b); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r); | |||
void PQCLEAN_KYBER1024_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); | |||
@@ -6,8 +6,7 @@ | |||
* Name: PQCLEAN_KYBER1024_CLEAN_montgomery_reduce | |||
* | |||
* Description: Montgomery reduction; given a 32-bit integer a, computes | |||
* 16-bit integer congruent to a * R^-1 mod q, | |||
* where R=2^16 | |||
* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 | |||
* | |||
* Arguments: - int32_t a: input integer to be reduced; | |||
* has to be in {-q2^15,...,q2^15-1} | |||
@@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a) { | |||
* Name: PQCLEAN_KYBER1024_CLEAN_barrett_reduce | |||
* | |||
* Description: Barrett reduction; given a 16-bit integer a, computes | |||
* 16-bit integer congruent to a mod q in {0,...,q} | |||
* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} | |||
* | |||
* Arguments: - int16_t a: input integer to be reduced | |||
* | |||
* Returns: integer in {0,...,q} congruent to a modulo q. | |||
* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. | |||
**************************************************/ | |||
int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a) { | |||
int16_t t; | |||
const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; | |||
t = (int32_t)v * a >> 26; | |||
t = ((int32_t)v * a + (1 << 25)) >> 26; | |||
t *= KYBER_Q; | |||
return a - t; | |||
} | |||
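/* Editor's note (self-check sketch, not part of the diff): the added rounding
 * constant 1<<25 makes t the nearest integer to v*a/2^26 instead of its floor,
 * which is what moves the output from {0,...,q} into the centered range
 * {-(q-1)/2,...,(q-1)/2}. The documented contract can be checked exhaustively
 * over all 16-bit inputs (assumes arithmetic right shifts, as above): */
#include <assert.h>
#include <stdint.h>
static void barrett_centered_selfcheck(void) {
    const int32_t q = 3329;
    const int16_t v = (int16_t)(((1U << 26) + q / 2) / q);
    for (int32_t a = -32768; a <= 32767; a++) {
        int32_t t = ((int32_t)v * a + (1 << 25)) >> 26;
        int32_t r = a - t * q;
        assert((a - r) % q == 0);                        /* congruent mod q */
        assert(-(q - 1) / 2 <= r && r <= (q - 1) / 2);   /* centered range  */
    }
}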
/************************************************* | |||
* Name: PQCLEAN_KYBER1024_CLEAN_csubq | |||
* | |||
* Description: Conditionally subtract q | |||
* | |||
* Arguments: - int16_t x: input integer | |||
* | |||
* Returns: a - q if a >= q, else a | |||
**************************************************/ | |||
int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a) { | |||
a -= KYBER_Q; | |||
a += (a >> 15) & KYBER_Q; | |||
return a; | |||
} |
@@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a); | |||
int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a); | |||
int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a); | |||
#endif |
@@ -9,12 +9,10 @@ | |||
* | |||
* Description: Absorb step of the SHAKE128 specialized for the Kyber context. | |||
* | |||
* Arguments: - xof_state *state: pointer to (uninitialized) output | |||
* Keccak state | |||
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input | |||
* to be absorbed into state | |||
* - uint8_t i additional byte of input | |||
* - uint8_t j additional byte of input | |||
* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state | |||
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state | |||
* - uint8_t i: additional byte of input | |||
* - uint8_t j: additional byte of input | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, | |||
const uint8_t seed[KYBER_SYMBYTES], | |||
@@ -26,8 +24,8 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extseed[i] = seed[i]; | |||
} | |||
extseed[i++] = x; | |||
extseed[i] = y; | |||
extseed[KYBER_SYMBYTES + 0] = x; | |||
extseed[KYBER_SYMBYTES + 1] = y; | |||
shake128_absorb(state, extseed, sizeof(extseed)); | |||
} | |||
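/* Editor's note (illustration, not the diff's code): the XOF input is simply
 * seed || x || y, where x and y are the indices used when expanding the public
 * matrix, so extseed holds KYBER_SYMBYTES + 2 bytes. Writing the two trailing
 * bytes at fixed offsets, as the new code does, states that layout directly
 * instead of relying on the loop counter's final value. A memcpy-based
 * equivalent, assuming KYBER_SYMBYTES = 32: */
#include <stdint.h>
#include <string.h>
static void build_extseed(uint8_t extseed[32 + 2], const uint8_t seed[32], uint8_t x, uint8_t y) {
    memcpy(extseed, seed, 32);   /* seed              */
    extseed[32] = x;             /* first index byte  */
    extseed[33] = y;             /* second index byte */
}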
@@ -38,23 +36,19 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, | |||
* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input | |||
* and then generates outlen bytes of SHAKE256 output | |||
* | |||
* Arguments: - uint8_t *out: pointer to output | |||
* - size_t outlen: number of requested output bytes | |||
* - const uint8_t *key: pointer to the key | |||
* (of length KYBER_SYMBYTES) | |||
* - uint8_t nonce: single-byte nonce (public PRF input) | |||
* Arguments: - uint8_t *out: pointer to output | |||
* - size_t outlen: number of requested output bytes | |||
* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) | |||
* - uint8_t nonce: single-byte nonce (public PRF input) | |||
**************************************************/ | |||
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[KYBER_SYMBYTES], | |||
uint8_t nonce) { | |||
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { | |||
unsigned int i; | |||
uint8_t extkey[KYBER_SYMBYTES + 1]; | |||
for (i = 0; i < KYBER_SYMBYTES; i++) { | |||
extkey[i] = key[i]; | |||
} | |||
extkey[i] = nonce; | |||
extkey[KYBER_SYMBYTES] = nonce; | |||
shake256(out, outlen, extkey, sizeof(extkey)); | |||
} |
@@ -14,21 +14,16 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *s, | |||
uint8_t x, | |||
uint8_t y); | |||
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, | |||
size_t outlen, | |||
const uint8_t key[KYBER_SYMBYTES], | |||
uint8_t nonce); | |||
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); | |||
#define XOF_BLOCKBYTES SHAKE128_RATE | |||
#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) | |||
#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) | |||
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ | |||
shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) | |||
#define xof_ctx_release(STATE) shake128_ctx_release(STATE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) \ | |||
PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) | |||
#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) | |||
@@ -3,10 +3,10 @@ type: kem | |||
claimed-nist-level: 1 | |||
claimed-security: IND-CCA2 | |||
length-public-key: 800 | |||
length-ciphertext: 736 | |||
length-ciphertext: 768 | |||
length-secret-key: 1632 | |||
length-shared-secret: 32 | |||
nistkat-sha256: d081dafce242de5d2a9b1cfe2b304cf5ebaed71b7a91f028fefd569693307d45 | |||
nistkat-sha256: 7bfe0653b63b3fac7ee300a6e4801046c1a3d8d445b271633b6c9d81ed125e5b | |||
principal-submitters: | |||
- Peter Schwabe | |||
auxiliary-submitters: | |||
@@ -21,9 +21,9 @@ auxiliary-submitters: | |||
- Damien Stehlé | |||
implementations: | |||
- name: clean | |||
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber | |||
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber | |||
- name: avx2 | |||
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber | |||
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
@@ -2,52 +2,48 @@ | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* | |||
Based heavily on public-domain code by Romain Dolbeau | |||
Different handling of nonce+counter than original version | |||
using separated 64-bit nonce and internal 64-bit counter, starting from zero | |||
Public Domain | |||
*/ | |||
/* Based heavily on public-domain code by Romain Dolbeau | |||
* Different handling of nonce+counter than original version using | |||
* separated 64-bit nonce and internal 64-bit counter, starting from zero | |||
* Public Domain */ | |||
static inline void aesni_encrypt4(uint8_t out[64], | |||
__m128i *n, | |||
const __m128i rkeys[16]) { | |||
__m128i f, f0, f1, f2, f3, t; | |||
static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { | |||
__m128i f, f0, f1, f2, f3; | |||
const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); | |||
/* Load current counter value */ | |||
f = _mm_load_si128(n); | |||
/* Increase counter in 4 consecutive blocks */ | |||
t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); | |||
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); | |||
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); | |||
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); | |||
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); | |||
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); | |||
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); | |||
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); | |||
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); | |||
/* Write counter for next iteration, increased by 4 */ | |||
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); | |||
/* Actual AES encryption, 4x interleaved */ | |||
t = _mm_load_si128(&rkeys[0]); | |||
f0 = _mm_xor_si128(f0, t); | |||
f1 = _mm_xor_si128(f1, t); | |||
f2 = _mm_xor_si128(f2, t); | |||
f3 = _mm_xor_si128(f3, t); | |||
f = _mm_load_si128(&rkeys[0]); | |||
f0 = _mm_xor_si128(f0, f); | |||
f1 = _mm_xor_si128(f1, f); | |||
f2 = _mm_xor_si128(f2, f); | |||
f3 = _mm_xor_si128(f3, f); | |||
for (int i = 1; i < 14; i++) { | |||
t = _mm_load_si128(&rkeys[i]); | |||
f0 = _mm_aesenc_si128(f0, t); | |||
f1 = _mm_aesenc_si128(f1, t); | |||
f2 = _mm_aesenc_si128(f2, t); | |||
f3 = _mm_aesenc_si128(f3, t); | |||
f = _mm_load_si128(&rkeys[i]); | |||
f0 = _mm_aesenc_si128(f0, f); | |||
f1 = _mm_aesenc_si128(f1, f); | |||
f2 = _mm_aesenc_si128(f2, f); | |||
f3 = _mm_aesenc_si128(f3, f); | |||
} | |||
t = _mm_load_si128(&rkeys[14]); | |||
f0 = _mm_aesenclast_si128(f0, t); | |||
f1 = _mm_aesenclast_si128(f1, t); | |||
f2 = _mm_aesenclast_si128(f2, t); | |||
f3 = _mm_aesenclast_si128(f3, t); | |||
f = _mm_load_si128(&rkeys[14]); | |||
f0 = _mm_aesenclast_si128(f0, f); | |||
f1 = _mm_aesenclast_si128(f1, f); | |||
f2 = _mm_aesenclast_si128(f2, f); | |||
f3 = _mm_aesenclast_si128(f3, f); | |||
/* Write results */ | |||
_mm_storeu_si128((__m128i *)(out + 0), f0); | |||
@@ -134,6 +130,7 @@ void PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(uint8_t *out, | |||
while (outlen >= 64) { | |||
aesni_encrypt4(out, &state.n, state.rkeys); | |||
outlen -= 64; | |||
out += 64; | |||
} | |||
if (outlen) { | |||
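/* Editor's note: the added "out += 64;" is the functional change in this hunk;
 * without it every 64-byte batch of AES-256-CTR keystream would overwrite the
 * previous one at the start of the output buffer. For the final partial batch
 * (outlen < 64), the usual pattern is to encrypt into a scratch buffer and copy
 * only the requested bytes, sketched here as a hypothetical helper: */
static void drain_tail(uint8_t *out, size_t outlen, __m128i *n, const __m128i rkeys[16]) {
    uint8_t buf[64];
    size_t i;
    aesni_encrypt4(buf, n, rkeys);      /* one more 4-block batch       */
    for (i = 0; i < outlen; i++) {      /* outlen is assumed to be < 64 */
        out[i] = buf[i];
    }
}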
@@ -2,22 +2,18 @@ | |||
#define PQCLEAN_KYBER51290S_AVX2_ALIGN_H | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
#define ALIGN16_TYPE(t) \ | |||
union { \ | |||
__m128i vec; \ | |||
t orig; \ | |||
#define ALIGNED_UINT8(N) \ | |||
union { \ | |||
uint8_t coeffs[(N)]; \ | |||
__m256i vec[((N)+31)/32]; \ | |||
} | |||
#define ALIGN32_ARRAY(t, s) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(s)]; \ | |||
#define ALIGNED_INT16(N) \ | |||
union { \ | |||
int16_t coeffs[(N)]; \ | |||
__m256i vec[((N)+15)/16]; \ | |||
} | |||
#define ALIGN32_ARRAY_2D(t, n, m) \ | |||
union { \ | |||
__m256i vec; \ | |||
t arr[(n)][(m)]; \ | |||
} | |||
#endif |
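/* Editor's note (usage sketch, not part of the diff): the new ALIGNED_* unions
 * give one block of storage two views, a scalar array for per-byte or
 * per-coefficient access and a __m256i array for 32-byte-aligned vector loads,
 * replacing the old single-vector ALIGN16_TYPE/ALIGN32_ARRAY wrappers. The
 * union's alignment is inherited from the __m256i member, so aligned loads and
 * stores are safe. A small demo, with a locally defined copy of the macro: */
#include <immintrin.h>
#include <stdint.h>
#define ALIGNED_INT16_DEMO(N) \
    union { int16_t coeffs[(N)]; __m256i vec[((N) + 15) / 16]; }
static int16_t aligned_views_demo(void) {
    ALIGNED_INT16_DEMO(256) buf = {{0}};
    buf.coeffs[0] = 7;                                   /* scalar view         */
    __m256i v = _mm256_load_si256(&buf.vec[0]);          /* aligned vector view */
    _mm256_store_si256(&buf.vec[0], _mm256_add_epi16(v, v));
    return buf.coeffs[0];                                /* 14                  */
}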
@@ -5,7 +5,7 @@ | |||
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_SECRETKEYBYTES 1632 | |||
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_PUBLICKEYBYTES 800 | |||
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_CIPHERTEXTBYTES 736 | |||
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_CIPHERTEXTBYTES 768 | |||
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_BYTES 32 | |||
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_ALGNAME "Kyber512-90s" | |||
@@ -1,216 +1,107 @@ | |||
#include "cdecl.h" | |||
#include "params.h" | |||
.macro schoolbook off,sign | |||
#load | |||
vmovdqa \off+32(%rsi),%ymm7 # b | |||
vmovdqa \off+32(%rdx),%ymm8 # d | |||
vmovdqa \off(%rsi),%ymm9 # a | |||
vmovdqa \off(%rdx),%ymm10 # c | |||
#mul | |||
vpmullw %ymm7,%ymm8,%ymm11 # bd.lo | |||
vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi | |||
vpmullw %ymm7,%ymm10,%ymm13 # bc.lo | |||
vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi | |||
vpmullw %ymm9,%ymm8,%ymm14 # ad.lo | |||
vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi | |||
vpmullw %ymm9,%ymm10,%ymm15 # ac.lo | |||
vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi | |||
#reduce | |||
vpmullw %ymm1,%ymm11,%ymm11 | |||
vpmulhw %ymm0,%ymm11,%ymm11 | |||
vpsubw %ymm11,%ymm12,%ymm11 # bd | |||
#mul | |||
vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo | |||
vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi | |||
#unpack | |||
vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 | |||
vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 | |||
vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 | |||
vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 | |||
vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 | |||
vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 | |||
vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 | |||
vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 | |||
#add | |||
.ifeq \sign | |||
vpaddd %ymm14,%ymm15,%ymm14 # x0 | |||
vpaddd %ymm9,%ymm10,%ymm9 # x1 | |||
.else | |||
vpsubd %ymm15,%ymm14,%ymm14 # x0 | |||
vpsubd %ymm10,%ymm9,%ymm9 # x1 | |||
.endif | |||
vpaddd %ymm12,%ymm13,%ymm12 # y0 | |||
vpaddd %ymm7,%ymm8,%ymm7 # y1 | |||
.endm | |||
.macro red a0,a1,b0,b1,x,y,z | |||
#pack | |||
vpxor %ymm\x,%ymm\x,%ymm\x | |||
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y | |||
vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z | |||
vpsrld $16,%ymm\a0,%ymm\a0 | |||
vpsrld $16,%ymm\a1,%ymm\a1 | |||
vpackusdw %ymm\z,%ymm\y,%ymm\z | |||
vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 | |||
vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y | |||
vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x | |||
vpsrld $16,%ymm\b0,%ymm\b0 | |||
vpsrld $16,%ymm\b1,%ymm\b1 | |||
vpackusdw %ymm\x,%ymm\y,%ymm\y | |||
vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 | |||
#reduce | |||
vpmullw %ymm1,%ymm\z,%ymm\z | |||
vpmullw %ymm1,%ymm\y,%ymm\y | |||
vpmulhw %ymm0,%ymm\z,%ymm\z | |||
vpmulhw %ymm0,%ymm\y,%ymm\y | |||
vpsubw %ymm\z,%ymm\a0,%ymm\a0 | |||
vpsubw %ymm\y,%ymm\b0,%ymm\b0 | |||
.macro schoolbook off | |||
vmovdqa _16XQINV*2(%rcx),%ymm0 | |||
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 | |||
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 | |||
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 | |||
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 | |||
vpmullw %ymm0,%ymm1,%ymm9 # a0.lo | |||
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo | |||
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo | |||
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo | |||
vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 | |||
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 | |||
vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi | |||
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi | |||
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi | |||
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi | |||
vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 | |||
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 | |||
vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi | |||
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi | |||
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi | |||
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi | |||
vmovdqa %ymm13,(%rsp) | |||
vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo | |||
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo | |||
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo | |||
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo | |||
vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo | |||
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo | |||
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo | |||
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo | |||
vmovdqa _16XQ*2(%rcx),%ymm8 | |||
vpmulhw %ymm8,%ymm13,%ymm13 | |||
vpmulhw %ymm8,%ymm9,%ymm9 | |||
vpmulhw %ymm8,%ymm5,%ymm5 | |||
vpmulhw %ymm8,%ymm10,%ymm10 | |||
vpmulhw %ymm8,%ymm6,%ymm6 | |||
vpmulhw %ymm8,%ymm11,%ymm11 | |||
vpmulhw %ymm8,%ymm7,%ymm7 | |||
vpmulhw %ymm8,%ymm12,%ymm12 | |||
vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 | |||
vpsubw %ymm9,%ymm1,%ymm9 # a0d0 | |||
vpsubw %ymm5,%ymm14,%ymm5 # b0c0 | |||
vpsubw %ymm10,%ymm2,%ymm10 # b0d0 | |||
vpsubw %ymm6,%ymm15,%ymm6 # a1c1 | |||
vpsubw %ymm11,%ymm3,%ymm11 # a1d1 | |||
vpsubw %ymm7,%ymm0,%ymm7 # b1c1 | |||
vpsubw %ymm12,%ymm4,%ymm12 # b1d1 | |||
vmovdqa (%r9),%ymm0 | |||
vmovdqa 32(%r9),%ymm1 | |||
vpmullw %ymm0,%ymm10,%ymm2 | |||
vpmullw %ymm0,%ymm12,%ymm3 | |||
vpmulhw %ymm1,%ymm10,%ymm10 | |||
vpmulhw %ymm1,%ymm12,%ymm12 | |||
vpmulhw %ymm8,%ymm2,%ymm2 | |||
vpmulhw %ymm8,%ymm3,%ymm3 | |||
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 | |||
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 | |||
vpaddw %ymm5,%ymm9,%ymm9 | |||
vpaddw %ymm7,%ymm11,%ymm11 | |||
vpsubw %ymm13,%ymm10,%ymm13 | |||
vpsubw %ymm12,%ymm6,%ymm6 | |||
vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) | |||
vmovdqa %ymm9,(64*\off+16)*2(%rdi) | |||
vmovdqa %ymm6,(64*\off+32)*2(%rdi) | |||
vmovdqa %ymm11,(64*\off+48)*2(%rdi) | |||
.endm | |||
.text | |||
basemul64_acc_avx: | |||
poly0.0: | |||
schoolbook 0,0 | |||
#mov | |||
vmovdqa %ymm14,%ymm3 | |||
vmovdqa %ymm9,%ymm4 | |||
vmovdqa %ymm12,%ymm5 | |||
vmovdqa %ymm7,%ymm6 | |||
poly1.0: | |||
schoolbook 512,0 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,(%rdi) | |||
vmovdqa %ymm5,32(%rdi) | |||
poly0.1: | |||
schoolbook 64,1 | |||
#mov | |||
vmovdqa %ymm14,%ymm3 | |||
vmovdqa %ymm9,%ymm4 | |||
vmovdqa %ymm12,%ymm5 | |||
vmovdqa %ymm7,%ymm6 | |||
poly1.1: | |||
schoolbook 576,1 | |||
#add | |||
vpaddd %ymm14,%ymm3,%ymm3 | |||
vpaddd %ymm9,%ymm4,%ymm4 | |||
vpaddd %ymm12,%ymm5,%ymm5 | |||
vpaddd %ymm7,%ymm6,%ymm6 | |||
#reduce | |||
red 3,4,5,6,7,8,9 | |||
#store | |||
vmovdqa %ymm3,64(%rdi) | |||
vmovdqa %ymm5,96(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx) | |||
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx) | |||
cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx): | |||
_cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_acc_avx | |||
ret | |||
basemul64_avx: | |||
schoolbook 0,0 | |||
#reduce | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,(%rdi) | |||
vmovdqa %ymm12,32(%rdi) | |||
schoolbook 64,1 | |||
#reduce | |||
red 14,9,12,7,8,10,11 | |||
#store | |||
vmovdqa %ymm14,64(%rdi) | |||
vmovdqa %ymm12,96(%rdi) | |||
ret | |||
.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx) | |||
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx) | |||
cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx): | |||
_cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx): | |||
#consts | |||
vmovdqa _16XQ*2(%rcx),%ymm0 | |||
vmovdqa _16XQINV*2(%rcx),%ymm1 | |||
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 | |||
add $128,%rdi | |||
add $128,%rsi | |||
add $128,%rdx | |||
call basemul64_avx | |||
mov %rsp,%r8 | |||
and $-32,%rsp | |||
sub $32,%rsp | |||
lea (_ZETAS_EXP+176)*2(%rcx),%r9 | |||
schoolbook 0 | |||
add $32*2,%r9 | |||
schoolbook 1 | |||
add $192*2,%r9 | |||
schoolbook 2 | |||
add $32*2,%r9 | |||
schoolbook 3 | |||
mov %r8,%rsp | |||
ret |
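/* Editor's note (scalar model, not part of the diff): unlike the removed
 * macro, which widened intermediate products to 32-bit lanes (vpunpcklwd,
 * vpaddd) before packing and reducing, the rewritten schoolbook macro stays in
 * 16-bit lanes by combining the two halves of each product directly: with
 * m = a*b*q^-1 mod 2^16 (vpmullw against the 16XQINV constant), the value
 * mulhi(a,b) - mulhi(m,q) is congruent to a*b*2^-16 mod q and stays below q in
 * absolute value whenever |a*b| < q*2^15. One lane modelled in C (assumes
 * two's-complement narrowing and arithmetic shifts): */
#include <stdint.h>
static int16_t mulhi16(int16_t a, int16_t b) {           /* vpmulhw, one lane */
    return (int16_t)(((int32_t)a * b) >> 16);
}
static int16_t montmul16_sketch(int16_t a, int16_t b) {
    const int16_t q = 3329;
    const int16_t qinv = -3327;                          /* q^-1 mod 2^16 */
    int16_t m = (int16_t)(a * (int16_t)(b * qinv));      /* a*b*q^-1 mod 2^16 (vpmullw) */
    return (int16_t)(mulhi16(a, b) - mulhi16(m, q));
}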
@@ -4,66 +4,125 @@ | |||
#include <stdint.h> | |||
/************************************************* | |||
* Name: PQCLEAN_KYBER51290S_AVX2_cbd | |||
* Name: cbd2 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter KYBER_ETA | |||
* a centered binomial distribution with parameter eta=2 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const unsigned char *buf: pointer to input byte array | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const __m256i *buf: pointer to aligned input byte array | |||
**************************************************/ | |||
void PQCLEAN_KYBER51290S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { | |||
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { | |||
unsigned int i; | |||
__m256i vec0, vec1, vec2, vec3, tmp; | |||
__m256i f0, f1, f2, f3; | |||
const __m256i mask55 = _mm256_set1_epi32(0x55555555); | |||
const __m256i mask33 = _mm256_set1_epi32(0x33333333); | |||
const __m256i mask03 = _mm256_set1_epi32(0x03030303); | |||
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); | |||
for (i = 0; i < KYBER_N / 64; i++) { | |||
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); | |||
vec1 = _mm256_srli_epi32(vec0, 1); | |||
vec0 = _mm256_and_si256(mask55, vec0); | |||
vec1 = _mm256_and_si256(mask55, vec1); | |||
vec0 = _mm256_add_epi32(vec0, vec1); | |||
vec1 = _mm256_srli_epi32(vec0, 2); | |||
vec0 = _mm256_and_si256(mask33, vec0); | |||
vec1 = _mm256_and_si256(mask33, vec1); | |||
vec2 = _mm256_srli_epi32(vec0, 4); | |||
vec3 = _mm256_srli_epi32(vec1, 4); | |||
vec0 = _mm256_and_si256(mask03, vec0); | |||
vec1 = _mm256_and_si256(mask03, vec1); | |||
vec2 = _mm256_and_si256(mask03, vec2); | |||
vec3 = _mm256_and_si256(mask03, vec3); | |||
vec1 = _mm256_sub_epi8(vec0, vec1); | |||
vec3 = _mm256_sub_epi8(vec2, vec3); | |||
vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); | |||
vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); | |||
vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); | |||
vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); | |||
tmp = _mm256_unpacklo_epi16(vec0, vec2); | |||
vec2 = _mm256_unpackhi_epi16(vec0, vec2); | |||
vec0 = tmp; | |||
tmp = _mm256_unpacklo_epi16(vec1, vec3); | |||
vec3 = _mm256_unpackhi_epi16(vec1, vec3); | |||
vec1 = tmp; | |||
tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); | |||
vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); | |||
vec0 = tmp; | |||
tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); | |||
vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); | |||
vec1 = tmp; | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); | |||
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); | |||
f0 = _mm256_load_si256(&buf[i]); | |||
f1 = _mm256_srli_epi16(f0, 1); | |||
f0 = _mm256_and_si256(mask55, f0); | |||
f1 = _mm256_and_si256(mask55, f1); | |||
f0 = _mm256_add_epi8(f0, f1); | |||
f1 = _mm256_srli_epi16(f0, 2); | |||
f0 = _mm256_and_si256(mask33, f0); | |||
f1 = _mm256_and_si256(mask33, f1); | |||
f0 = _mm256_add_epi8(f0, mask33); | |||
f0 = _mm256_sub_epi8(f0, f1); | |||
f1 = _mm256_srli_epi16(f0, 4); | |||
f0 = _mm256_and_si256(mask0F, f0); | |||
f1 = _mm256_and_si256(mask0F, f1); | |||
f0 = _mm256_sub_epi8(f0, mask03); | |||
f1 = _mm256_sub_epi8(f1, mask03); | |||
f2 = _mm256_unpacklo_epi8(f0, f1); | |||
f3 = _mm256_unpackhi_epi8(f0, f1); | |||
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); | |||
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); | |||
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); | |||
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); | |||
_mm256_store_si256(&r->vec[4 * i + 0], f0); | |||
_mm256_store_si256(&r->vec[4 * i + 1], f2); | |||
_mm256_store_si256(&r->vec[4 * i + 2], f1); | |||
_mm256_store_si256(&r->vec[4 * i + 3], f3); | |||
} | |||
} | |||
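/* Editor's note (reference-style scalar equivalent, not part of the diff): the
 * vector code above computes, sixteen bytes at a time, the same centered
 * binomial sampling as the scalar clean implementation: add adjacent bits in
 * pairs (mask 0x55...), so each 4-bit group of the intermediate word holds two
 * 2-bit sums a and b, and output a - b in {-2,...,2}. One coefficient consumes
 * 4 random bits, hence 2*KYBER_N/4 input bytes. Assuming KYBER_N = 256: */
#include <stdint.h>
static void cbd2_scalar(int16_t coeffs[256], const uint8_t buf[128]) {
    for (unsigned int i = 0; i < 256 / 8; i++) {
        uint32_t t = (uint32_t)buf[4 * i] | ((uint32_t)buf[4 * i + 1] << 8) |
                     ((uint32_t)buf[4 * i + 2] << 16) | ((uint32_t)buf[4 * i + 3] << 24);
        uint32_t d = (t & 0x55555555) + ((t >> 1) & 0x55555555); /* pairwise bit sums */
        for (unsigned int j = 0; j < 8; j++) {
            int16_t a = (int16_t)((d >> (4 * j)) & 0x3);
            int16_t b = (int16_t)((d >> (4 * j + 2)) & 0x3);
            coeffs[8 * i + j] = (int16_t)(a - b);
        }
    }
}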
/************************************************* | |||
* Name: cbd3 | |||
* | |||
* Description: Given an array of uniformly random bytes, compute | |||
* polynomial with coefficients distributed according to | |||
* a centered binomial distribution with parameter eta=3 | |||
* This function is only needed for Kyber-512 | |||
* | |||
* Arguments: - poly *r: pointer to output polynomial | |||
* - const uint8_t *buf: pointer to input byte array | |||
**************************************************/ | |||
static void cbd3(poly *restrict r, const uint8_t buf[3 * KYBER_N / 4 + 8]) { | |||
unsigned int i; | |||
__m256i f0, f1, f2, f3; | |||
const __m256i mask249 = _mm256_set1_epi32(0x249249); | |||
const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB); | |||
const __m256i mask07 = _mm256_set1_epi32(7); | |||
const __m256i mask70 = _mm256_set1_epi32(7 << 16); | |||
const __m256i mask3 = _mm256_set1_epi16(3); | |||
const __m256i shufbidx = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4, | |||
-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); | |||
for (i = 0; i < KYBER_N / 32; i++) { | |||
f0 = _mm256_loadu_si256((__m256i *)&buf[24 * i]); | |||
f0 = _mm256_permute4x64_epi64(f0, 0x94); | |||
f0 = _mm256_shuffle_epi8(f0, shufbidx); | |||
f1 = _mm256_srli_epi32(f0, 1); | |||
f2 = _mm256_srli_epi32(f0, 2); | |||
f0 = _mm256_and_si256(mask249, f0); | |||
f1 = _mm256_and_si256(mask249, f1); | |||
f2 = _mm256_and_si256(mask249, f2); | |||
f0 = _mm256_add_epi32(f0, f1); | |||
f0 = _mm256_add_epi32(f0, f2); | |||
f1 = _mm256_srli_epi32(f0, 3); | |||
f0 = _mm256_add_epi32(f0, mask6DB); | |||
f0 = _mm256_sub_epi32(f0, f1); | |||
f1 = _mm256_slli_epi32(f0, 10); | |||
f2 = _mm256_srli_epi32(f0, 12); | |||
f3 = _mm256_srli_epi32(f0, 2); | |||
f0 = _mm256_and_si256(f0, mask07); | |||
f1 = _mm256_and_si256(f1, mask70); | |||
f2 = _mm256_and_si256(f2, mask07); | |||
f3 = _mm256_and_si256(f3, mask70); | |||
f0 = _mm256_add_epi16(f0, f1); | |||
f1 = _mm256_add_epi16(f2, f3); | |||
f0 = _mm256_sub_epi16(f0, mask3); | |||
f1 = _mm256_sub_epi16(f1, mask3); | |||
f2 = _mm256_unpacklo_epi32(f0, f1); | |||
f3 = _mm256_unpackhi_epi32(f0, f1); | |||
f0 = _mm256_permute2x128_si256(f2, f3, 0x20); | |||
f1 = _mm256_permute2x128_si256(f2, f3, 0x31); | |||
_mm256_store_si256(&r->vec[2 * i + 0], f0); | |||
_mm256_store_si256(&r->vec[2 * i + 1], f1); | |||
} | |||
} | |||
/* buf 32 bytes longer for cbd3 */ | |||
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { | |||
cbd3(r, (uint8_t *)buf); | |||
} | |||
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { | |||
cbd2(r, buf); | |||
} |
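/* Editor's note (sketch, not part of the diff): for Kyber512-90s KYBER_ETA1 = 3
 * and KYBER_ETA2 = 2, so the eta1 buffer above is KYBER_ETA1*KYBER_N/128 + 1 =
 * 7 vectors (224 bytes). cbd3 consumes only 3*KYBER_N/4 = 192 bytes of
 * randomness, but its last unaligned 32-byte load starts at offset 24*7 = 168
 * and reads up to byte 199, hence the "+ 8" in its prototype and the
 * whole-vector over-allocation noted in the comment above. Assuming
 * KYBER_N = 256: */
_Static_assert(3 * 256 / 4 + 8 <= (3 * 256 / 128 + 1) * 32,
               "eta1 buffer covers cbd3's final 32-byte load");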
@@ -2,8 +2,11 @@ | |||
#define PQCLEAN_KYBER51290S_AVX2_CBD_H | |||
#include "params.h" | |||
#include "poly.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
void PQCLEAN_KYBER51290S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); | |||
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); | |||
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); | |||
#endif |