diff --git a/crypto_kem/kyber1024-90s/META.yml b/crypto_kem/kyber1024-90s/META.yml index cf157c78..c8dd1982 100644 --- a/crypto_kem/kyber1024-90s/META.yml +++ b/crypto_kem/kyber1024-90s/META.yml @@ -6,7 +6,7 @@ length-public-key: 1568 length-ciphertext: 1568 length-secret-key: 3168 length-shared-secret: 32 -nistkat-sha256: d3064040a33c15b65eb55dfd1bb116d092dab2cf5d693f8ab02b91ed105d66e3 +nistkat-sha256: a1b564348a126a118fbc49a6aeaebcb74896753fd99f30eeb0f75f0b2d25115f principal-submitters: - Peter Schwabe auxiliary-submitters: @@ -21,9 +21,9 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/kyber1024-90s/avx2/aes256ctr.c b/crypto_kem/kyber1024-90s/avx2/aes256ctr.c index e2ae81cc..158a9a54 100644 --- a/crypto_kem/kyber1024-90s/avx2/aes256ctr.c +++ b/crypto_kem/kyber1024-90s/avx2/aes256ctr.c @@ -2,52 +2,48 @@ #include #include #include -/* - Based heavily on public-domain code by Romain Dolbeau - Different handling of nonce+counter than original version - using separated 64-bit nonce and internal 64-bit counter, starting from zero - Public Domain -*/ +/* Based heavily on public-domain code by Romain Dolbeau + * Different handling of nonce+counter than original version using + * separated 64-bit nonce and 
internal 64-bit counter, starting from zero + * Public Domain */ -static inline void aesni_encrypt4(uint8_t out[64], - __m128i *n, - const __m128i rkeys[16]) { - __m128i f, f0, f1, f2, f3, t; +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { + __m128i f, f0, f1, f2, f3; + const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); /* Load current counter value */ f = _mm_load_si128(n); /* Increase counter in 4 consecutive blocks */ - t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); - f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); - f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); - f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); + f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); /* Write counter for next iteration, increased by 4 */ _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); /* Actual AES encryption, 4x interleaved */ - t = _mm_load_si128(&rkeys[0]); - f0 = _mm_xor_si128(f0, t); - f1 = _mm_xor_si128(f1, t); - f2 = _mm_xor_si128(f2, t); - f3 = _mm_xor_si128(f3, t); + f = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, f); + f1 = _mm_xor_si128(f1, f); + f2 = _mm_xor_si128(f2, f); + f3 = _mm_xor_si128(f3, f); for (int i = 1; i < 14; i++) { - t = _mm_load_si128(&rkeys[i]); - f0 = _mm_aesenc_si128(f0, t); - f1 = _mm_aesenc_si128(f1, t); - f2 = _mm_aesenc_si128(f2, t); - f3 = _mm_aesenc_si128(f3, t); + f = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, f); + f1 = _mm_aesenc_si128(f1, f); + f2 = _mm_aesenc_si128(f2, f); + f3 = _mm_aesenc_si128(f3, f); } - t = _mm_load_si128(&rkeys[14]); - f0 = 
_mm_aesenclast_si128(f0, t); - f1 = _mm_aesenclast_si128(f1, t); - f2 = _mm_aesenclast_si128(f2, t); - f3 = _mm_aesenclast_si128(f3, t); + f = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, f); + f1 = _mm_aesenclast_si128(f1, f); + f2 = _mm_aesenclast_si128(f2, f); + f3 = _mm_aesenclast_si128(f3, f); /* Write results */ _mm_storeu_si128((__m128i *)(out + 0), f0); @@ -134,6 +130,7 @@ void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, while (outlen >= 64) { aesni_encrypt4(out, &state.n, state.rkeys); outlen -= 64; + out += 64; } if (outlen) { diff --git a/crypto_kem/kyber1024-90s/avx2/align.h b/crypto_kem/kyber1024-90s/avx2/align.h index 8d3aa971..12300d15 100644 --- a/crypto_kem/kyber1024-90s/avx2/align.h +++ b/crypto_kem/kyber1024-90s/avx2/align.h @@ -2,22 +2,18 @@ #define PQCLEAN_KYBER102490S_AVX2_ALIGN_H #include +#include -#define ALIGN16_TYPE(t) \ - union { \ - __m128i vec; \ - t orig; \ +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[(N)]; \ + __m256i vec[((N)+31)/32]; \ } -#define ALIGN32_ARRAY(t, s) \ - union { \ - __m256i vec; \ - t arr[(s)]; \ +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[(N)]; \ + __m256i vec[((N)+15)/16]; \ } -#define ALIGN32_ARRAY_2D(t, n, m) \ - union { \ - __m256i vec; \ - t arr[(n)][(m)]; \ - } #endif diff --git a/crypto_kem/kyber1024-90s/avx2/basemul.S b/crypto_kem/kyber1024-90s/avx2/basemul.S index a7b98edd..ead7d7b2 100644 --- a/crypto_kem/kyber1024-90s/avx2/basemul.S +++ b/crypto_kem/kyber1024-90s/avx2/basemul.S @@ -1,248 +1,107 @@ #include "cdecl.h" -#include "params.h" -.macro schoolbook off,sign -#load -vmovdqa \off+32(%rsi),%ymm7 # b -vmovdqa \off+32(%rdx),%ymm8 # d -vmovdqa \off(%rsi),%ymm9 # a -vmovdqa \off(%rdx),%ymm10 # c +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 -#mul -vpmullw %ymm7,%ymm8,%ymm11 # bd.lo 
-vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi -vpmullw %ymm7,%ymm10,%ymm13 # bc.lo -vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi -vpmullw %ymm9,%ymm8,%ymm14 # ad.lo -vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi -vpmullw %ymm9,%ymm10,%ymm15 # ac.lo -vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo -#reduce -vpmullw %ymm1,%ymm11,%ymm11 -vpmulhw %ymm0,%ymm11,%ymm11 -vpsubw %ymm11,%ymm12,%ymm11 # bd +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 -#mul -vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo -vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi -#unpack -vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 -vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 -vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 -vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 -vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 -vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 -vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 -vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 -#add -.ifeq \sign -vpaddd %ymm14,%ymm15,%ymm14 # x0 -vpaddd %ymm9,%ymm10,%ymm9 # x1 -.else -vpsubd %ymm15,%ymm14,%ymm14 # x0 -vpsubd %ymm10,%ymm9,%ymm9 # x1 -.endif -vpaddd %ymm12,%ymm13,%ymm12 # y0 -vpaddd %ymm7,%ymm8,%ymm7 # y1 -.endm +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi -.macro red a0,a1,b0,b1,x,y,z -#pack -vpxor %ymm\x,%ymm\x,%ymm\x -vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z -vpsrld $16,%ymm\a0,%ymm\a0 -vpsrld $16,%ymm\a1,%ymm\a1 -vpackusdw %ymm\z,%ymm\y,%ymm\z -vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 -vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x -vpsrld $16,%ymm\b0,%ymm\b0 -vpsrld $16,%ymm\b1,%ymm\b1 
-vpackusdw %ymm\x,%ymm\y,%ymm\y -vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 +vmovdqa %ymm13,(%rsp) -#reduce -vpmullw %ymm1,%ymm\z,%ymm\z -vpmullw %ymm1,%ymm\y,%ymm\y -vpmulhw %ymm0,%ymm\z,%ymm\z -vpmulhw %ymm0,%ymm\y,%ymm\y -vpsubw %ymm\z,%ymm\a0,%ymm\a0 -vpsubw %ymm\y,%ymm\b0,%ymm\b0 +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 +vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -basemul64_acc_avx: -poly0.0: -schoolbook 0,0 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.0: -schoolbook 512,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.0: -schoolbook 1024,0 - 
-#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly3.0: -schoolbook 1536,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm5,32(%rdi) - -poly0.1: -schoolbook 64,1 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.1: -schoolbook 576,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.1: -schoolbook 1088,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly3.1: -schoolbook 1600,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm5,96(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx) -.global _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx) -cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx): -_cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 - -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -ret - -basemul64_avx: -schoolbook 0,0 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,(%rdi) -vmovdqa %ymm12,32(%rdi) - -schoolbook 64,1 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,64(%rdi) -vmovdqa %ymm12,96(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx) 
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx) cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx): _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_avx +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 1 -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $192*2,%r9 +schoolbook 2 -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 3 +mov %r8,%rsp ret diff --git a/crypto_kem/kyber1024-90s/avx2/cbd.c b/crypto_kem/kyber1024-90s/avx2/cbd.c index 26a75d52..39d2ffde 100644 --- a/crypto_kem/kyber1024-90s/avx2/cbd.c +++ b/crypto_kem/kyber1024-90s/avx2/cbd.c @@ -4,66 +4,64 @@ #include /************************************************* -* Name: PQCLEAN_KYBER102490S_AVX2_cbd +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *buf: pointer to input byte array +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { +static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { unsigned int i; - __m256i vec0, vec1, vec2, vec3, tmp; + __m256i f0, f1, f2, f3; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); 
const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); for (i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); + f0 = _mm256_load_si256(&buf[i]); - vec1 = _mm256_srli_epi32(vec0, 1); - vec0 = _mm256_and_si256(mask55, vec0); - vec1 = _mm256_and_si256(mask55, vec1); - vec0 = _mm256_add_epi32(vec0, vec1); + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); - vec1 = _mm256_srli_epi32(vec0, 2); - vec0 = _mm256_and_si256(mask33, vec0); - vec1 = _mm256_and_si256(mask33, vec1); + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); - vec2 = _mm256_srli_epi32(vec0, 4); - vec3 = _mm256_srli_epi32(vec1, 4); - vec0 = _mm256_and_si256(mask03, vec0); - vec1 = _mm256_and_si256(mask03, vec1); - vec2 = _mm256_and_si256(mask03, vec2); - vec3 = _mm256_and_si256(mask03, vec3); + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); - vec1 = _mm256_sub_epi8(vec0, vec1); - vec3 = _mm256_sub_epi8(vec2, vec3); + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); - vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); - vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); - vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); - vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); - tmp = _mm256_unpacklo_epi16(vec0, vec2); - vec2 = _mm256_unpackhi_epi16(vec0, vec2); - vec0 = tmp; - tmp = 
_mm256_unpacklo_epi16(vec1, vec3); - vec3 = _mm256_unpackhi_epi16(vec1, vec3); - vec1 = tmp; - - tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); - vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); - vec0 = tmp; - tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); - vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); - vec1 = tmp; - - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); + _mm256_store_si256(&r->vec[4 * i + 0], f0); + _mm256_store_si256(&r->vec[4 * i + 1], f2); + _mm256_store_si256(&r->vec[4 * i + 2], f1); + _mm256_store_si256(&r->vec[4 * i + 3], f3); } } + + +/* buf 32 bytes longer for cbd3 */ +void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber1024-90s/avx2/cbd.h b/crypto_kem/kyber1024-90s/avx2/cbd.h index 5f4e435a..2c9d77a5 100644 --- a/crypto_kem/kyber1024-90s/avx2/cbd.h +++ b/crypto_kem/kyber1024-90s/avx2/cbd.h @@ -2,8 +2,11 @@ #define PQCLEAN_KYBER102490S_AVX2_CBD_H #include "params.h" #include "poly.h" +#include #include -void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); + +void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/cdecl.h b/crypto_kem/kyber1024-90s/avx2/cdecl.h index 34bdf281..4c262f61 100644 --- a/crypto_kem/kyber1024-90s/avx2/cdecl.h +++ b/crypto_kem/kyber1024-90s/avx2/cdecl.h @@ -1,6 +1,8 @@ #ifndef PQCLEAN_KYBER102490S_AVX2_CDECL_H #define 
PQCLEAN_KYBER102490S_AVX2_CDECL_H + + #define _16XQ 0 #define _16XQINV 16 #define _16XV 32 @@ -9,9 +11,10 @@ #define _16XMONTSQLO 80 #define _16XMONTSQHI 96 #define _16XMASK 112 -#define _ZETAS_EXP 128 -#define _ZETAS_INV_EXP 528 - +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 /* The C ABI on MacOS exports all symbols with a leading * underscore. This means that any symbols we refer to from @@ -23,4 +26,5 @@ #define _cdecl(s) _##s #define cdecl(s) s + #endif diff --git a/crypto_kem/kyber1024-90s/avx2/consts.c b/crypto_kem/kyber1024-90s/avx2/consts.c index 0afd7b5f..db1ae9a6 100644 --- a/crypto_kem/kyber1024-90s/avx2/consts.c +++ b/crypto_kem/kyber1024-90s/avx2/consts.c @@ -1,155 +1,123 @@ +#include "align.h" #include "consts.h" #include "params.h" -#include + #define Q KYBER_Q -#define MONT ((1U << 16) % Q) -#define QINV 62209 // q^-1 mod 2^16 -#define V (((1U << 26) + Q/2)/Q) -#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) -#define FLO (FHI*QINV % 65536) -#define MONTSQHI (MONT*MONT % Q) -#define MONTSQLO (MONTSQHI*QINV % 65536) +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO (-10079) // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI #define MASK 4095 +#define SHIFT 32 - -const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.as_arr = { -#define _16XQ 0 +const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.coeffs = { +//#define _16XQ 0 Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, -#define _16XQINV 16 +//#define _16XQINV 16 QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, -#define _16XV 32 +//#define _16XV 32 V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, -#define _16XFLO 48 +//#define _16XFLO 48 FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, -#define _16XFHI 64 +//#define _16XFHI 64 FHI, FHI, 
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, -#define _16XMONTSQLO 80 +//#define _16XMONTSQLO 80 MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, -#define _16XMONTSQHI 96 +//#define _16XMONTSQHI 96 MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, -#define _16XMASK 112 +//#define _16XMASK 112 MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, -#define _ZETAS_EXP 128 - 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, - 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, - 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, - 3158, 3158, 3158, 3158, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, - 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, - 573, 573, 2004, 2004, 264, 264, 383, 383, - 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, - 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, - 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, - 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, - 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, - 2226, 555, 2078, 1550, 422, 177, 3038, 1574, - 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, - 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, - 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, - 430, 843, 871, 105, 587, 3094, 2869, 1653, - 778, 3182, 1483, 1119, 644, 349, 329, 3254, - 788, 788, 1812, 1812, 
28191, 28191, 28191, 28191, - 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, - 48842, 48842, 48842, 48842, 287, 287, 287, 287, - 287, 287, 287, 287, 202, 202, 202, 202, - 202, 202, 202, 202, 10690, 10690, 10690, 10690, - 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, - 31164, 31164, 31164, 31164, 962, 962, 962, 962, - 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, - 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, - 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, - 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, - 732, 732, 608, 608, 1787, 1787, 411, 411, - 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, - 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, - 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, - 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, - 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, - 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, - 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, - 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, - 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, - 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, - 3193, 1994, 220, 1670, 1799, 794, 2475, 478, - 3021, 991, 1869, 1628, 0, 0, 0, 0, +//#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, -#define _ZETAS_INV_EXP 528 - 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, - 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, - 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, - 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, - 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, - 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, - 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, - 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, - 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, - 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, - 951, 
247, 1421, 3222, 2499, 271, 90, 853, - 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, - 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, - 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, - 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, - 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, - 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, - 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, - 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, - 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, - 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, - 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, - 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, - 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, - 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, - 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, - 2210, 1846, 147, 2551, 1676, 460, 235, 2742, - 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, - 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, - 45043, 32227, 11478, 335, 156, 2911, 872, 1590, - 602, 777, 2170, 246, 1755, 291, 3152, 2907, - 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, - 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, - 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, - 666, 320, 8, 2813, 1544, 282, 1838, 1293, - 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, - 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, - 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, - 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, - 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, - 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, - 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, - 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, - 171, 171, 171, 171, 12403, 12403, 12403, 12403, - 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, - 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, - 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, - 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, - 60300, 
60300, 1932, 1932, 0, 0, 0, 0 +//#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +//#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 
28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +//#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT } }; diff --git a/crypto_kem/kyber1024-90s/avx2/consts.h b/crypto_kem/kyber1024-90s/avx2/consts.h index f5755661..61371483 100644 --- a/crypto_kem/kyber1024-90s/avx2/consts.h +++ b/crypto_kem/kyber1024-90s/avx2/consts.h @@ -1,19 +1,10 @@ #ifndef PQCLEAN_KYBER102490S_AVX2_CONSTS_H #define PQCLEAN_KYBER102490S_AVX2_CONSTS_H +#include "align.h" #include "cdecl.h" -#include "params.h" -#include -#include -#define ALIGNED_UINT16_T(N) \ - union { \ - __m256i as_vec; \ - uint16_t as_arr[(N)]; \ - } - -typedef ALIGNED_UINT16_T(928) qdata_t; - +typedef ALIGNED_INT16(640) qdata_t; extern const 
qdata_t PQCLEAN_KYBER102490S_AVX2_qdata; #endif diff --git a/crypto_kem/kyber1024-90s/avx2/fq.S b/crypto_kem/kyber1024-90s/avx2/fq.S index c436df31..1374c5a5 100644 --- a/crypto_kem/kyber1024-90s/avx2/fq.S +++ b/crypto_kem/kyber1024-90s/avx2/fq.S @@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2,10 -red16 3,11 -red16 4,12 -red16 5,13 -red16 6,14 -red16 7,15 -red16 8,10 -red16 9,11 +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 #store vmovdqa %ymm2,(%rdi) @@ -46,49 +46,6 @@ add $256,%rdi call reduce128_avx ret -csubq128_avx: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1,9 -csubq 2,10 -csubq 3,11 -csubq 4,12 -csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx) -.global _cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx) -cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx): -_cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx): -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -call csubq128_avx -add $256,%rdi -call csubq128_avx -ret - tomont128_avx: #load vmovdqa (%rdi),%ymm3 diff --git a/crypto_kem/kyber1024-90s/avx2/fq.inc b/crypto_kem/kyber1024-90s/avx2/fq.inc index 75df098a..4b7afc31 100644 --- a/crypto_kem/kyber1024-90s/avx2/fq.inc +++ b/crypto_kem/kyber1024-90s/avx2/fq.inc @@ -1,6 +1,10 @@ -.macro red16 r,x=12 +.macro red16 r,rs=0,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else vpsraw $10,%ymm\x,%ymm\x +.endif vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm @@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw 
%ymm\x,%ymm\r,%ymm\r -#vpcmpgtw %ymm0,%ymm\r,%ymm\x -#vpand %ymm0,%ymm\x,%ymm\x -#vpsubw %ymm\x,%ymm\r,%ymm\r .endm .macro caddq r,x=12 diff --git a/crypto_kem/kyber1024-90s/avx2/indcpa.c b/crypto_kem/kyber1024-90s/avx2/indcpa.c index ae5e7ed2..f1367a1d 100644 --- a/crypto_kem/kyber1024-90s/avx2/indcpa.c +++ b/crypto_kem/kyber1024-90s/avx2/indcpa.c @@ -8,6 +8,7 @@ #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include #include #include @@ -15,11 +16,14 @@ * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* serialized vector of polynomials pk -* and the public seed used to generate the matrix A. +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(). * -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, /************************************************* * Name: pack_sk * -* Description: Serialize the secret key +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(). 
* -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials -* (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(sk, packedsk); } @@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, * * Description: Serialize the ciphertext as concatenation of the * compressed and serialized vector of polynomials b -* and the compressed and serialized polynomial v +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce() and PQCLEAN_KYBER102490S_AVX2_poly_reduce(), respectively. 
* * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER102490S_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER102490S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER102490S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 
16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -165,12 +169,11 @@ static unsigned int rej_uniform(int16_t *r, * - const uint8_t *seed: pointer to input seed * - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) -void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; - ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; +void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint64_t nonce = 0; + ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES) buf; aes256ctr_ctx state; PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, 0); @@ -178,19 +181,24 @@ void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S for (i = 0; i < KYBER_K; i++) 
{ for (j = 0; j < KYBER_K; j++) { if (transposed) { - nonce.orig = (j << 8) | i; + nonce = (j << 8) | i; } else { - nonce.orig = (i << 8) | j; + nonce = (i << 8) | j; } - state.n = _mm_loadl_epi64(&nonce.vec); - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); - ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); + buflen = REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES; + ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.coeffs); while (ctr < KYBER_N) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf.coeffs[k] = buf.coeffs[buflen - off + k]; + } + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs + off, 1, &state); + buflen = off + AES256CTR_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.coeffs, buflen); } PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(&a[i].vec[j]); @@ -212,39 +220,41 @@ void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; - const uint8_t *publicseed = buf.arr; - const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + uint8_t buf[2 * KYBER_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf.arr, KYBER_SYMBYTES); - hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); + hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; 
+#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ + uint64_t nonce = 0; + ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) coins; // +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 aes256ctr_ctx state; - ALIGN32_ARRAY(uint8_t, 128) coins; - PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce++); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER102490S_AVX2_cbd(&skpv.vec[i], coins.arr); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&skpv.vec[i], coins.vec); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER102490S_AVX2_cbd(&e.vec[i], coins.arr); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&e.vec[i], coins.vec); } PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&skpv); + PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&skpv); PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&e); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER102490S_AVX2_poly_tomont(&pkpv.vec[i]); } @@ -261,70 +271,70 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + uint8_t seed[KYBER_SYMBYTES]; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; - unpack_pk(&pkpv, seed.arr, pk); + unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER102490S_AVX2_poly_frommsg(&k, m); - gen_at(at, seed.arr); + gen_at(at, seed); - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; +#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ +#define CIPHERTEXTNOISE_NBLOCKS ((KYBER_ETA2*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ + uint64_t nonce = 0; + ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) buf; /* +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 */ aes256ctr_ctx state; - ALIGN32_ARRAY(uint8_t, 128) buf; - PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); 
+ PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce++); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER102490S_AVX2_cbd(&sp.vec[i], buf.arr); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&sp.vec[i], buf.vec); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER102490S_AVX2_cbd(&ep.vec[i], buf.arr); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(&ep.vec[i], buf.vec); } - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf.arr); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(&epp, buf.vec); PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } + PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - - PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&b); PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&v); - PQCLEAN_KYBER102490S_AVX2_polyvec_add(&bp, &bp, &ep); + 
PQCLEAN_KYBER102490S_AVX2_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &epp); PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &k); - PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&bp); + PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&b); PQCLEAN_KYBER102490S_AVX2_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -333,24 +343,24 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&b); + PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER102490S_AVX2_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber1024-90s/avx2/invntt.S b/crypto_kem/kyber1024-90s/avx2/invntt.S index 959d4987..ecd7ed63 100644 --- a/crypto_kem/kyber1024-90s/avx2/invntt.S +++ b/crypto_kem/kyber1024-90s/avx2/invntt.S @@ -2,22 +2,21 @@ 
.include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm15,%ymm\rh3 vpmulhw %ymm\zh0,%ymm12,%ymm12 @@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 vpmulhw %ymm\zh1,%ymm14,%ymm14 vpmulhw %ymm\zh1,%ymm15,%ymm15 -#reduce vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.text -invntt_levels0t5_avx: -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 -butterfly 4,5,8,9,6,7,10,11,15,3,1,2 +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 
2,3,7 -level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 -butterfly 4,5,6,7,8,9,10,11,3,3,2,2 +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 +vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 shuffle1 4,5,3,5 shuffle1 6,7,4,7 shuffle1 8,9,6,9 shuffle1 10,11,8,11 -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 -#consts -vmovdqa _16XV*2(%rdx),%ymm1 - -butterfly 3,4,6,8,5,7,9,11,10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 +vmovdqa _16XV*2(%rsi),%ymm1 red16 3 shuffle2 3,4,10,4 @@ -87,26 +110,22 @@ shuffle2 6,8,3,8 shuffle2 5,7,6,7 shuffle2 9,11,5,11 -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 -butterfly 10,3,6,5,4,8,7,11,9,9,2,2 - -red16 10 +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 shuffle4 10,3,9,3 shuffle4 6,5,10,5 shuffle4 4,8,6,8 shuffle4 7,11,4,11 -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 
+/* level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 -butterfly 9,10,6,4,3,5,8,11,7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,2,2,7,7 red16 9 @@ -115,113 +134,62 @@ shuffle8 6,4,9,4 shuffle8 3,5,6,5 shuffle8 8,11,3,11 -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 +/* level5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 -butterfly 7,9,6,3,10,4,5,11,8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 -red16 7 +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 -ret - -invntt_level6_avx: -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 butterfly 4,5,6,7,8,9,10,11 -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 +.if \off == 0 +red16 4 +.endif -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa 
%ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa %ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm +.text .global cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx) .global _cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx) cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx): _cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_INV_EXP*2,%rsi -call invntt_levels0t5_avx -add $256,%rdi -add $392,%rsi -call invntt_levels0t5_avx -sub $256,%rdi -add $392,%rsi -call invntt_level6_avx + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 ret diff --git a/crypto_kem/kyber1024-90s/avx2/kem.c b/crypto_kem/kyber1024-90s/avx2/kem.c index 4b6ee88f..eaaf4878 100644 --- a/crypto_kem/kyber1024-90s/avx2/kem.c +++ b/crypto_kem/kyber1024-90s/avx2/kem.c @@ -1,4 +1,3 @@ -#include "align.h" #include "indcpa.h" #include "kem.h" #include "params.h" @@ -15,13 
+14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -40,36 +40,36 @@ int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned cha * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - 
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t kr[2 * KYBER_SYMBYTES]; - randombytes(buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); /* Don't release system RNG output */ - hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); + hash_h(buf, buf, KYBER_SYMBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } @@ -80,47 +80,47 @@ int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; + uint8_t kr[2 * KYBER_SYMBYTES]; + ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf.arr, ct, sk); + PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf, ct, sk); /* Multitarget countermeasure for coins + contributory KEM */ for (i = 0; i < KYBER_SYMBYTES; i++) { - buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); - fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); + fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* Overwrite pre-k with z on re-encryption failure */ - PQCLEAN_KYBER102490S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); + PQCLEAN_KYBER102490S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* hash concatenation of pre-k and H(c) 
to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber1024-90s/avx2/ntt.S b/crypto_kem/kyber1024-90s/avx2/ntt.S index 841aa9f4..a7abc79d 100644 --- a/crypto_kem/kyber1024-90s/avx2/ntt.S +++ b/crypto_kem/kyber1024-90s/avx2/ntt.S @@ -1,222 +1,191 @@ #include "cdecl.h" .include "shuffle.inc" -.include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 -#mul +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + vpmullw %ymm\zl1,%ymm\rh2,%ymm14 vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 .endm -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce +.macro reduce vpmulhw %ymm0,%ymm12,%ymm12 vpmulhw %ymm0,%ymm13,%ymm13 + vpmulhw %ymm0,%ymm14,%ymm14 vpmulhw %ymm0,%ymm15,%ymm15 +.endm -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 -#update +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 
16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -ntt_level0_avx: -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -ntt_levels1t6_avx: -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11,3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11,7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11,9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11,10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 
-vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11,6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 - -vmovdqa _16XV*2(%rdx),%ymm1 -red16 10,12 -red16 5,13 -red16 9,14 -red16 4,15 -red16 8,2 -red16 3,6 -red16 7,12 -red16 11,13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx) .global _cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx) cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx): _cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_EXP*2,%rsi -call ntt_level0_avx -add $128,%rdi -call ntt_level0_avx -sub $128,%rdi -add $8,%rsi -call ntt_levels1t6_avx -add $256,%rdi -add $392,%rsi -call ntt_levels1t6_avx + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + ret diff --git a/crypto_kem/kyber1024-90s/avx2/ntt.h b/crypto_kem/kyber1024-90s/avx2/ntt.h index db21cece..e27fb481 100644 --- a/crypto_kem/kyber1024-90s/avx2/ntt.h +++ b/crypto_kem/kyber1024-90s/avx2/ntt.h @@ -1,24 +1,21 @@ #ifndef PQCLEAN_KYBER102490S_AVX2_NTT_H #define PQCLEAN_KYBER102490S_AVX2_NTT_H -#include "consts.h" + +#include #include -void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); -void PQCLEAN_KYBER102490S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); -void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); -void 
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); -void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); -void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_basemul_avx(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); -void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); -void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/params.h b/crypto_kem/kyber1024-90s/avx2/params.h index a1ba0077..8de2c883 100644 --- a/crypto_kem/kyber1024-90s/avx2/params.h +++ b/crypto_kem/kyber1024-90s/avx2/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,9 +14,12 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 160 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) +#define KYBER_ETA2 2 + #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define 
KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) diff --git a/crypto_kem/kyber1024-90s/avx2/poly.c b/crypto_kem/kyber1024-90s/avx2/poly.c index a614ed25..af76d233 100644 --- a/crypto_kem/kyber1024-90s/avx2/poly.c +++ b/crypto_kem/kyber1024-90s/avx2/poly.c @@ -12,76 +12,99 @@ /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_poly_compress * -* Description: Compression and subsequent serialization of a polynomial +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { - unsigned int i, j; - uint8_t t[8]; +void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[160], const poly *restrict a) { + size_t i; + uint32_t low; + __m256i f0, f1; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XV / 16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 10); + const __m256i mask = _mm256_set1_epi16(31); + const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1); + const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9, + -1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0); - PQCLEAN_KYBER102490S_AVX2_poly_csubq(a); - - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) 
/ KYBER_Q) & 31; - } - - r[0] = (t[0] >> 0) | (t[1] << 5); - r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); - r[2] = (t[3] >> 1) | (t[4] << 4); - r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); - r[4] = (t[6] >> 2) | (t[7] << 3); - r += 5; + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_mulhi_epi16(f0, v); + f1 = _mm256_mulhi_epi16(f1, v); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f1 = _mm256_mulhrs_epi16(f1, shift1); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_maddubs_epi16(f0, shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 + f0 = _mm256_madd_epi16(f0, shift3); // a0 a1 b0 b1 a2 a3 b2 b3 + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f0 = _mm256_srlv_epi64(f0, sllvdidx); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); + _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); + _mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); + r[20 * i + 16] = (uint8_t)low; + r[20 * i + 17] = (uint8_t)(low >> 0x08); + r[20 * i + 18] = (uint8_t)(low >> 0x10); + r[20 * i + 19] = (uint8_t)(low >> 0x18); } } -/************************************************* -* Name: PQCLEAN_KYBER102490S_AVX2_poly_decompress -* -* Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of PQCLEAN_KYBER102490S_AVX2_poly_compress -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array -* (of length KYBER_POLYCOMPRESSEDBYTES bytes) -**************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, - const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, const uint8_t a[160]) { unsigned int i; + 
int16_t h; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, + 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0); + const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31, + 248, 1984, 62, 496, 3968, 124, 992, 31); + const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024, + 128, 16, 512, 64, 8, 256, 32, 1024); - unsigned int j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 5) | (a[1] << 3); - t[2] = (a[1] >> 2); - t[3] = (a[1] >> 7) | (a[2] << 1); - t[4] = (a[2] >> 4) | (a[3] << 4); - t[5] = (a[3] >> 1); - t[6] = (a[3] >> 6) | (a[4] << 2); - t[7] = (a[4] >> 3); - a += 5; - - for (j = 0; j < 8; j++) { - r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; - } + for (i = 0; i < KYBER_N / 16; i++) { + t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]); + h = (a[10 * i + 9] << 8) + a[10 * i + 8]; + t = _mm_insert_epi16(t, h, 4); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_and_si256(f, mask); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); } } + /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_poly_tobytes * -* Description: Serialization of a polynomial +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). The coefficients are orderd as output by +* PQCLEAN_KYBER102490S_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed +* order. 
* * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { - PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* @@ -90,12 +113,12 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER102490S_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { - PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* @@ -103,11 +126,10 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POL * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, - const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { +void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; const __m256i shift = 
_mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); @@ -136,12 +158,12 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + _mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ + _mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ + _mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ + _mm256_store_si256(&r->vec[8+2*(i)+1],g3) - f = _mm256_load_si256((__m256i *)msg); + f = _mm256_loadu_si256((__m256i *)msg); FROMMSG64(0); FROMMSG64(1); FROMMSG64(2); @@ -151,32 +173,34 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_poly_tomsg * -* Description: Convert polynomial to 32-byte message +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). 
* * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { unsigned int i; uint32_t small; __m256i f0, f1, g0, g1; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); for (i = 0; i < KYBER_N / 32; i++) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); - f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); - f0 = _mm256_sub_epi16(hqs, f0); - f1 = _mm256_sub_epi16(hqs, f1); + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); g0 = _mm256_srai_epi16(f0, 15); g1 = _mm256_srai_epi16(f1, 15); f0 = _mm256_xor_si256(f0, g0); f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(hhqs, f0); - f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); f0 = _mm256_packs_epi16(f0, f1); small = _mm256_movemask_epi8(f0); - small = ~small; msg[4 * i + 0] = small; msg[4 * i + 1] = small >> 16; msg[4 * i + 2] = small >> 8; @@ -185,21 +209,39 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], po } /************************************************* -* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise +* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: 
pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; - prf(buf.arr, sizeof(buf.arr), seed, nonce); - PQCLEAN_KYBER102490S_AVX2_cbd(r, buf.arr); +void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 + prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(r, buf.vec); +} + +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; + prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(r, buf.vec); } @@ -207,13 +249,17 @@ void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_S * Name: PQCLEAN_KYBER102490S_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of -* a polynomial in place; -* inputs assumed to be in normal order, output in bitreversed order +* a polynomial in place. 
+* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. * -* Arguments: - uint16_t *r: pointer to in/output polynomial +* Arguments: - poly *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* @@ -221,29 +267,35 @@ void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) { * * Description: Computes inverse of negacyclic number-theoretic transform (NTT) * of a polynomial in place; -* inputs assumed to be in bitreversed order, output in normal order +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. 
* -* Arguments: - uint16_t *a: pointer to in/output polynomial +* Arguments: - poly *a: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r) { - PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery * -* Description: Multiplication of two polynomials in NTT domain +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* @@ -255,7 +307,7 @@ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, c * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) { - PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* @@ -267,28 +319,16 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) { * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); -} - -/************************************************* -* Name: PQCLEAN_KYBER102490S_AVX2_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); + PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials. No modular reduction +* is performed. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -296,20 +336,21 @@ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials. No modular reduction +* is performed. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -317,10 +358,10 @@ void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } diff --git a/crypto_kem/kyber1024-90s/avx2/poly.h b/crypto_kem/kyber1024-90s/avx2/poly.h index 5b7a29cc..ed37fc21 100644 --- a/crypto_kem/kyber1024-90s/avx2/poly.h +++ b/crypto_kem/kyber1024-90s/avx2/poly.h @@ -1,19 +1,13 @@ #ifndef PQCLEAN_KYBER102490S_AVX2_POLY_H #define PQCLEAN_KYBER102490S_AVX2_POLY_H +#include "align.h" #include "params.h" #include #include -/* - * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial - * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] - */ -typedef union { - __m256i dummy; - int16_t coeffs[KYBER_N]; -} poly; +typedef ALIGNED_INT16(KYBER_N) poly; -void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); @@ -22,7 +16,11 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POL void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + + void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r); void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r); @@ -31,7 +29,6 @@ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, c void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r); void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r); -void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r); void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber1024-90s/avx2/polyvec.c b/crypto_kem/kyber1024-90s/avx2/polyvec.c index 960e320d..1a1e2788 100644 --- a/crypto_kem/kyber1024-90s/avx2/polyvec.c +++ b/crypto_kem/kyber1024-90s/avx2/polyvec.c @@ -3,8 +3,79 @@ #include "params.h" #include "poly.h" #include "polyvec.h" +#include #include +static void 
poly_compress11(uint8_t r[352 + 2], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XV / 16]); + const __m256i v8 = _mm256_slli_epi16(v, 3); + const __m256i off = _mm256_set1_epi16(36); + const __m256i shift1 = _mm256_set1_epi16(1 << 13); + const __m256i mask = _mm256_set1_epi16(2047); + const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(10); + const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10); + const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, + -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0, v8); + f2 = _mm256_add_epi16(f0, off); + f0 = _mm256_slli_epi16(f0, 3); + f0 = _mm256_mulhi_epi16(f0, v); + f2 = _mm256_sub_epi16(f1, f2); + f1 = _mm256_andnot_si256(f1, f2); + f1 = _mm256_srli_epi16(f1, 15); + f0 = _mm256_sub_epi16(f0, f1); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f0 = _mm256_and_si256(f0, mask); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f1 = _mm256_bsrli_epi128(f0, 8); + f0 = _mm256_srlv_epi64(f0, srlvqidx); + f1 = _mm256_slli_epi64(f1, 34); + f0 = _mm256_add_epi64(f0, f1); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); + _mm_storeu_si128((__m128i *)&r[22 * i + 0], t0); + _mm_storel_epi64((__m128i *)&r[22 * i + 16], t1); + } +} + +static void poly_decompress11(poly *restrict r, const uint8_t a[352 + 10]) { + unsigned int i; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, + 8, 7, 6, 5, 5, 4, 4, 3, 
+ 10, 9, 9, 8, 7, 6, 6, 5, + 5, 4, 3, 2, 2, 1, 1, 0); + const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0); + const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0); + const __m256i shift = _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32); + const __m256i mask = _mm256_set1_epi16(32752); + + for (i = 0; i < KYBER_N / 16; i++) { + f = _mm256_loadu_si256((__m256i *)&a[22 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_srlv_epi64(f, srlvqidx); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_srli_epi16(f, 1); + f = _mm256_and_si256(f, mask); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_polyvec_compress * @@ -14,33 +85,11 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], - polyvec *restrict a) { - size_t i, j, k; +void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { + size_t i; - PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(a); - - uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - for (k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) - / KYBER_Q) & 0x7ff; - } - - r[ 0] = (t[0] >> 0); - r[ 1] = (t[0] >> 8) | (t[1] << 3); - r[ 2] = (t[1] >> 5) | (t[2] << 6); - r[ 3] = (t[2] >> 2); - r[ 4] = (t[2] >> 10) | (t[3] << 1); - r[ 5] = (t[3] >> 7) | (t[4] << 4); - r[ 6] = (t[4] >> 4) | (t[5] << 7); - r[ 7] = (t[5] >> 1); - r[ 8] = (t[5] >> 9) | (t[6] << 2); - r[ 9] = (t[6] >> 6) | (t[7] << 5); - r[10] = (t[7] >> 3); - r += 11; - } + poly_compress11(&r[352 * i], &a->vec[i]); } } @@ -50,31 +99,15 @@ void 
PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSE * Description: De-serialize and decompress vector of polynomials; * approximate inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_compress * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *restrict r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { - size_t i, j, k; +void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { + size_t i; - uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); - t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); - t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); - t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); - t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); - t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); - t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); - t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); - a += 11; - - for (k = 0; k < 8; k++) { - r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; - } - } + poly_decompress11(&r->vec[i], &a[352 * i]); } } @@ -100,7 +133,7 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], po * Description: De-serialize vector of polynomials; * inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials * (of length KYBER_POLYVECBYTES) **************************************************/ @@ -141,29 +174,34 @@ void 
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements in a and b in NTT domain, accumulate into r, * and multiply by 2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { - PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + size_t i; + poly tmp; + + PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER102490S_AVX2_poly_add(r, r, &tmp); + } } /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) { size_t i; @@ -172,23 +210,6 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec 
*r) { } } -/************************************************* -* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) { - size_t i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER102490S_AVX2_polyvec_add * diff --git a/crypto_kem/kyber1024-90s/avx2/polyvec.h b/crypto_kem/kyber1024-90s/avx2/polyvec.h index e8bcb731..050a0fde 100644 --- a/crypto_kem/kyber1024-90s/avx2/polyvec.h +++ b/crypto_kem/kyber1024-90s/avx2/polyvec.h @@ -8,9 +8,8 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); +void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); @@ -18,12 +17,9 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYB void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void 
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber1024-90s/avx2/reduce.h b/crypto_kem/kyber1024-90s/avx2/reduce.h index 8ac905e7..9cb7f3a5 100644 --- a/crypto_kem/kyber1024-90s/avx2/reduce.h +++ b/crypto_kem/kyber1024-90s/avx2/reduce.h @@ -1,10 +1,9 @@ #ifndef PQCLEAN_KYBER102490S_AVX2_REDUCE_H #define PQCLEAN_KYBER102490S_AVX2_REDUCE_H -#include "consts.h" -#include +#include "params.h" +#include -int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); -int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); -int16_t PQCLEAN_KYBER102490S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); +void PQCLEAN_KYBER102490S_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/rejsample.c b/crypto_kem/kyber1024-90s/avx2/rejsample.c index a8a8fbd5..473473f8 100644 --- a/crypto_kem/kyber1024-90s/avx2/rejsample.c +++ b/crypto_kem/kyber1024-90s/avx2/rejsample.c @@ -4,311 +4,68 @@ #include "rejsample.h" #include #include +#include + +//#define BMI -static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { - {-1, -1, -1, -1, -1, -1, -1, -1}, - { 0, -1, -1, -1, -1, -1, -1, -1}, - { 2, -1, -1, -1, -1, -1, -1, -1}, - { 0, 2, -1, -1, -1, -1, -1, -1}, - { 4, -1, -1, -1, -1, -1, -1, -1}, - { 0, 4, -1, -1, -1, -1, -1, -1}, - { 2, 4, -1, -1, -1, -1, -1, -1}, - { 0, 2, 4, -1, -1, -1, -1, -1}, - { 6, -1, -1, -1, -1, -1, -1, -1}, - { 0, 6, -1, -1, -1, -1, -1, -1}, - { 2, 6, -1, -1, -1, -1, -1, -1}, - { 
0, 2, 6, -1, -1, -1, -1, -1}, - { 4, 6, -1, -1, -1, -1, -1, -1}, - { 0, 4, 6, -1, -1, -1, -1, -1}, - { 2, 4, 6, -1, -1, -1, -1, -1}, - { 0, 2, 4, 6, -1, -1, -1, -1}, - { 8, -1, -1, -1, -1, -1, -1, -1}, - { 0, 8, -1, -1, -1, -1, -1, -1}, - { 2, 8, -1, -1, -1, -1, -1, -1}, - { 0, 2, 8, -1, -1, -1, -1, -1}, - { 4, 8, -1, -1, -1, -1, -1, -1}, - { 0, 4, 8, -1, -1, -1, -1, -1}, - { 2, 4, 8, -1, -1, -1, -1, -1}, - { 0, 2, 4, 8, -1, -1, -1, -1}, - { 6, 8, -1, -1, -1, -1, -1, -1}, - { 0, 6, 8, -1, -1, -1, -1, -1}, - { 2, 6, 8, -1, -1, -1, -1, -1}, - { 0, 2, 6, 8, -1, -1, -1, -1}, - { 4, 6, 8, -1, -1, -1, -1, -1}, - { 0, 4, 6, 8, -1, -1, -1, -1}, - { 2, 4, 6, 8, -1, -1, -1, -1}, - { 0, 2, 4, 6, 8, -1, -1, -1}, - {10, -1, -1, -1, -1, -1, -1, -1}, - { 0, 10, -1, -1, -1, -1, -1, -1}, - { 2, 10, -1, -1, -1, -1, -1, -1}, - { 0, 2, 10, -1, -1, -1, -1, -1}, - { 4, 10, -1, -1, -1, -1, -1, -1}, - { 0, 4, 10, -1, -1, -1, -1, -1}, - { 2, 4, 10, -1, -1, -1, -1, -1}, - { 0, 2, 4, 10, -1, -1, -1, -1}, - { 6, 10, -1, -1, -1, -1, -1, -1}, - { 0, 6, 10, -1, -1, -1, -1, -1}, - { 2, 6, 10, -1, -1, -1, -1, -1}, - { 0, 2, 6, 10, -1, -1, -1, -1}, - { 4, 6, 10, -1, -1, -1, -1, -1}, - { 0, 4, 6, 10, -1, -1, -1, -1}, - { 2, 4, 6, 10, -1, -1, -1, -1}, - { 0, 2, 4, 6, 10, -1, -1, -1}, - { 8, 10, -1, -1, -1, -1, -1, -1}, - { 0, 8, 10, -1, -1, -1, -1, -1}, - { 2, 8, 10, -1, -1, -1, -1, -1}, - { 0, 2, 8, 10, -1, -1, -1, -1}, - { 4, 8, 10, -1, -1, -1, -1, -1}, - { 0, 4, 8, 10, -1, -1, -1, -1}, - { 2, 4, 8, 10, -1, -1, -1, -1}, - { 0, 2, 4, 8, 10, -1, -1, -1}, - { 6, 8, 10, -1, -1, -1, -1, -1}, - { 0, 6, 8, 10, -1, -1, -1, -1}, - { 2, 6, 8, 10, -1, -1, -1, -1}, - { 0, 2, 6, 8, 10, -1, -1, -1}, - { 4, 6, 8, 10, -1, -1, -1, -1}, - { 0, 4, 6, 8, 10, -1, -1, -1}, - { 2, 4, 6, 8, 10, -1, -1, -1}, - { 0, 2, 4, 6, 8, 10, -1, -1}, - {12, -1, -1, -1, -1, -1, -1, -1}, - { 0, 12, -1, -1, -1, -1, -1, -1}, - { 2, 12, -1, -1, -1, -1, -1, -1}, - { 0, 2, 12, -1, -1, -1, -1, -1}, - { 4, 12, -1, -1, -1, -1, -1, -1}, - { 0, 
4, 12, -1, -1, -1, -1, -1}, - { 2, 4, 12, -1, -1, -1, -1, -1}, - { 0, 2, 4, 12, -1, -1, -1, -1}, - { 6, 12, -1, -1, -1, -1, -1, -1}, - { 0, 6, 12, -1, -1, -1, -1, -1}, - { 2, 6, 12, -1, -1, -1, -1, -1}, - { 0, 2, 6, 12, -1, -1, -1, -1}, - { 4, 6, 12, -1, -1, -1, -1, -1}, - { 0, 4, 6, 12, -1, -1, -1, -1}, - { 2, 4, 6, 12, -1, -1, -1, -1}, - { 0, 2, 4, 6, 12, -1, -1, -1}, - { 8, 12, -1, -1, -1, -1, -1, -1}, - { 0, 8, 12, -1, -1, -1, -1, -1}, - { 2, 8, 12, -1, -1, -1, -1, -1}, - { 0, 2, 8, 12, -1, -1, -1, -1}, - { 4, 8, 12, -1, -1, -1, -1, -1}, - { 0, 4, 8, 12, -1, -1, -1, -1}, - { 2, 4, 8, 12, -1, -1, -1, -1}, - { 0, 2, 4, 8, 12, -1, -1, -1}, - { 6, 8, 12, -1, -1, -1, -1, -1}, - { 0, 6, 8, 12, -1, -1, -1, -1}, - { 2, 6, 8, 12, -1, -1, -1, -1}, - { 0, 2, 6, 8, 12, -1, -1, -1}, - { 4, 6, 8, 12, -1, -1, -1, -1}, - { 0, 4, 6, 8, 12, -1, -1, -1}, - { 2, 4, 6, 8, 12, -1, -1, -1}, - { 0, 2, 4, 6, 8, 12, -1, -1}, - {10, 12, -1, -1, -1, -1, -1, -1}, - { 0, 10, 12, -1, -1, -1, -1, -1}, - { 2, 10, 12, -1, -1, -1, -1, -1}, - { 0, 2, 10, 12, -1, -1, -1, -1}, - { 4, 10, 12, -1, -1, -1, -1, -1}, - { 0, 4, 10, 12, -1, -1, -1, -1}, - { 2, 4, 10, 12, -1, -1, -1, -1}, - { 0, 2, 4, 10, 12, -1, -1, -1}, - { 6, 10, 12, -1, -1, -1, -1, -1}, - { 0, 6, 10, 12, -1, -1, -1, -1}, - { 2, 6, 10, 12, -1, -1, -1, -1}, - { 0, 2, 6, 10, 12, -1, -1, -1}, - { 4, 6, 10, 12, -1, -1, -1, -1}, - { 0, 4, 6, 10, 12, -1, -1, -1}, - { 2, 4, 6, 10, 12, -1, -1, -1}, - { 0, 2, 4, 6, 10, 12, -1, -1}, - { 8, 10, 12, -1, -1, -1, -1, -1}, - { 0, 8, 10, 12, -1, -1, -1, -1}, - { 2, 8, 10, 12, -1, -1, -1, -1}, - { 0, 2, 8, 10, 12, -1, -1, -1}, - { 4, 8, 10, 12, -1, -1, -1, -1}, - { 0, 4, 8, 10, 12, -1, -1, -1}, - { 2, 4, 8, 10, 12, -1, -1, -1}, - { 0, 2, 4, 8, 10, 12, -1, -1}, - { 6, 8, 10, 12, -1, -1, -1, -1}, - { 0, 6, 8, 10, 12, -1, -1, -1}, - { 2, 6, 8, 10, 12, -1, -1, -1}, - { 0, 2, 6, 8, 10, 12, -1, -1}, - { 4, 6, 8, 10, 12, -1, -1, -1}, - { 0, 4, 6, 8, 10, 12, -1, -1}, - { 2, 4, 6, 8, 10, 12, -1, -1}, - { 0, 2, 
4, 6, 8, 10, 12, -1}, - {14, -1, -1, -1, -1, -1, -1, -1}, - { 0, 14, -1, -1, -1, -1, -1, -1}, - { 2, 14, -1, -1, -1, -1, -1, -1}, - { 0, 2, 14, -1, -1, -1, -1, -1}, - { 4, 14, -1, -1, -1, -1, -1, -1}, - { 0, 4, 14, -1, -1, -1, -1, -1}, - { 2, 4, 14, -1, -1, -1, -1, -1}, - { 0, 2, 4, 14, -1, -1, -1, -1}, - { 6, 14, -1, -1, -1, -1, -1, -1}, - { 0, 6, 14, -1, -1, -1, -1, -1}, - { 2, 6, 14, -1, -1, -1, -1, -1}, - { 0, 2, 6, 14, -1, -1, -1, -1}, - { 4, 6, 14, -1, -1, -1, -1, -1}, - { 0, 4, 6, 14, -1, -1, -1, -1}, - { 2, 4, 6, 14, -1, -1, -1, -1}, - { 0, 2, 4, 6, 14, -1, -1, -1}, - { 8, 14, -1, -1, -1, -1, -1, -1}, - { 0, 8, 14, -1, -1, -1, -1, -1}, - { 2, 8, 14, -1, -1, -1, -1, -1}, - { 0, 2, 8, 14, -1, -1, -1, -1}, - { 4, 8, 14, -1, -1, -1, -1, -1}, - { 0, 4, 8, 14, -1, -1, -1, -1}, - { 2, 4, 8, 14, -1, -1, -1, -1}, - { 0, 2, 4, 8, 14, -1, -1, -1}, - { 6, 8, 14, -1, -1, -1, -1, -1}, - { 0, 6, 8, 14, -1, -1, -1, -1}, - { 2, 6, 8, 14, -1, -1, -1, -1}, - { 0, 2, 6, 8, 14, -1, -1, -1}, - { 4, 6, 8, 14, -1, -1, -1, -1}, - { 0, 4, 6, 8, 14, -1, -1, -1}, - { 2, 4, 6, 8, 14, -1, -1, -1}, - { 0, 2, 4, 6, 8, 14, -1, -1}, - {10, 14, -1, -1, -1, -1, -1, -1}, - { 0, 10, 14, -1, -1, -1, -1, -1}, - { 2, 10, 14, -1, -1, -1, -1, -1}, - { 0, 2, 10, 14, -1, -1, -1, -1}, - { 4, 10, 14, -1, -1, -1, -1, -1}, - { 0, 4, 10, 14, -1, -1, -1, -1}, - { 2, 4, 10, 14, -1, -1, -1, -1}, - { 0, 2, 4, 10, 14, -1, -1, -1}, - { 6, 10, 14, -1, -1, -1, -1, -1}, - { 0, 6, 10, 14, -1, -1, -1, -1}, - { 2, 6, 10, 14, -1, -1, -1, -1}, - { 0, 2, 6, 10, 14, -1, -1, -1}, - { 4, 6, 10, 14, -1, -1, -1, -1}, - { 0, 4, 6, 10, 14, -1, -1, -1}, - { 2, 4, 6, 10, 14, -1, -1, -1}, - { 0, 2, 4, 6, 10, 14, -1, -1}, - { 8, 10, 14, -1, -1, -1, -1, -1}, - { 0, 8, 10, 14, -1, -1, -1, -1}, - { 2, 8, 10, 14, -1, -1, -1, -1}, - { 0, 2, 8, 10, 14, -1, -1, -1}, - { 4, 8, 10, 14, -1, -1, -1, -1}, - { 0, 4, 8, 10, 14, -1, -1, -1}, - { 2, 4, 8, 10, 14, -1, -1, -1}, - { 0, 2, 4, 8, 10, 14, -1, -1}, - { 6, 8, 10, 14, -1, -1, -1, -1}, - { 
0, 6, 8, 10, 14, -1, -1, -1}, - { 2, 6, 8, 10, 14, -1, -1, -1}, - { 0, 2, 6, 8, 10, 14, -1, -1}, - { 4, 6, 8, 10, 14, -1, -1, -1}, - { 0, 4, 6, 8, 10, 14, -1, -1}, - { 2, 4, 6, 8, 10, 14, -1, -1}, - { 0, 2, 4, 6, 8, 10, 14, -1}, - {12, 14, -1, -1, -1, -1, -1, -1}, - { 0, 12, 14, -1, -1, -1, -1, -1}, - { 2, 12, 14, -1, -1, -1, -1, -1}, - { 0, 2, 12, 14, -1, -1, -1, -1}, - { 4, 12, 14, -1, -1, -1, -1, -1}, - { 0, 4, 12, 14, -1, -1, -1, -1}, - { 2, 4, 12, 14, -1, -1, -1, -1}, - { 0, 2, 4, 12, 14, -1, -1, -1}, - { 6, 12, 14, -1, -1, -1, -1, -1}, - { 0, 6, 12, 14, -1, -1, -1, -1}, - { 2, 6, 12, 14, -1, -1, -1, -1}, - { 0, 2, 6, 12, 14, -1, -1, -1}, - { 4, 6, 12, 14, -1, -1, -1, -1}, - { 0, 4, 6, 12, 14, -1, -1, -1}, - { 2, 4, 6, 12, 14, -1, -1, -1}, - { 0, 2, 4, 6, 12, 14, -1, -1}, - { 8, 12, 14, -1, -1, -1, -1, -1}, - { 0, 8, 12, 14, -1, -1, -1, -1}, - { 2, 8, 12, 14, -1, -1, -1, -1}, - { 0, 2, 8, 12, 14, -1, -1, -1}, - { 4, 8, 12, 14, -1, -1, -1, -1}, - { 0, 4, 8, 12, 14, -1, -1, -1}, - { 2, 4, 8, 12, 14, -1, -1, -1}, - { 0, 2, 4, 8, 12, 14, -1, -1}, - { 6, 8, 12, 14, -1, -1, -1, -1}, - { 0, 6, 8, 12, 14, -1, -1, -1}, - { 2, 6, 8, 12, 14, -1, -1, -1}, - { 0, 2, 6, 8, 12, 14, -1, -1}, - { 4, 6, 8, 12, 14, -1, -1, -1}, - { 0, 4, 6, 8, 12, 14, -1, -1}, - { 2, 4, 6, 8, 12, 14, -1, -1}, - { 0, 2, 4, 6, 8, 12, 14, -1}, - {10, 12, 14, -1, -1, -1, -1, -1}, - { 0, 10, 12, 14, -1, -1, -1, -1}, - { 2, 10, 12, 14, -1, -1, -1, -1}, - { 0, 2, 10, 12, 14, -1, -1, -1}, - { 4, 10, 12, 14, -1, -1, -1, -1}, - { 0, 4, 10, 12, 14, -1, -1, -1}, - { 2, 4, 10, 12, 14, -1, -1, -1}, - { 0, 2, 4, 10, 12, 14, -1, -1}, - { 6, 10, 12, 14, -1, -1, -1, -1}, - { 0, 6, 10, 12, 14, -1, -1, -1}, - { 2, 6, 10, 12, 14, -1, -1, -1}, - { 0, 2, 6, 10, 12, 14, -1, -1}, - { 4, 6, 10, 12, 14, -1, -1, -1}, - { 0, 4, 6, 10, 12, 14, -1, -1}, - { 2, 4, 6, 10, 12, 14, -1, -1}, - { 0, 2, 4, 6, 10, 12, 14, -1}, - { 8, 10, 12, 14, -1, -1, -1, -1}, - { 0, 8, 10, 12, 14, -1, -1, -1}, - { 2, 8, 10, 12, 14, -1, -1, -1}, - 
{ 0, 2, 8, 10, 12, 14, -1, -1}, - { 4, 8, 10, 12, 14, -1, -1, -1}, - { 0, 4, 8, 10, 12, 14, -1, -1}, - { 2, 4, 8, 10, 12, 14, -1, -1}, - { 0, 2, 4, 8, 10, 12, 14, -1}, - { 6, 8, 10, 12, 14, -1, -1, -1}, - { 0, 6, 8, 10, 12, 14, -1, -1}, - { 2, 6, 8, 10, 12, 14, -1, -1}, - { 0, 2, 6, 8, 10, 12, 14, -1}, - { 4, 6, 8, 10, 12, 14, -1, -1}, - { 0, 4, 6, 8, 10, 12, 14, -1}, - { 2, 4, 6, 8, 10, 12, 14, -1}, - { 0, 2, 4, 6, 8, 10, 12, 14} - } -}; #define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -#define REJ_UNIFORM_BUFLEN 576 -unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, - const uint8_t *restrict buf) { +unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; uint32_t good; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); + uint64_t idx0, idx1, idx2, idx3; + const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XQ]); - const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XV]); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11, 10, 10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); __m256i f0, f1, g0, g1, g2, g3; __m128i f, t, pilo, pihi; - ctr = 0; - for (pos = 0; pos < 2 * KYBER_N; pos += 64) { - f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); - f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); + ctr = pos = 0; + while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { + f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); + f0 = 
_mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; - g0 = _mm256_cmpge_epu16(bound, f0); - g1 = _mm256_cmpge_epu16(bound, f1); + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = _mm256_cmpgt_epi16(bound, f1); g0 = _mm256_packs_epi16(g0, g1); good = _mm256_movemask_epi8(g0); - g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); - g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); - g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); - g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); - //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); - //g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); - - /* Barrett reduction of (still unsigned) values */ - g2 = _mm256_mulhi_epu16(f0, v); - g3 = _mm256_mulhi_epu16(f1, v); - g2 = _mm256_srli_epi16(g2, 10); - g3 = _mm256_srli_epi16(g3, 10); - g2 = _mm256_mullo_epi16(g2, kyberq); - g3 = _mm256_mullo_epi16(g3, kyberq); - f0 = _mm256_sub_epi16(f0, g2); - f1 = _mm256_sub_epi16(f1, g3); + g0 = 
_mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); g2 = _mm256_add_epi8(g0, ones); g3 = _mm256_add_epi8(g1, ones); @@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { - f = _mm_load_si128((__m128i *)&buf[pos]); - t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { + f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); good = _mm_movemask_epi8(t); - good = _pext_u32(good, 0x5555); - pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); + + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - - /* Barrett reduction */ - t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); - t = _mm_srli_epi16(t, 10); - t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); - f = _mm_sub_epi16(f, t); - f = _mm_shuffle_epi8(f, pilo); _mm_storeu_si128((__m128i *)&r[ctr], f); ctr += _mm_popcnt_u32(good); - pos += 16; } - while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); + pos += 3; 
- if (val < 19 * KYBER_Q) { - val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (val1 < KYBER_Q && ctr < KYBER_N) { + r[ctr++] = val1; } } diff --git a/crypto_kem/kyber1024-90s/avx2/rejsample.h b/crypto_kem/kyber1024-90s/avx2/rejsample.h index bbf8f8a7..3283fc1b 100644 --- a/crypto_kem/kyber1024-90s/avx2/rejsample.h +++ b/crypto_kem/kyber1024-90s/avx2/rejsample.h @@ -1,9 +1,12 @@ #ifndef PQCLEAN_KYBER102490S_AVX2_REJSAMPLE_H #define PQCLEAN_KYBER102490S_AVX2_REJSAMPLE_H #include "params.h" +#include "symmetric.h" #include -unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, - const unsigned char *buf); +#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) + +unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/shuffle.S b/crypto_kem/kyber1024-90s/avx2/shuffle.S index d994de45..0667999f 100644 --- a/crypto_kem/kyber1024-90s/avx2/shuffle.S +++ b/crypto_kem/kyber1024-90s/avx2/shuffle.S @@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 #csubq csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,1 +csubq 6,13 +csubq 7,13 +csubq 8,13 csubq 9,13 -csubq 10,14 -csubq 11,15 -csubq 12,1 +csubq 10,13 +csubq 11,13 +csubq 12,13 #bitpack vpsllw $12,%ymm6,%ymm4 diff --git a/crypto_kem/kyber1024-90s/avx2/shuffle.inc b/crypto_kem/kyber1024-90s/avx2/shuffle.inc index d4b092bc..73e9ffe0 100644 --- a/crypto_kem/kyber1024-90s/avx2/shuffle.inc +++ b/crypto_kem/kyber1024-90s/avx2/shuffle.inc @@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 
+vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_kem/kyber1024-90s/avx2/symmetric.h b/crypto_kem/kyber1024-90s/avx2/symmetric.h index 00a7d655..e22e10b2 100644 --- a/crypto_kem/kyber1024-90s/avx2/symmetric.h +++ b/crypto_kem/kyber1024-90s/avx2/symmetric.h @@ -14,12 +14,10 @@ typedef aes256ctr_ctx xof_state; #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, SEED, X, Y) \ - PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) diff --git a/crypto_kem/kyber1024-90s/avx2/verify.c b/crypto_kem/kyber1024-90s/avx2/verify.c index 9d4f4feb..37b553f4 100644 --- a/crypto_kem/kyber1024-90s/avx2/verify.c +++ b/crypto_kem/kyber1024-90s/avx2/verify.c @@ -8,31 +8,31 @@ * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const unsigned char *a: pointer to first byte array -* const unsigned char *b: pointer to second byte array -* size_t len: length of the byte arrays +* Arguments: const uint8_t *a: pointer to first byte array +* const uint8_t *b: pointer to second byte array +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; + size_t i; uint64_t r; - __m256i avec, bvec, cvec; + __m256i f, g, h; - cvec = _mm256_setzero_si256(); - for (pos = 0; pos + 32 <= len; pos += 32) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - avec = _mm256_xor_si256(avec, bvec); - cvec = _mm256_or_si256(cvec, avec); + h = _mm256_setzero_si256(); + for (i = 0; i < len / 32; i++) { + f = _mm256_loadu_si256((__m256i *)&a[32 * i]); + g = _mm256_loadu_si256((__m256i *)&b[32 * i]); + f = _mm256_xor_si256(f, g); + h = _mm256_or_si256(h, f); } - r = 1 - _mm256_testz_si256(cvec, cvec); + r = 1 - _mm256_testz_si256(h, h); - if (pos < len) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - cvec = _mm256_cmpeq_epi8(avec, bvec); - r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); + a += 32 * i; + b += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; } r = (-r) >> 63; @@ -47,29 +47,27 @@ int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: unsigned char *r: pointer to output byte array +* Arguments: unsigned char *r: pointer to output byte array * const unsigned char *x: pointer to input byte array -* size_t len: Amount of bytes to be copied -* unsigned char b: Condition bit; has to be in {0,1} +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; __m256i xvec, rvec, bvec; - b = -b; - bvec = _mm256_set1_epi8(b); - - for (pos = 0; pos + 32 <= len; pos += 32) { - rvec = _mm256_loadu_si256((__m256i *)&r[pos]); - xvec = _mm256_loadu_si256((__m256i *)&x[pos]); - xvec = _mm256_xor_si256(xvec, rvec); - xvec = _mm256_and_si256(xvec, bvec); - rvec = _mm256_xor_si256(rvec, xvec); - _mm256_storeu_si256((__m256i *)&r[pos], rvec); + bvec = _mm256_set1_epi64x(-(uint64_t)b); + for (i = 0; i < len / 32; i++) { + rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); + xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); + rvec = _mm256_blendv_epi8(rvec, xvec, bvec); + _mm256_storeu_si256((__m256i *)&r[32 * i], rvec); } - while (pos < len) { - r[pos] ^= b & (x[pos] ^ r[pos]); - pos += 1; + r += 32 * i; + x += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r[i] ^= -b & (x[i] ^ r[i]); } } diff --git a/crypto_kem/kyber1024-90s/clean/Makefile b/crypto_kem/kyber1024-90s/clean/Makefile index 2574c694..7ae5aaf2 100644 --- a/crypto_kem/kyber1024-90s/clean/Makefile +++ b/crypto_kem/kyber1024-90s/clean/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024-90s_clean.a -HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric-aes.h symmetric.h verify.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o 
symmetric-aes.o verify.o +HEADERS=aes256ctr.h api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric.h verify.h +OBJECTS=aes256ctr.o cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake index 24fc3849..ea1df01f 100644 --- a/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber1024-90s_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj +OBJECTS=aes256ctr.obj cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber1024-90s/clean/aes256ctr.c b/crypto_kem/kyber1024-90s/clean/aes256ctr.c new file mode 100644 index 00000000..46340d6f --- /dev/null +++ b/crypto_kem/kyber1024-90s/clean/aes256ctr.c @@ -0,0 +1,564 @@ +#include "aes256ctr.h" +#include +#include +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of 
the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +static inline uint32_t br_dec32le(const uint8_t *src) { + return (uint32_t)src[0] + | ((uint32_t)src[1] << 8) + | ((uint32_t)src[2] << 16) + | ((uint32_t)src[3] << 24); +} + +static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) { + while (num-- > 0) { + *v ++ = br_dec32le(src); + src += 4; + } +} + +static inline uint32_t br_swap32(uint32_t x) { + x = ((x & (uint32_t)0x00FF00FF) << 8) + | ((x >> 8) & (uint32_t)0x00FF00FF); + return (x << 16) | (x >> 16); +} + +static inline void br_enc32le(uint8_t *dst, uint32_t x) { + dst[0] = (uint8_t)x; + dst[1] = (uint8_t)(x >> 8); + dst[2] = (uint8_t)(x >> 16); + dst[3] = (uint8_t)(x >> 24); +} + +static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) { + while (num-- > 0) { + br_enc32le(dst, *v ++); + dst += 4; + } +} + +static void br_aes_ct64_bitslice_Sbox(uint64_t *q) { + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. 
+ */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +static void br_aes_ct64_ortho(uint64_t *q) { +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \ + (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) { + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= 
(uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) { + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const uint8_t Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t sub_word(uint32_t x) { + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) { + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + int key_len = 32; + + nk = (int)(key_len >> 2); + nkf = (int)((14 + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & 
(uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } +} + +static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) { + unsigned u, v, n; + + n = (14 + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} + +static inline void add_round_key(uint64_t *q, const uint64_t *sk) { + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void shift_rows(uint64_t *q) { + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t rotr32(uint64_t x) { + return (x << 32) | (x >> 32); +} + +static inline void mix_columns(uint64_t *q) { + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 
16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +static void inc4_be(uint32_t *x) { + *x = br_swap32(*x) + 4; + *x = br_swap32(*x); +} + +static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], uint64_t sk_exp[64]) { + uint32_t w[16]; + uint64_t q[8]; + int i; + + memcpy(w, ivw, sizeof(w)); + for (i = 0; i < 4; i++) { + br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + + add_round_key(q, sk_exp); + for (i = 1; i < 14; i++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, sk_exp + (i << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, sk_exp + 112); + + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]); + } + br_range_enc32le(out, w, 16); + + /* Increase counter for next 4 blocks */ + inc4_be(ivw + 3); + inc4_be(ivw + 7); + inc4_be(ivw + 11); + inc4_be(ivw + 15); +} + +static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) { + uint64_t skey[30]; + + br_aes_ct64_keysched(skey, key); + br_aes_ct64_skey_expand(sk_exp, skey); +} + +static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) { + uint32_t ivw[16]; + size_t i; + + br_range_dec32le(ivw, 3, iv); + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + ivw[ 3] = br_swap32(cc); + ivw[ 7] = br_swap32(cc + 1); + ivw[11] = br_swap32(cc + 2); 
+ ivw[15] = br_swap32(cc + 3); + + while (len > 64) { + aes_ctr4x(data, ivw, sk_exp); + data += 64; + len -= 64; + } + if (len > 0) { + uint8_t tmp[64]; + aes_ctr4x(tmp, ivw, sk_exp); + for (i = 0; i < len; i++) { + data[i] = tmp[i]; + } + } +} + +void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) { + uint64_t sk_exp[120]; + + br_aes_ct64_ctr_init(sk_exp, key); + br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen); +} + +void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) { + br_aes_ct64_ctr_init(s->sk_exp, key); + + br_range_dec32le(s->ivw, 3, nonce); + memcpy(s->ivw + 4, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 8, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t)); + s->ivw[ 3] = br_swap32(0); + s->ivw[ 7] = br_swap32(1); + s->ivw[11] = br_swap32(2); + s->ivw[15] = br_swap32(3); +} + +void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) { + while (nblocks > 0) { + aes_ctr4x(out, s->ivw, s->sk_exp); + out += 64; + nblocks--; + } +} diff --git a/crypto_kem/kyber1024-90s/clean/aes256ctr.h b/crypto_kem/kyber1024-90s/clean/aes256ctr.h new file mode 100644 index 00000000..069b3eec --- /dev/null +++ b/crypto_kem/kyber1024-90s/clean/aes256ctr.h @@ -0,0 +1,28 @@ +#ifndef PQCLEAN_KYBER102490S_CLEAN_AES256CTR_H +#define PQCLEAN_KYBER102490S_CLEAN_AES256CTR_H + +#include +#include + +#define AES256CTR_BLOCKBYTES 64 + + +typedef struct { + uint64_t sk_exp[120]; + uint32_t ivw[16]; +} aes256ctr_ctx; + +void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state); + 
+#endif diff --git a/crypto_kem/kyber1024-90s/clean/cbd.c b/crypto_kem/kyber1024-90s/clean/cbd.c index dde70cd7..74d9c81e 100644 --- a/crypto_kem/kyber1024-90s/clean/cbd.c +++ b/crypto_kem/kyber1024-90s/clean/cbd.c @@ -5,7 +5,7 @@ /************************************************* * Name: load32_littleendian * -* Description: load bytes into a 32-bit integer +* Description: load 4 bytes into a 32-bit integer * in little-endian order * * Arguments: - const uint8_t *x: pointer to input byte array @@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { } /************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_cbd +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ + + +/************************************************* +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { +static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; int16_t a, b; @@ -48,3 +61,23 @@ void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER } } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, 
compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ + +void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber1024-90s/clean/cbd.h b/crypto_kem/kyber1024-90s/clean/cbd.h index 12f16252..bbd9ef9d 100644 --- a/crypto_kem/kyber1024-90s/clean/cbd.h +++ b/crypto_kem/kyber1024-90s/clean/cbd.h @@ -4,6 +4,8 @@ #include "poly.h" #include -void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); + +void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber1024-90s/clean/indcpa.c b/crypto_kem/kyber1024-90s/clean/indcpa.c index abc10b3c..03d5557c 100644 --- a/crypto_kem/kyber1024-90s/clean/indcpa.c +++ b/crypto_kem/kyber1024-90s/clean/indcpa.c @@ -15,8 +15,8 @@ * serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key -* polynomial vector -* - uint8_t *seed: pointer to output seed to generate -* matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ static void unpack_pk(polyvec *pk, @@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, * * Description: Serialize the secret key * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of -* polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key 
**************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, * and the compressed and serialized polynomial v * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER102490S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER102490S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run 
rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= (val >> 12) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T -* is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) // Not static for benchmarking void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -182,12 +173,17 @@ void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_ } xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); while (ctr < KYBER_N) { - xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + xof_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); } xof_ctx_release(&state); } @@ -220,10 +216,10 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY gen_a(a, publicseed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); + 
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); + PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&skpv); @@ -231,7 +227,7 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER102490S_CLEAN_poly_tomont(&pkpv.vec[i]); } @@ -248,16 +244,15 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], @@ -266,7 +261,7 @@ void 
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], unsigned int i; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; unpack_pk(&pkpv, seed, pk); @@ -274,32 +269,32 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], gen_at(at, seed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); + PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); + PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); } - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&epp, coins, nonce++); + PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&b); PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&v); - PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &epp); PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &k); - PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(&bp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(&b); PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -308,24 +303,24 @@ void 
PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&b); + PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER102490S_CLEAN_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber1024-90s/clean/kem.c b/crypto_kem/kyber1024-90s/clean/kem.c index 24b221f2..693111d2 100644 --- a/crypto_kem/kyber1024-90s/clean/kem.c +++ b/crypto_kem/kyber1024-90s/clean/kem.c @@ -14,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * 
Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,17 +40,17 @@ int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned ch * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; @@ -79,19 +80,19 @@ int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of 
KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; uint8_t buf[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber1024-90s/clean/ntt.c b/crypto_kem/kyber1024-90s/clean/ntt.c index 9cbd5523..f841f1ad 100644 --- a/crypto_kem/kyber1024-90s/clean/ntt.c +++ b/crypto_kem/kyber1024-90s/clean/ntt.c @@ -3,11 +3,11 @@ #include "reduce.h" #include -/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and PQCLEAN_KYBER102490S_CLEAN_zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 -static const uint16_t tree[128] = { +static const uint8_t tree[128] = { 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, 4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, 2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, @@ -19,51 +19,41 @@ static const uint16_t tree[128] = { }; void init_ntt() { - unsigned int i, j, k; + unsigned int i; int16_t tmp[128]; tmp[0] = MONT; - for(i = 1; i < 128; ++i) - tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); + for(i=1;i<128;i++) + tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); - for(i = 0; i < 128; ++i) + for(i=0;i<128;i++) { PQCLEAN_KYBER102490S_CLEAN_zetas[i] = tmp[tree[i]]; - - k = 0; - for(i = 64; i >= 1; i >>= 1) - for(j = i; j < 2*i; ++j) - 
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - - PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + if(PQCLEAN_KYBER102490S_CLEAN_zetas[i] > KYBER_Q/2) + PQCLEAN_KYBER102490S_CLEAN_zetas[i] -= KYBER_Q; + if(PQCLEAN_KYBER102490S_CLEAN_zetas[i] < -KYBER_Q/2) + PQCLEAN_KYBER102490S_CLEAN_zetas[i] += KYBER_Q; + } } - */ const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, - 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, - 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, - 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, - 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, - 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, - 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, - 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, - 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 -}; - -const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, - 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, - 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, - 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, - 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, - 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, - 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, - 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, - 3127, 3042, 1907, 
1836, 1517, 359, 758, 1441 -}; + -1044, -758, -359, -1517, 1493, 1422, 287, 202, + -171, 622, 1577, 182, 962, -1202, -1474, 1468, + 573, -1325, 264, 383, -829, 1458, -1602, -130, + -681, 1017, 732, 608, -1542, 411, -205, -1571, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -1103, 430, 555, 843, -1251, 871, 1550, 105, + 422, 587, 177, -235, -291, -460, 1574, 1653, + -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, + 817, 1097, 603, 610, 1322, -1285, -1465, 384, + -1215, -136, 1218, -1335, -874, 220, -1187, -1659, + -1185, -1530, -1278, 794, -1510, -854, -870, 478, + -108, -308, 996, 991, 958, -1460, 1522, 1628 + }; /************************************************* * Name: fqmul @@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { /************************************************* * Name: PQCLEAN_KYBER102490S_CLEAN_ntt * -* Description: Inplace number-theoretic transform (NTT) in Rq +* Description: Inplace number-theoretic transform (NTT) in Rq. 
* input is in standard order, output is in bitreversed order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { unsigned int len, start, j, k; @@ -96,7 +85,7 @@ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { for (len = 128; len >= 2; len >>= 1) { for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { + for (j = start; j < start + len; j++) { t = fqmul(zeta, r[j + len]); r[j + len] = r[j] - t; r[j] = r[j] + t; @@ -112,28 +101,28 @@ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { * multiplication by Montgomery factor 2^16. * Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) { unsigned int start, len, j, k; int16_t t, zeta; + const int16_t f = 1441; // mont^2/128 - k = 0; + k = 127; for (len = 2; len <= 128; len <<= 1) { for (start = 0; start < 256; start = j + len) { - zeta = PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { + zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k--]; + for (j = start; j < start + len; j++) { t = r[j]; r[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + r[j + len]); - r[j + len] = t - r[j + len]; + r[j + len] = r[j + len] - t; r[j + len] = fqmul(zeta, r[j + len]); } } } - for (j = 0; j < 256; ++j) { - r[j] = fqmul(r[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]); + for (j = 0; j < 256; j++) { + r[j] = fqmul(r[j], f); } } @@ -143,19 +132,15 @@ void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) { * Description: 
Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta) { +void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); r[1] += fqmul(a[1], b[0]); } diff --git a/crypto_kem/kyber1024-90s/clean/ntt.h b/crypto_kem/kyber1024-90s/clean/ntt.h index c9b8eb31..4acc9d65 100644 --- a/crypto_kem/kyber1024-90s/clean/ntt.h +++ b/crypto_kem/kyber1024-90s/clean/ntt.h @@ -5,15 +5,10 @@ extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128]; - void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]); void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]); -void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta); +void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber1024-90s/clean/params.h b/crypto_kem/kyber1024-90s/clean/params.h index e12e9666..3de69171 100644 --- a/crypto_kem/kyber1024-90s/clean/params.h +++ b/crypto_kem/kyber1024-90s/clean/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ 
@@ -16,20 +14,20 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 160 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) -#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_ETA2 2 + +#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ - + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) /* 32 bytes of additional space to save H(pk) */ -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ - + KYBER_INDCPA_PUBLICKEYBYTES \ - + 2*KYBER_SYMBYTES) -#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) +#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) #endif diff --git a/crypto_kem/kyber1024-90s/clean/poly.c b/crypto_kem/kyber1024-90s/clean/poly.c index 0f1eb852..515453ac 100644 --- a/crypto_kem/kyber1024-90s/clean/poly.c +++ b/crypto_kem/kyber1024-90s/clean/poly.c @@ -13,17 +13,19 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { +void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { size_t i, j; + int16_t u; uint8_t t[8]; - PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; + // map to positive 
standard representatives + u = a->coeffs[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; } r[0] = (t[0] >> 0) | (t[1] << 5); @@ -41,7 +43,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTE * Description: De-serialization and subsequent decompression of a polynomial; * approximate inverse of PQCLEAN_KYBER102490S_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ @@ -74,20 +76,21 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_P * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { +void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { size_t i; uint16_t t0, t1; - PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 2; i++) { - t0 = a->coeffs[2 * i]; + // map to positive standard representatives + t0 = a->coeffs[2 * i]; + t0 += ((int16_t)t0 >> 15) & KYBER_Q; t1 = a->coeffs[2 * i + 1]; - r[3 * i + 0] = (uint8_t)(t0 >> 0); - r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + t1 += ((int16_t)t1 >> 15) & KYBER_Q; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } @@ -97,7 +100,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER102490S_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* 
Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ @@ -114,7 +117,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_PO * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { @@ -135,41 +138,60 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_IN * Description: Convert polynomial to 32-byte message * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { +void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { size_t i, j; uint16_t t; - PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { msg[i] = 0; for (j = 0; j < 8; j++) { - t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + t = a->coeffs[8 * i + j]; + t += ((int16_t)t >> 15) & KYBER_Q; + t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; msg[i] |= t << j; } } } /************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to 
output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; +void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; prf(buf, sizeof(buf), seed, nonce); - PQCLEAN_KYBER102490S_CLEAN_cbd(r, buf); + PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(r, buf); } +/************************************************* +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; + prf(buf, sizeof(buf), seed, nonce); + PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(r, buf); +} + + /************************************************* * Name: PQCLEAN_KYBER102490S_CLEAN_poly_ntt * @@ -202,7 +224,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r) { * * Description: Multiplication of two polynomials in NTT domain * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -210,8 +232,7 @@ void 
PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, size_t i; for (i = 0; i < KYBER_N / 4; i++) { PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], - -PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); } } @@ -246,28 +267,12 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) { - size_t i; - for (i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_csubq(r->coeffs[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER102490S_CLEAN_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials; no modular reduction is performed * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -281,7 +286,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) /************************************************* * Name: PQCLEAN_KYBER102490S_CLEAN_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials; no modular reduction is performed * * 
Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial diff --git a/crypto_kem/kyber1024-90s/clean/poly.h b/crypto_kem/kyber1024-90s/clean/poly.h index 607687dd..aea1b57a 100644 --- a/crypto_kem/kyber1024-90s/clean/poly.h +++ b/crypto_kem/kyber1024-90s/clean/poly.h @@ -11,16 +11,18 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); -void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); +void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); -void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); +void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); -void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r); void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r); @@ -28,7 +30,6 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r); void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r); -void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r); void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly 
*a, const poly *b); void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber1024-90s/clean/polyvec.c b/crypto_kem/kyber1024-90s/clean/polyvec.c index ac7b5ab1..c1589d39 100644 --- a/crypto_kem/kyber1024-90s/clean/polyvec.c +++ b/crypto_kem/kyber1024-90s/clean/polyvec.c @@ -10,19 +10,18 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { unsigned int i, j, k; - PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(a); - uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 8; j++) { for (k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) - / KYBER_Q) & 0x7ff; + t[k] = a->vec[i].coeffs[8 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; } r[ 0] = (uint8_t)(t[0] >> 0); @@ -51,8 +50,7 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESS * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; uint16_t t[8]; @@ -82,9 +80,9 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECBYTES) -* - polyvec *a: 
pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { unsigned int i; for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); @@ -138,18 +136,16 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements of a and b in NTT domain, accumulate into r, * and multiply by 2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { unsigned int i; poly t; @@ -166,10 +162,10 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, * Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial 
**************************************************/ void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) { unsigned int i; @@ -178,29 +174,12 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) { - unsigned int i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_add * * Description: Add vectors of polynomials * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ diff --git a/crypto_kem/kyber1024-90s/clean/polyvec.h b/crypto_kem/kyber1024-90s/clean/polyvec.h index 44e18924..aaccd7f7 100644 --- a/crypto_kem/kyber1024-90s/clean/polyvec.h +++ b/crypto_kem/kyber1024-90s/clean/polyvec.h @@ -8,22 +8,18 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber1024-90s/clean/reduce.c b/crypto_kem/kyber1024-90s/clean/reduce.c index f42859ed..42211d90 100644 --- a/crypto_kem/kyber1024-90s/clean/reduce.c +++ b/crypto_kem/kyber1024-90s/clean/reduce.c @@ -6,8 +6,7 @@ * Name: PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes -* 16-bit integer congruent to a * R^-1 mod q, -* where R=2^16 +* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 * * Arguments: - int32_t a: input integer to be reduced; * has to be in {-q2^15,...,q2^15-1} @@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) { * Name: PQCLEAN_KYBER102490S_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes -* 16-bit integer congruent to a mod q in {0,...,q} +* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} * * Arguments: - int16_t a: input integer to be reduced * -* Returns: integer in {0,...,q} congruent to a modulo q. +* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a) { int16_t t; const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = (int32_t)v * a >> 26; + t = ((int32_t)v * a + (1 << 25)) >> 26; t *= KYBER_Q; return a - t; } - -/************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_csubq -* -* Description: Conditionallly subtract q -* -* Arguments: - int16_t x: input integer -* -* Returns: a - q if a >= q, else a -**************************************************/ -int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a) { - a -= KYBER_Q; - a += (a >> 15) & KYBER_Q; - return a; -} diff --git a/crypto_kem/kyber1024-90s/clean/reduce.h b/crypto_kem/kyber1024-90s/clean/reduce.h index 1c23db2f..f17c04aa 100644 --- a/crypto_kem/kyber1024-90s/clean/reduce.h +++ b/crypto_kem/kyber1024-90s/clean/reduce.h @@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a); int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a); -int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a); - #endif diff --git a/crypto_kem/kyber1024-90s/clean/symmetric-aes.c b/crypto_kem/kyber1024-90s/clean/symmetric-aes.c index 7a4433d8..ccfd21f0 100644 --- a/crypto_kem/kyber1024-90s/clean/symmetric-aes.c +++ b/crypto_kem/kyber1024-90s/clean/symmetric-aes.c @@ -1,100 +1,18 @@ -#include "aes.h" +#include "aes256ctr.h" #include "params.h" #include "symmetric.h" #include #include -#include -static inline void br_enc32be(unsigned char *dst, uint32_t x) { - dst[3] = (unsigned char)x; - dst[2] = (unsigned char)(x >> 8); - dst[1] = (unsigned char)(x >> 16); - dst[0] = (unsigned char)(x >> 24); +void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y) { + uint8_t expnonce[12] = {0}; + expnonce[0] = x; + expnonce[1] = y; + PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(state, seed, expnonce); } -static void aes256_ctr_xof(unsigned char *out, 
size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { - uint8_t ivw[16]; - uint8_t buf[AES_BLOCKBYTES]; - size_t i = 0; - - memcpy(ivw, iv, AESCTR_NONCEBYTES); - br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); - - while (outlen > AES_BLOCKBYTES) { - aes256_ecb(out, ivw, 1, ctx); - br_enc32be(ivw + AESCTR_NONCEBYTES, ++ctr); - out += AES_BLOCKBYTES; - outlen -= AES_BLOCKBYTES; - } - if (outlen > 0) { - aes256_ecb(buf, ivw, 1, ctx); - for (i = 0; i < outlen; i++) { - out[i] = buf[i]; - } - } -} - -/************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_aes256_prf -* -* Description: AES256 stream generation in CTR mode using 32-bit counter, -* nonce is zero-padded to 12 bytes, counter starts at zero -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: length of requested output in bytes -* - const uint8_t *key: pointer to 32-byte key -* - uint8_t nonce: 1-byte nonce (will be zero-padded to 12 bytes) -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t iv[12]; - for (int i = 1; i < 12; i++) { - iv[i] = 0; - } - iv[0] = nonce; - - aes256ctx ctx; - aes256_ctr_keyexp(&ctx, key); - aes256_ctr(output, outlen, iv, &ctx); - aes256_ctx_release(&ctx); -} - -/************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb -* -* Description: AES256 CTR used as a replacement for a XOF; this function -* "absorbs" a 32-byte key and two additional bytes that are zero-padded -* to a 12-byte nonce -* -* Arguments: - aes256xof_ctx *s: pointer to state to "absorb" key and IV into -* - const uint8_t *key: pointer to 32-byte key -* - uint8_t x: first additional byte to "absorb" -* - uint8_t y: second additional byte to "absorb" -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t 
*key, uint8_t x, uint8_t y) { - aes256_ecb_keyexp(&s->sk_exp, key); - for (int i = 2; i < 12; i++) { - s->iv[i] = 0; - } - s->iv[0] = x; - s->iv[1] = y; - s->ctr = 0; -} - -/************************************************* -* Name: PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks -* -* Description: AES256 CTR used as a replacement for a XOF; this function -* generates 4 blocks out AES256-CTR output -* -* Arguments: - uint8_t *out: pointer to output -* - size_t nblocks: number of reqested 64-byte output blocks -* - aes256xof_ctx *s: AES "state", i.e. expanded key and IV -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s) { - aes256_ctr_xof(out, nblocks * 64, s->iv, s->ctr, &s->sk_exp); - s->ctr += (uint32_t) (4 * nblocks); -} - -void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { - aes256_ctx_release(&s->sk_exp); +void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce) { + uint8_t expnonce[12] = {0}; + expnonce[0] = nonce; + PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(out, outlen, key, expnonce); } diff --git a/crypto_kem/kyber1024-90s/clean/symmetric-aes.h b/crypto_kem/kyber1024-90s/clean/symmetric-aes.h deleted file mode 100644 index 3fff518b..00000000 --- a/crypto_kem/kyber1024-90s/clean/symmetric-aes.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_AES_H -#define PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_AES_H -#include "aes.h" -#include -#include - - -typedef struct { - aes256ctx sk_exp; - uint8_t iv[12]; - uint32_t ctr; -} aes256xof_ctx; - -void PQCLEAN_KYBER102490S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); -void PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y); -void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, 
aes256xof_ctx *s); -void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s); - -#endif diff --git a/crypto_kem/kyber1024-90s/clean/symmetric.h b/crypto_kem/kyber1024-90s/clean/symmetric.h index 6ceb075d..4f3b42e7 100644 --- a/crypto_kem/kyber1024-90s/clean/symmetric.h +++ b/crypto_kem/kyber1024-90s/clean/symmetric.h @@ -1,23 +1,28 @@ #ifndef PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_H #define PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_H +#include "aes256ctr.h" #include "params.h" #include "sha2.h" -#include "symmetric-aes.h" #include #include -typedef aes256xof_ctx xof_state; -#define XOF_BLOCKBYTES 64 +typedef aes256ctr_ctx xof_state; + +void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y); + +void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce); + +#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define xof_ctx_release(STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) diff --git a/crypto_kem/kyber1024/META.yml b/crypto_kem/kyber1024/META.yml index 
68fad5a9..86947fc9 100644 --- a/crypto_kem/kyber1024/META.yml +++ b/crypto_kem/kyber1024/META.yml @@ -6,7 +6,7 @@ length-public-key: 1568 length-ciphertext: 1568 length-secret-key: 3168 length-shared-secret: 32 -nistkat-sha256: b4b4fc1c2cbbb182252d2822ccb8cb704bcfe876122635c5dfa48ddc09b6e73f +nistkat-sha256: 5afcf2a568ad32d49b55105b032af1850f03f3888ff9e2a72f4059c58e968f60 principal-submitters: - Peter Schwabe auxiliary-submitters: @@ -21,9 +21,9 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/kyber1024/avx2/align.h b/crypto_kem/kyber1024/avx2/align.h index 89a1f23b..06ccc6c5 100644 --- a/crypto_kem/kyber1024/avx2/align.h +++ b/crypto_kem/kyber1024/avx2/align.h @@ -2,22 +2,18 @@ #define PQCLEAN_KYBER1024_AVX2_ALIGN_H #include +#include -#define ALIGN16_TYPE(t) \ - union { \ - __m128i vec; \ - t orig; \ +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[(N)]; \ + __m256i vec[((N)+31)/32]; \ } -#define ALIGN32_ARRAY(t, s) \ - union { \ - __m256i vec; \ - t arr[(s)]; \ +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[(N)]; \ + __m256i vec[((N)+15)/16]; \ } -#define ALIGN32_ARRAY_2D(t, n, m) \ - union { \ - __m256i vec; \ - t arr[(n)][(m)]; \ - } #endif diff --git 
a/crypto_kem/kyber1024/avx2/basemul.S b/crypto_kem/kyber1024/avx2/basemul.S index 80a4c4cc..2fe9b434 100644 --- a/crypto_kem/kyber1024/avx2/basemul.S +++ b/crypto_kem/kyber1024/avx2/basemul.S @@ -1,248 +1,107 @@ #include "cdecl.h" -#include "params.h" -.macro schoolbook off,sign -#load -vmovdqa \off+32(%rsi),%ymm7 # b -vmovdqa \off+32(%rdx),%ymm8 # d -vmovdqa \off(%rsi),%ymm9 # a -vmovdqa \off(%rdx),%ymm10 # c +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 -#mul -vpmullw %ymm7,%ymm8,%ymm11 # bd.lo -vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi -vpmullw %ymm7,%ymm10,%ymm13 # bc.lo -vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi -vpmullw %ymm9,%ymm8,%ymm14 # ad.lo -vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi -vpmullw %ymm9,%ymm10,%ymm15 # ac.lo -vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo -#reduce -vpmullw %ymm1,%ymm11,%ymm11 -vpmulhw %ymm0,%ymm11,%ymm11 -vpsubw %ymm11,%ymm12,%ymm11 # bd +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 -#mul -vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo -vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi -#unpack -vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 -vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 -vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 -vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 -vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 -vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 -vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 -vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 -#add -.ifeq \sign -vpaddd %ymm14,%ymm15,%ymm14 # x0 -vpaddd %ymm9,%ymm10,%ymm9 # x1 -.else -vpsubd 
%ymm15,%ymm14,%ymm14 # x0 -vpsubd %ymm10,%ymm9,%ymm9 # x1 -.endif -vpaddd %ymm12,%ymm13,%ymm12 # y0 -vpaddd %ymm7,%ymm8,%ymm7 # y1 -.endm +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi -.macro red a0,a1,b0,b1,x,y,z -#pack -vpxor %ymm\x,%ymm\x,%ymm\x -vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z -vpsrld $16,%ymm\a0,%ymm\a0 -vpsrld $16,%ymm\a1,%ymm\a1 -vpackusdw %ymm\z,%ymm\y,%ymm\z -vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 -vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x -vpsrld $16,%ymm\b0,%ymm\b0 -vpsrld $16,%ymm\b1,%ymm\b1 -vpackusdw %ymm\x,%ymm\y,%ymm\y -vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 +vmovdqa %ymm13,(%rsp) -#reduce -vpmullw %ymm1,%ymm\z,%ymm\z -vpmullw %ymm1,%ymm\y,%ymm\y -vpmulhw %ymm0,%ymm\z,%ymm\z -vpmulhw %ymm0,%ymm\y,%ymm\y -vpsubw %ymm\z,%ymm\a0,%ymm\a0 -vpsubw %ymm\y,%ymm\b0,%ymm\b0 +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 +vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw 
%ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -basemul64_acc_avx: -poly0.0: -schoolbook 0,0 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.0: -schoolbook 512,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.0: -schoolbook 1024,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly3.0: -schoolbook 1536,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm5,32(%rdi) - -poly0.1: -schoolbook 64,1 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.1: -schoolbook 576,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.1: -schoolbook 1088,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly3.1: -schoolbook 1600,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm5,96(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx) -.global _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx) -cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx): -_cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 - -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 
-call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -ret - -basemul64_avx: -schoolbook 0,0 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,(%rdi) -vmovdqa %ymm12,32(%rdi) - -schoolbook 64,1 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,64(%rdi) -vmovdqa %ymm12,96(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx) .global _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx) cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx): _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_avx +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 1 -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $192*2,%r9 +schoolbook 2 -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 3 +mov %r8,%rsp ret diff --git a/crypto_kem/kyber1024/avx2/cbd.c b/crypto_kem/kyber1024/avx2/cbd.c index 93ff7fa9..43587624 100644 --- a/crypto_kem/kyber1024/avx2/cbd.c +++ b/crypto_kem/kyber1024/avx2/cbd.c @@ -4,66 +4,64 @@ #include /************************************************* -* Name: PQCLEAN_KYBER1024_AVX2_cbd +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * 
-* Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *buf: pointer to input byte array +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array **************************************************/ -void PQCLEAN_KYBER1024_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { +static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { unsigned int i; - __m256i vec0, vec1, vec2, vec3, tmp; + __m256i f0, f1, f2, f3; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); for (i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); + f0 = _mm256_load_si256(&buf[i]); - vec1 = _mm256_srli_epi32(vec0, 1); - vec0 = _mm256_and_si256(mask55, vec0); - vec1 = _mm256_and_si256(mask55, vec1); - vec0 = _mm256_add_epi32(vec0, vec1); + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); - vec1 = _mm256_srli_epi32(vec0, 2); - vec0 = _mm256_and_si256(mask33, vec0); - vec1 = _mm256_and_si256(mask33, vec1); + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); - vec2 = _mm256_srli_epi32(vec0, 4); - vec3 = _mm256_srli_epi32(vec1, 4); - vec0 = _mm256_and_si256(mask03, vec0); - vec1 = _mm256_and_si256(mask03, vec1); - vec2 = _mm256_and_si256(mask03, vec2); - vec3 = _mm256_and_si256(mask03, vec3); + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); - vec1 = _mm256_sub_epi8(vec0, vec1); - vec3 = _mm256_sub_epi8(vec2, vec3); + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = 
_mm256_unpackhi_epi8(f0, f1); - vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); - vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); - vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); - vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); - tmp = _mm256_unpacklo_epi16(vec0, vec2); - vec2 = _mm256_unpackhi_epi16(vec0, vec2); - vec0 = tmp; - tmp = _mm256_unpacklo_epi16(vec1, vec3); - vec3 = _mm256_unpackhi_epi16(vec1, vec3); - vec1 = tmp; - - tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); - vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); - vec0 = tmp; - tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); - vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); - vec1 = tmp; - - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); + _mm256_store_si256(&r->vec[4 * i + 0], f0); + _mm256_store_si256(&r->vec[4 * i + 1], f2); + _mm256_store_si256(&r->vec[4 * i + 2], f1); + _mm256_store_si256(&r->vec[4 * i + 3], f3); } } + + +/* buf 32 bytes longer for cbd3 */ +void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber1024/avx2/cbd.h b/crypto_kem/kyber1024/avx2/cbd.h index 53ae890e..ccc01f93 100644 --- a/crypto_kem/kyber1024/avx2/cbd.h +++ b/crypto_kem/kyber1024/avx2/cbd.h @@ -2,8 +2,11 @@ #define PQCLEAN_KYBER1024_AVX2_CBD_H #include "params.h" #include "poly.h" +#include #include 
-void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); + +void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); #endif diff --git a/crypto_kem/kyber1024/avx2/cdecl.h b/crypto_kem/kyber1024/avx2/cdecl.h index effdcdf8..7daa7b99 100644 --- a/crypto_kem/kyber1024/avx2/cdecl.h +++ b/crypto_kem/kyber1024/avx2/cdecl.h @@ -1,6 +1,8 @@ #ifndef PQCLEAN_KYBER1024_AVX2_CDECL_H #define PQCLEAN_KYBER1024_AVX2_CDECL_H + + #define _16XQ 0 #define _16XQINV 16 #define _16XV 32 @@ -9,9 +11,10 @@ #define _16XMONTSQLO 80 #define _16XMONTSQHI 96 #define _16XMASK 112 -#define _ZETAS_EXP 128 -#define _ZETAS_INV_EXP 528 - +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 /* The C ABI on MacOS exports all symbols with a leading * underscore. This means that any symbols we refer to from @@ -23,4 +26,5 @@ #define _cdecl(s) _##s #define cdecl(s) s + #endif diff --git a/crypto_kem/kyber1024/avx2/consts.c b/crypto_kem/kyber1024/avx2/consts.c index 1beb39f6..2c89e7a8 100644 --- a/crypto_kem/kyber1024/avx2/consts.c +++ b/crypto_kem/kyber1024/avx2/consts.c @@ -1,155 +1,123 @@ +#include "align.h" #include "consts.h" #include "params.h" -#include + #define Q KYBER_Q -#define MONT ((1U << 16) % Q) -#define QINV 62209 // q^-1 mod 2^16 -#define V (((1U << 26) + Q/2)/Q) -#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) -#define FLO (FHI*QINV % 65536) -#define MONTSQHI (MONT*MONT % Q) -#define MONTSQLO (MONTSQHI*QINV % 65536) +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO (-10079) // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI #define MASK 4095 +#define SHIFT 32 - -const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.as_arr = { -#define 
_16XQ 0 +const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.coeffs = { +//#define _16XQ 0 Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, -#define _16XQINV 16 +//#define _16XQINV 16 QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, -#define _16XV 32 +//#define _16XV 32 V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, -#define _16XFLO 48 +//#define _16XFLO 48 FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, -#define _16XFHI 64 +//#define _16XFHI 64 FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, -#define _16XMONTSQLO 80 +//#define _16XMONTSQLO 80 MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, -#define _16XMONTSQHI 96 +//#define _16XMONTSQHI 96 MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, -#define _16XMASK 112 +//#define _16XMASK 112 MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, -#define _ZETAS_EXP 128 - 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, - 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, - 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, - 3158, 3158, 3158, 3158, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, - 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, - 573, 573, 2004, 2004, 264, 264, 383, 383, - 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, - 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, - 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, - 
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, - 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, - 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, - 2226, 555, 2078, 1550, 422, 177, 3038, 1574, - 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, - 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, - 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, - 430, 843, 871, 105, 587, 3094, 2869, 1653, - 778, 3182, 1483, 1119, 644, 349, 329, 3254, - 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, - 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, - 48842, 48842, 48842, 48842, 287, 287, 287, 287, - 287, 287, 287, 287, 202, 202, 202, 202, - 202, 202, 202, 202, 10690, 10690, 10690, 10690, - 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, - 31164, 31164, 31164, 31164, 962, 962, 962, 962, - 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, - 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, - 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, - 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, - 732, 732, 608, 608, 1787, 1787, 411, 411, - 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, - 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, - 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, - 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, - 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, - 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, - 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, - 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, - 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, - 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, - 3193, 1994, 220, 1670, 1799, 794, 2475, 478, - 3021, 991, 1869, 1628, 0, 0, 0, 0, +//#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, -#define _ZETAS_INV_EXP 528 - 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, - 
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, - 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, - 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, - 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, - 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, - 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, - 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, - 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, - 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, - 951, 247, 1421, 3222, 2499, 271, 90, 853, - 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, - 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, - 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, - 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, - 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, - 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, - 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, - 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, - 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, - 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, - 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, - 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, - 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, - 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, - 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, - 2210, 1846, 147, 2551, 1676, 460, 235, 2742, - 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, - 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, - 45043, 32227, 11478, 335, 156, 2911, 872, 1590, - 602, 777, 2170, 246, 1755, 291, 3152, 2907, - 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, - 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, - 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, - 666, 320, 8, 2813, 1544, 282, 1838, 1293, - 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, - 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, - 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, - 
1871, 1871, 829, 829, 2946, 2946, 3065, 3065, - 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, - 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, - 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, - 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, - 171, 171, 171, 171, 12403, 12403, 12403, 12403, - 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, - 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, - 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, - 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, - 60300, 60300, 1932, 1932, 0, 0, 0, 0 +//#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +//#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 
1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +//#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT } }; diff --git a/crypto_kem/kyber1024/avx2/consts.h 
b/crypto_kem/kyber1024/avx2/consts.h index 0d9bf87c..b8d02ad7 100644 --- a/crypto_kem/kyber1024/avx2/consts.h +++ b/crypto_kem/kyber1024/avx2/consts.h @@ -1,19 +1,10 @@ #ifndef PQCLEAN_KYBER1024_AVX2_CONSTS_H #define PQCLEAN_KYBER1024_AVX2_CONSTS_H +#include "align.h" #include "cdecl.h" -#include "params.h" -#include -#include -#define ALIGNED_UINT16_T(N) \ - union { \ - __m256i as_vec; \ - uint16_t as_arr[(N)]; \ - } - -typedef ALIGNED_UINT16_T(928) qdata_t; - +typedef ALIGNED_INT16(640) qdata_t; extern const qdata_t PQCLEAN_KYBER1024_AVX2_qdata; #endif diff --git a/crypto_kem/kyber1024/avx2/fips202x4.c b/crypto_kem/kyber1024/avx2/fips202x4.c index 66232b87..5537d7ea 100644 --- a/crypto_kem/kyber1024/avx2/fips202x4.c +++ b/crypto_kem/kyber1024/avx2/fips202x4.c @@ -9,22 +9,14 @@ #define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds extern void KeccakF1600_StatePermute4x(__m256i *s); -static inline void store64(uint8_t x[8], uint64_t u) { - unsigned int i; - - for (i = 0; i < 8; i++) { - x[i] = u >> 8 * i; - } -} - -static void keccakx4_absorb(__m256i s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen, - uint8_t p) { +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) { size_t i, pos = 0; __m256i t, idx; @@ -39,20 +31,17 @@ static void keccakx4_absorb(__m256i s[25], s[i] = _mm256_xor_si256(s[i], t); pos += 8; } + inlen -= r; KeccakF1600_StatePermute4x(s); - inlen -= r; } - i = 0; - while (inlen >= 8) { + for (i = 0; i < inlen / 8; ++i) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); s[i] = _mm256_xor_si256(s[i], t); - - i++; pos += 8; - inlen -= 8; } + inlen -= 8 * i; if (inlen) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); @@ -75,37 +64,34 @@ static void keccakx4_squeezeblocks(uint8_t *out0, unsigned int r, __m256i s[25]) { 
unsigned int i; - uint64_t f0, f1, f2, f3; + __m128d t; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); for (i = 0; i < r / 8; ++i) { - f0 = _mm256_extract_epi64(s[i], 0); - f1 = _mm256_extract_epi64(s[i], 1); - f2 = _mm256_extract_epi64(s[i], 2); - f3 = _mm256_extract_epi64(s[i], 3); - store64(out0, f0); - store64(out1, f1); - store64(out2, f2); - store64(out3, f3); - - out0 += 8; - out1 += 8; - out2 += 8; - out3 += 8; + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + _mm_storel_pd((double *)&out0[8 * i], t); + _mm_storeh_pd((double *)&out1[8 * i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); + _mm_storel_pd((double *)&out2[8 * i], t); + _mm_storeh_pd((double *)&out3[8 * i], t); } + out0 += r; + out1 += r; + out2 += r; + out3 += r; --nblocks; } } -void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { - keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); + keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, @@ -114,17 +100,16 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out3, size_t nblocks, keccakx4_state *state) { - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, - state->s); + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); } -void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { - keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); + keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); } void 
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, @@ -133,8 +118,7 @@ void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out3, size_t nblocks, keccakx4_state *state) { - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, - state->s); + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); } void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, @@ -152,7 +136,7 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t t[4][SHAKE128_RATE]; keccakx4_state state; - PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE128_RATE; @@ -187,7 +171,7 @@ void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t t[4][SHAKE256_RATE]; keccakx4_state state; - PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE256_RATE; diff --git a/crypto_kem/kyber1024/avx2/fips202x4.h b/crypto_kem/kyber1024/avx2/fips202x4.h index 47f3176c..f3d0a34b 100644 --- a/crypto_kem/kyber1024/avx2/fips202x4.h +++ b/crypto_kem/kyber1024/avx2/fips202x4.h @@ -9,7 +9,7 @@ typedef struct { __m256i s[25]; } keccakx4_state; -void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, @@ -23,7 +23,7 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, size_t nblocks, keccakx4_state *state); -void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(keccakx4_state 
*state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, diff --git a/crypto_kem/kyber1024/avx2/fq.S b/crypto_kem/kyber1024/avx2/fq.S index 3a039784..a02d3578 100644 --- a/crypto_kem/kyber1024/avx2/fq.S +++ b/crypto_kem/kyber1024/avx2/fq.S @@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2,10 -red16 3,11 -red16 4,12 -red16 5,13 -red16 6,14 -red16 7,15 -red16 8,10 -red16 9,11 +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 #store vmovdqa %ymm2,(%rdi) @@ -46,49 +46,6 @@ add $256,%rdi call reduce128_avx ret -csubq128_avx: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1,9 -csubq 2,10 -csubq 3,11 -csubq 4,12 -csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx) -.global _cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx) -cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx): -_cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx): -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -call csubq128_avx -add $256,%rdi -call csubq128_avx -ret - tomont128_avx: #load vmovdqa (%rdi),%ymm3 diff --git a/crypto_kem/kyber1024/avx2/fq.inc b/crypto_kem/kyber1024/avx2/fq.inc index 75df098a..4b7afc31 100644 --- a/crypto_kem/kyber1024/avx2/fq.inc +++ b/crypto_kem/kyber1024/avx2/fq.inc @@ -1,6 +1,10 @@ -.macro red16 r,x=12 +.macro red16 r,rs=0,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else vpsraw $10,%ymm\x,%ymm\x +.endif vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm @@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r 
-#vpcmpgtw %ymm0,%ymm\r,%ymm\x -#vpand %ymm0,%ymm\x,%ymm\x -#vpsubw %ymm\x,%ymm\r,%ymm\r .endm .macro caddq r,x=12 diff --git a/crypto_kem/kyber1024/avx2/indcpa.c b/crypto_kem/kyber1024/avx2/indcpa.c index 4b49bdaf..ff90a36b 100644 --- a/crypto_kem/kyber1024/avx2/indcpa.c +++ b/crypto_kem/kyber1024/avx2/indcpa.c @@ -8,6 +8,7 @@ #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include #include #include @@ -15,11 +16,14 @@ * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* serialized vector of polynomials pk -* and the public seed used to generate the matrix A. +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the interval [0,q], i.e. pk must be reduced +* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce(). * -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, /************************************************* * Name: pack_sk * -* Description: Serialize the secret key +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the interval [0,q], i.e. sk must be reduced +* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce().
* -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials -* (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(sk, packedsk); } @@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, * * Description: Serialize the ciphertext as concatenation of the * compressed and serialized vector of polynomials b -* and the compressed and serialized polynomial v +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the interval [0,q], i.e. b and v must be reduced +* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce() and PQCLEAN_KYBER1024_AVX2_poly_reduce(), respectively.
* * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER1024_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER1024_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER1024_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER1024_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit 
integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -165,61 +169,54 @@ static unsigned int rej_uniform(int16_t *r, * - const uint8_t *seed: pointer to input seed * - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { unsigned int i, ctr0, ctr1, ctr2, ctr3; - ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; + ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * SHAKE128_RATE) buf[4]; __m256i f; keccakx4_state state; for (i = 0; i < 4; i++) { - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + 
_mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); if (transposed) { - buf.arr[0][KYBER_SYMBYTES + 0] = i; - buf.arr[0][KYBER_SYMBYTES + 1] = 0; - buf.arr[1][KYBER_SYMBYTES + 0] = i; - buf.arr[1][KYBER_SYMBYTES + 1] = 1; - buf.arr[2][KYBER_SYMBYTES + 0] = i; - buf.arr[2][KYBER_SYMBYTES + 1] = 2; - buf.arr[3][KYBER_SYMBYTES + 0] = i; - buf.arr[3][KYBER_SYMBYTES + 1] = 3; + buf[0].coeffs[32] = i; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = i; + buf[1].coeffs[33] = 1; + buf[2].coeffs[32] = i; + buf[2].coeffs[33] = 2; + buf[3].coeffs[32] = i; + buf[3].coeffs[33] = 3; } else { - buf.arr[0][KYBER_SYMBYTES + 0] = 0; - buf.arr[0][KYBER_SYMBYTES + 1] = i; - buf.arr[1][KYBER_SYMBYTES + 0] = 1; - buf.arr[1][KYBER_SYMBYTES + 1] = i; - buf.arr[2][KYBER_SYMBYTES + 0] = 2; - buf.arr[2][KYBER_SYMBYTES + 1] = i; - buf.arr[3][KYBER_SYMBYTES + 0] = 3; - buf.arr[3][KYBER_SYMBYTES + 1] = i; + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = i; + buf[1].coeffs[32] = 1; + buf[1].coeffs[33] = i; + buf[2].coeffs[32] = 2; + buf[2].coeffs[33] = i; + buf[3].coeffs[32] = 3; + buf[3].coeffs[33] = i; } - PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); - PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], - GEN_MATRIX_NBLOCKS, &state); + PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); + PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); - ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf.arr[0]); - ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf.arr[1]); - ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf.arr[2]); - ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf.arr[3]); + ctr0 = 
PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf[3].coeffs); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], - XOF_BLOCKBYTES); - ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], - XOF_BLOCKBYTES); - ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], - XOF_BLOCKBYTES); - ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], - XOF_BLOCKBYTES); + ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE); } PQCLEAN_KYBER1024_AVX2_poly_nttunpack(&a[i].vec[0]); @@ -243,27 +240,26 @@ void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int t void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; - const uint8_t *publicseed = buf.arr; - const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + uint8_t buf[2 * KYBER_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; polyvec a[KYBER_K], e, 
pkpv, skpv; - randombytes(buf.arr, KYBER_SYMBYTES); - hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); + hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, - 0, 1, 2, 3); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, - 4, 5, 6, 7); + PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, 0, 1, 2, 3); + PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, 4, 5, 6, 7); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&skpv); + PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&skpv); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&e); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER1024_AVX2_poly_tomont(&pkpv.vec[i]); } @@ -280,55 +276,51 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + uint8_t seed[KYBER_SYMBYTES]; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; - unpack_pk(&pkpv, seed.arr, pk); + unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER1024_AVX2_poly_frommsg(&k, m); - gen_at(at, seed.arr); + gen_at(at, seed); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, - 0, 1, 2, 3); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, - 4, 5, 6, 7); - PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, 8); + PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, 0, 1, 2, 3); + PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, 4, 5, 6, 7); + 
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(&epp, coins, 8); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } + PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - - PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&b); PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&v); - PQCLEAN_KYBER1024_AVX2_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER1024_AVX2_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &epp); PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &k); - PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&bp); + PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&b); PQCLEAN_KYBER1024_AVX2_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -337,24 +329,24 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&b); + PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER1024_AVX2_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber1024/avx2/invntt.S b/crypto_kem/kyber1024/avx2/invntt.S index d6fe8915..ecee61af 100644 --- a/crypto_kem/kyber1024/avx2/invntt.S +++ b/crypto_kem/kyber1024/avx2/invntt.S @@ -2,22 +2,21 @@ .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw 
%ymm\rh3,%ymm\rl3,%ymm15 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm15,%ymm\rh3 vpmulhw %ymm\zh0,%ymm12,%ymm12 @@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 vpmulhw %ymm\zh1,%ymm14,%ymm14 vpmulhw %ymm\zh1,%ymm15,%ymm15 -#reduce vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.text -invntt_levels0t5_avx: -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 -butterfly 4,5,8,9,6,7,10,11,15,3,1,2 +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 -level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 -butterfly 4,5,6,7,8,9,10,11,3,3,2,2 +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb 
%ymm12,%ymm15,%ymm15 +vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 shuffle1 4,5,3,5 shuffle1 6,7,4,7 shuffle1 8,9,6,9 shuffle1 10,11,8,11 -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 -#consts -vmovdqa _16XV*2(%rdx),%ymm1 - -butterfly 3,4,6,8,5,7,9,11,10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 +vmovdqa _16XV*2(%rsi),%ymm1 red16 3 shuffle2 3,4,10,4 @@ -87,26 +110,22 @@ shuffle2 6,8,3,8 shuffle2 5,7,6,7 shuffle2 9,11,5,11 -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 -butterfly 10,3,6,5,4,8,7,11,9,9,2,2 - -red16 10 +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 shuffle4 10,3,9,3 shuffle4 6,5,10,5 shuffle4 4,8,6,8 shuffle4 7,11,4,11 -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 +/* level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 -butterfly 9,10,6,4,3,5,8,11,7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,2,2,7,7 red16 9 @@ -115,113 +134,62 @@ shuffle8 6,4,9,4 shuffle8 3,5,6,5 shuffle8 8,11,3,11 -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 +/* level5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 -butterfly 7,9,6,3,10,4,5,11,8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 -red16 7 +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa 
%ymm9,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 -ret - -invntt_level6_avx: -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 butterfly 4,5,6,7,8,9,10,11 -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 +.if \off == 0 +red16 4 +.endif -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa 
%ymm9,416(%rdi) -vmovdqa %ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm +.text .global cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx) .global _cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx) cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx): _cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_INV_EXP*2,%rsi -call invntt_levels0t5_avx -add $256,%rdi -add $392,%rsi -call invntt_levels0t5_avx -sub $256,%rdi -add $392,%rsi -call invntt_level6_avx + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 ret diff --git a/crypto_kem/kyber1024/avx2/kem.c b/crypto_kem/kyber1024/avx2/kem.c index 7b603b02..644181d5 100644 --- a/crypto_kem/kyber1024/avx2/kem.c +++ b/crypto_kem/kyber1024/avx2/kem.c @@ -1,4 +1,3 @@ -#include "align.h" #include "indcpa.h" #include "kem.h" #include "params.h" @@ -15,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int 
PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER1024_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -40,36 +40,36 @@ int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char * * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; +int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t kr[2 * KYBER_SYMBYTES]; - randombytes(buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); /* Don't release system RNG output */ - hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); + hash_h(buf, buf, KYBER_SYMBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, 
buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } @@ -80,47 +80,47 @@ int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; + uint8_t kr[2 * KYBER_SYMBYTES]; + ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf.arr, ct, sk); + PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf, ct, sk); /* Multitarget countermeasure for coins + contributory KEM */ for (i = 0; i < KYBER_SYMBYTES; i++) { - buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); - fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); + fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* Overwrite pre-k with z on re-encryption failure */ - PQCLEAN_KYBER1024_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); + PQCLEAN_KYBER1024_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * 
KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber1024/avx2/ntt.S b/crypto_kem/kyber1024/avx2/ntt.S index 79259edb..868ad265 100644 --- a/crypto_kem/kyber1024/avx2/ntt.S +++ b/crypto_kem/kyber1024/avx2/ntt.S @@ -1,222 +1,191 @@ #include "cdecl.h" .include "shuffle.inc" -.include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 -#mul +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + vpmullw %ymm\zl1,%ymm\rh2,%ymm14 vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 .endm -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce +.macro reduce vpmulhw %ymm0,%ymm12,%ymm12 vpmulhw %ymm0,%ymm13,%ymm13 + vpmulhw %ymm0,%ymm14,%ymm14 vpmulhw %ymm0,%ymm15,%ymm15 +.endm -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 -#update +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 
16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -ntt_level0_avx: -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -ntt_levels1t6_avx: -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11,3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11,7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11,9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11,10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 
-vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11,6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 - -vmovdqa _16XV*2(%rdx),%ymm1 -red16 10,12 -red16 5,13 -red16 9,14 -red16 4,15 -red16 8,2 -red16 3,6 -red16 7,12 -red16 11,13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx) .global _cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx) cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx): _cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_EXP*2,%rsi -call ntt_level0_avx -add $128,%rdi -call ntt_level0_avx -sub $128,%rdi -add $8,%rsi -call ntt_levels1t6_avx -add $256,%rdi -add $392,%rsi -call ntt_levels1t6_avx + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + ret diff --git a/crypto_kem/kyber1024/avx2/ntt.h b/crypto_kem/kyber1024/avx2/ntt.h index 4c36d007..d0c2f7ff 100644 --- a/crypto_kem/kyber1024/avx2/ntt.h +++ b/crypto_kem/kyber1024/avx2/ntt.h @@ -1,24 +1,21 @@ #ifndef PQCLEAN_KYBER1024_AVX2_NTT_H #define PQCLEAN_KYBER1024_AVX2_NTT_H -#include "consts.h" + +#include #include -void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r, const qdata_t 
*PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_basemul_avx(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); -void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024/avx2/params.h b/crypto_kem/kyber1024/avx2/params.h index 3484cabd..3d6a8375 100644 --- a/crypto_kem/kyber1024/avx2/params.h +++ b/crypto_kem/kyber1024/avx2/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,9 +14,12 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 160 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) +#define KYBER_ETA2 2 + #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) diff --git 
a/crypto_kem/kyber1024/avx2/poly.c b/crypto_kem/kyber1024/avx2/poly.c index b50d0482..2820c157 100644 --- a/crypto_kem/kyber1024/avx2/poly.c +++ b/crypto_kem/kyber1024/avx2/poly.c @@ -12,76 +12,99 @@ /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_compress * -* Description: Compression and subsequent serialization of a polynomial +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { - unsigned int i, j; - uint8_t t[8]; +void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[160], const poly *restrict a) { + size_t i; + uint32_t low; + __m256i f0, f1; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XV / 16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 10); + const __m256i mask = _mm256_set1_epi16(31); + const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1); + const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9, + -1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0); - PQCLEAN_KYBER1024_AVX2_poly_csubq(a); - - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; - } - - r[0] = (t[0] >> 0) | (t[1] << 5); - r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); - r[2] = (t[3] >> 1) | (t[4] << 4); - r[3] = (t[4] >> 4) | 
(t[5] << 1) | (t[6] << 6); - r[4] = (t[6] >> 2) | (t[7] << 3); - r += 5; + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_mulhi_epi16(f0, v); + f1 = _mm256_mulhi_epi16(f1, v); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f1 = _mm256_mulhrs_epi16(f1, shift1); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_maddubs_epi16(f0, shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 + f0 = _mm256_madd_epi16(f0, shift3); // a0 a1 b0 b1 a2 a3 b2 b3 + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f0 = _mm256_srlv_epi64(f0, sllvdidx); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); + _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); + _mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); + r[20 * i + 16] = (uint8_t)low; + r[20 * i + 17] = (uint8_t)(low >> 0x08); + r[20 * i + 18] = (uint8_t)(low >> 0x10); + r[20 * i + 19] = (uint8_t)(low >> 0x18); } } -/************************************************* -* Name: PQCLEAN_KYBER1024_AVX2_poly_decompress -* -* Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of PQCLEAN_KYBER1024_AVX2_poly_compress -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array -* (of length KYBER_POLYCOMPRESSEDBYTES bytes) -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, - const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, const uint8_t a[160]) { unsigned int i; + int16_t h; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(9, 9, 9, 8, 8, 
8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, + 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0); + const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31, + 248, 1984, 62, 496, 3968, 124, 992, 31); + const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024, + 128, 16, 512, 64, 8, 256, 32, 1024); - unsigned int j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 5) | (a[1] << 3); - t[2] = (a[1] >> 2); - t[3] = (a[1] >> 7) | (a[2] << 1); - t[4] = (a[2] >> 4) | (a[3] << 4); - t[5] = (a[3] >> 1); - t[6] = (a[3] >> 6) | (a[4] << 2); - t[7] = (a[4] >> 3); - a += 5; - - for (j = 0; j < 8; j++) { - r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; - } + for (i = 0; i < KYBER_N / 16; i++) { + t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]); + h = (a[10 * i + 9] << 8) + a[10 * i + 8]; + t = _mm_insert_epi16(t, h, 4); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_and_si256(f, mask); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); } } + /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_tobytes * -* Description: Serialization of a polynomial +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). The coefficients are orderd as output by +* PQCLEAN_KYBER1024_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed +* order. 
* * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { - PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* @@ -90,12 +113,12 @@ void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER1024_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { - PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* @@ -103,11 +126,10 @@ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, - const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { +void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); 
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); @@ -136,12 +158,12 @@ void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + _mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ + _mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ + _mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ + _mm256_store_si256(&r->vec[8+2*(i)+1],g3) - f = _mm256_load_si256((__m256i *)msg); + f = _mm256_loadu_si256((__m256i *)msg); FROMMSG64(0); FROMMSG64(1); FROMMSG64(2); @@ -151,32 +173,34 @@ void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_tomsg * -* Description: Convert polynomial to 32-byte message +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). 
* * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { unsigned int i; uint32_t small; __m256i f0, f1, g0, g1; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); for (i = 0; i < KYBER_N / 32; i++) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); - f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); - f0 = _mm256_sub_epi16(hqs, f0); - f1 = _mm256_sub_epi16(hqs, f1); + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); g0 = _mm256_srai_epi16(f0, 15); g1 = _mm256_srai_epi16(f1, 15); f0 = _mm256_xor_si256(f0, g0); f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(hhqs, f0); - f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); f0 = _mm256_packs_epi16(f0, f1); small = _mm256_movemask_epi8(f0); - small = ~small; msg[4 * i + 0] = small; msg[4 * i + 1] = small >> 16; msg[4 * i + 2] = small >> 8; @@ -185,24 +209,43 @@ void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly } /************************************************* -* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise +* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to 
input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; - prf(buf.arr, sizeof(buf.arr), seed, nonce); - PQCLEAN_KYBER1024_AVX2_cbd(r, buf.arr); +void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1 + prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r, buf.vec); } -void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; + prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(r, buf.vec); +} + +#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE) +void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, @@ -211,41 +254,46 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { - ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; + ALIGNED_UINT8(NOISE_NBLOCKS * SHAKE256_RATE) buf[4]; 
__m256i f; keccakx4_state state; - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - buf.arr[0][32] = nonce0; - buf.arr[1][32] = nonce1; - buf.arr[2][32] = nonce2; - buf.arr[3][32] = nonce3; + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; - PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); - PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); - PQCLEAN_KYBER1024_AVX2_cbd(r0, buf.arr[0]); - PQCLEAN_KYBER1024_AVX2_cbd(r1, buf.arr[1]); - PQCLEAN_KYBER1024_AVX2_cbd(r2, buf.arr[2]); - PQCLEAN_KYBER1024_AVX2_cbd(r3, buf.arr[3]); + PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r0, buf[0].vec); + PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r1, buf[1].vec); + PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r2, buf[2].vec); + PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r3, buf[3].vec); } + /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of -* a polynomial in place; -* inputs assumed to be in normal order, output in bitreversed order +* a polynomial in place. +* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. 
Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. * -* Arguments: - uint16_t *r: pointer to in/output polynomial +* Arguments: - poly *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER1024_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* @@ -253,29 +301,35 @@ void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) { * * Description: Computes inverse of negacyclic number-theoretic transform (NTT) * of a polynomial in place; -* inputs assumed to be in bitreversed order, output in normal order +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. * -* Arguments: - uint16_t *a: pointer to in/output polynomial +* Arguments: - poly *a: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r) { - PQCLEAN_KYBER1024_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery * -* Description: Multiplication of two polynomials in NTT domain +* Description: Multiplication of two polynomials in NTT domain. 
+* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* @@ -287,7 +341,7 @@ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, cons * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) { - PQCLEAN_KYBER1024_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* @@ -299,28 +353,16 @@ void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) { * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); -} - -/************************************************* -* Name: PQCLEAN_KYBER1024_AVX2_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); + PQCLEAN_KYBER1024_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials. No modular reduction +* is performed. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -328,20 +370,21 @@ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials. No modular reduction +* is performed. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -349,10 +392,10 @@ void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } diff --git a/crypto_kem/kyber1024/avx2/poly.h b/crypto_kem/kyber1024/avx2/poly.h index e6cd4c65..74cbe6bf 100644 --- a/crypto_kem/kyber1024/avx2/poly.h +++ b/crypto_kem/kyber1024/avx2/poly.h @@ -1,19 +1,13 @@ #ifndef PQCLEAN_KYBER1024_AVX2_POLY_H #define PQCLEAN_KYBER1024_AVX2_POLY_H +#include "align.h" #include "params.h" #include #include -/* - * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial - * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] - */ -typedef union { - __m256i dummy; - int16_t coeffs[KYBER_N]; -} poly; +typedef ALIGNED_INT16(KYBER_N) poly; -void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); @@ -22,8 +16,11 @@ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); -void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, +void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, @@ -33,6 +30,8 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, uint8_t nonce2, uint8_t nonce3); + + void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r); void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r); void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r); @@ -40,7 +39,6 @@ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, cons void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r); void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r); -void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r); void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber1024/avx2/polyvec.c 
b/crypto_kem/kyber1024/avx2/polyvec.c index 83dc277c..d686b052 100644 --- a/crypto_kem/kyber1024/avx2/polyvec.c +++ b/crypto_kem/kyber1024/avx2/polyvec.c @@ -3,8 +3,79 @@ #include "params.h" #include "poly.h" #include "polyvec.h" +#include #include +static void poly_compress11(uint8_t r[352 + 2], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XV / 16]); + const __m256i v8 = _mm256_slli_epi16(v, 3); + const __m256i off = _mm256_set1_epi16(36); + const __m256i shift1 = _mm256_set1_epi16(1 << 13); + const __m256i mask = _mm256_set1_epi16(2047); + const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(10); + const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10); + const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, + -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0, v8); + f2 = _mm256_add_epi16(f0, off); + f0 = _mm256_slli_epi16(f0, 3); + f0 = _mm256_mulhi_epi16(f0, v); + f2 = _mm256_sub_epi16(f1, f2); + f1 = _mm256_andnot_si256(f1, f2); + f1 = _mm256_srli_epi16(f1, 15); + f0 = _mm256_sub_epi16(f0, f1); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f0 = _mm256_and_si256(f0, mask); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f1 = _mm256_bsrli_epi128(f0, 8); + f0 = _mm256_srlv_epi64(f0, srlvqidx); + f1 = _mm256_slli_epi64(f1, 34); + f0 = _mm256_add_epi64(f0, f1); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx)); + _mm_storeu_si128((__m128i *)&r[22 * i + 0], t0); + _mm_storel_epi64((__m128i *)&r[22 * i + 16], t1); + } +} + +static void poly_decompress11(poly 
*restrict r, const uint8_t a[352 + 10]) { + unsigned int i; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, + 8, 7, 6, 5, 5, 4, 4, 3, + 10, 9, 9, 8, 7, 6, 6, 5, + 5, 4, 3, 2, 2, 1, 1, 0); + const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0); + const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0); + const __m256i shift = _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32); + const __m256i mask = _mm256_set1_epi16(32752); + + for (i = 0; i < KYBER_N / 16; i++) { + f = _mm256_loadu_si256((__m256i *)&a[22 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_srlv_epi64(f, srlvqidx); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_srli_epi16(f, 1); + f = _mm256_and_si256(f, mask); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_polyvec_compress * @@ -14,33 +85,11 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], - polyvec *restrict a) { - size_t i, j, k; +void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { + size_t i; - PQCLEAN_KYBER1024_AVX2_polyvec_csubq(a); - - uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - for (k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) - / KYBER_Q) & 0x7ff; - } - - r[ 0] = (t[0] >> 0); - r[ 1] = (t[0] >> 8) | (t[1] << 3); - r[ 2] = (t[1] >> 5) | (t[2] << 6); - r[ 3] = (t[2] >> 2); - r[ 4] = (t[2] >> 10) | (t[3] << 1); - r[ 5] = (t[3] >> 7) | (t[4] << 4); - r[ 6] = 
(t[4] >> 4) | (t[5] << 7); - r[ 7] = (t[5] >> 1); - r[ 8] = (t[5] >> 9) | (t[6] << 2); - r[ 9] = (t[6] >> 6) | (t[7] << 5); - r[10] = (t[7] >> 3); - r += 11; - } + poly_compress11(&r[352 * i], &a->vec[i]); } } @@ -50,31 +99,15 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBY * Description: De-serialize and decompress vector of polynomials; * approximate inverse of PQCLEAN_KYBER1024_AVX2_polyvec_compress * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *restrict r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { - size_t i, j, k; +void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { + size_t i; - uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); - t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); - t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); - t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); - t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); - t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); - t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); - t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); - a += 11; - - for (k = 0; k < 8; k++) { - r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; - } - } + poly_decompress11(&r->vec[i], &a[352 * i]); } } @@ -100,7 +133,7 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyv * Description: De-serialize vector of polynomials; * inverse of PQCLEAN_KYBER1024_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte 
array * - const polyvec *a: pointer to input vector of polynomials * (of length KYBER_POLYVECBYTES) **************************************************/ @@ -141,29 +174,34 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements in a and b in NTT domain, accumulate into r, * and multiply by 2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { - PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + size_t i; + poly tmp; + + PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER1024_AVX2_poly_add(r, r, &tmp); + } } /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ 
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) { size_t i; @@ -172,23 +210,6 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER1024_AVX2_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) { - size_t i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_AVX2_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_polyvec_add * diff --git a/crypto_kem/kyber1024/avx2/polyvec.h b/crypto_kem/kyber1024/avx2/polyvec.h index 189226aa..6bdd1494 100644 --- a/crypto_kem/kyber1024/avx2/polyvec.h +++ b/crypto_kem/kyber1024/avx2/polyvec.h @@ -8,9 +8,8 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); +void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); @@ -18,12 +17,9 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_ void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec 
*a, - const polyvec *b); +void PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber1024/avx2/reduce.h b/crypto_kem/kyber1024/avx2/reduce.h index 03a47704..3f576c61 100644 --- a/crypto_kem/kyber1024/avx2/reduce.h +++ b/crypto_kem/kyber1024/avx2/reduce.h @@ -1,10 +1,9 @@ #ifndef PQCLEAN_KYBER1024_AVX2_REDUCE_H #define PQCLEAN_KYBER1024_AVX2_REDUCE_H -#include "consts.h" -#include +#include "params.h" +#include -int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); -int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); -int16_t PQCLEAN_KYBER1024_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); +void PQCLEAN_KYBER1024_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024/avx2/rejsample.c b/crypto_kem/kyber1024/avx2/rejsample.c index 4938b245..4c25d65d 100644 --- a/crypto_kem/kyber1024/avx2/rejsample.c +++ b/crypto_kem/kyber1024/avx2/rejsample.c @@ -4,311 +4,68 @@ #include "rejsample.h" #include #include +#include + +//#define BMI -static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { - {-1, -1, -1, -1, -1, -1, -1, -1}, - { 0, -1, -1, -1, -1, -1, -1, -1}, - { 2, -1, -1, -1, -1, -1, -1, -1}, - { 0, 2, -1, -1, -1, -1, -1, -1}, - { 4, -1, -1, -1, -1, -1, -1, -1}, - { 0, 4, -1, -1, -1, -1, -1, -1}, - { 2, 4, -1, -1, -1, -1, -1, -1}, - { 0, 2, 4, -1, -1, -1, -1, -1}, - { 6, -1, -1, -1, -1, -1, -1, -1}, - { 0, 6, -1, -1, -1, -1, -1, -1}, - { 2, 6, -1, -1, -1, -1, -1, -1}, - { 0, 2, 6, -1, -1, -1, -1, -1}, - { 4, 6, -1, -1, 
-1, -1, -1, -1}, - { 0, 4, 6, -1, -1, -1, -1, -1}, - { 2, 4, 6, -1, -1, -1, -1, -1}, - { 0, 2, 4, 6, -1, -1, -1, -1}, - { 8, -1, -1, -1, -1, -1, -1, -1}, - { 0, 8, -1, -1, -1, -1, -1, -1}, - { 2, 8, -1, -1, -1, -1, -1, -1}, - { 0, 2, 8, -1, -1, -1, -1, -1}, - { 4, 8, -1, -1, -1, -1, -1, -1}, - { 0, 4, 8, -1, -1, -1, -1, -1}, - { 2, 4, 8, -1, -1, -1, -1, -1}, - { 0, 2, 4, 8, -1, -1, -1, -1}, - { 6, 8, -1, -1, -1, -1, -1, -1}, - { 0, 6, 8, -1, -1, -1, -1, -1}, - { 2, 6, 8, -1, -1, -1, -1, -1}, - { 0, 2, 6, 8, -1, -1, -1, -1}, - { 4, 6, 8, -1, -1, -1, -1, -1}, - { 0, 4, 6, 8, -1, -1, -1, -1}, - { 2, 4, 6, 8, -1, -1, -1, -1}, - { 0, 2, 4, 6, 8, -1, -1, -1}, - {10, -1, -1, -1, -1, -1, -1, -1}, - { 0, 10, -1, -1, -1, -1, -1, -1}, - { 2, 10, -1, -1, -1, -1, -1, -1}, - { 0, 2, 10, -1, -1, -1, -1, -1}, - { 4, 10, -1, -1, -1, -1, -1, -1}, - { 0, 4, 10, -1, -1, -1, -1, -1}, - { 2, 4, 10, -1, -1, -1, -1, -1}, - { 0, 2, 4, 10, -1, -1, -1, -1}, - { 6, 10, -1, -1, -1, -1, -1, -1}, - { 0, 6, 10, -1, -1, -1, -1, -1}, - { 2, 6, 10, -1, -1, -1, -1, -1}, - { 0, 2, 6, 10, -1, -1, -1, -1}, - { 4, 6, 10, -1, -1, -1, -1, -1}, - { 0, 4, 6, 10, -1, -1, -1, -1}, - { 2, 4, 6, 10, -1, -1, -1, -1}, - { 0, 2, 4, 6, 10, -1, -1, -1}, - { 8, 10, -1, -1, -1, -1, -1, -1}, - { 0, 8, 10, -1, -1, -1, -1, -1}, - { 2, 8, 10, -1, -1, -1, -1, -1}, - { 0, 2, 8, 10, -1, -1, -1, -1}, - { 4, 8, 10, -1, -1, -1, -1, -1}, - { 0, 4, 8, 10, -1, -1, -1, -1}, - { 2, 4, 8, 10, -1, -1, -1, -1}, - { 0, 2, 4, 8, 10, -1, -1, -1}, - { 6, 8, 10, -1, -1, -1, -1, -1}, - { 0, 6, 8, 10, -1, -1, -1, -1}, - { 2, 6, 8, 10, -1, -1, -1, -1}, - { 0, 2, 6, 8, 10, -1, -1, -1}, - { 4, 6, 8, 10, -1, -1, -1, -1}, - { 0, 4, 6, 8, 10, -1, -1, -1}, - { 2, 4, 6, 8, 10, -1, -1, -1}, - { 0, 2, 4, 6, 8, 10, -1, -1}, - {12, -1, -1, -1, -1, -1, -1, -1}, - { 0, 12, -1, -1, -1, -1, -1, -1}, - { 2, 12, -1, -1, -1, -1, -1, -1}, - { 0, 2, 12, -1, -1, -1, -1, -1}, - { 4, 12, -1, -1, -1, -1, -1, -1}, - { 0, 4, 12, -1, -1, -1, -1, -1}, - { 2, 4, 12, -1, 
-1, -1, -1, -1}, - { 0, 2, 4, 12, -1, -1, -1, -1}, - { 6, 12, -1, -1, -1, -1, -1, -1}, - { 0, 6, 12, -1, -1, -1, -1, -1}, - { 2, 6, 12, -1, -1, -1, -1, -1}, - { 0, 2, 6, 12, -1, -1, -1, -1}, - { 4, 6, 12, -1, -1, -1, -1, -1}, - { 0, 4, 6, 12, -1, -1, -1, -1}, - { 2, 4, 6, 12, -1, -1, -1, -1}, - { 0, 2, 4, 6, 12, -1, -1, -1}, - { 8, 12, -1, -1, -1, -1, -1, -1}, - { 0, 8, 12, -1, -1, -1, -1, -1}, - { 2, 8, 12, -1, -1, -1, -1, -1}, - { 0, 2, 8, 12, -1, -1, -1, -1}, - { 4, 8, 12, -1, -1, -1, -1, -1}, - { 0, 4, 8, 12, -1, -1, -1, -1}, - { 2, 4, 8, 12, -1, -1, -1, -1}, - { 0, 2, 4, 8, 12, -1, -1, -1}, - { 6, 8, 12, -1, -1, -1, -1, -1}, - { 0, 6, 8, 12, -1, -1, -1, -1}, - { 2, 6, 8, 12, -1, -1, -1, -1}, - { 0, 2, 6, 8, 12, -1, -1, -1}, - { 4, 6, 8, 12, -1, -1, -1, -1}, - { 0, 4, 6, 8, 12, -1, -1, -1}, - { 2, 4, 6, 8, 12, -1, -1, -1}, - { 0, 2, 4, 6, 8, 12, -1, -1}, - {10, 12, -1, -1, -1, -1, -1, -1}, - { 0, 10, 12, -1, -1, -1, -1, -1}, - { 2, 10, 12, -1, -1, -1, -1, -1}, - { 0, 2, 10, 12, -1, -1, -1, -1}, - { 4, 10, 12, -1, -1, -1, -1, -1}, - { 0, 4, 10, 12, -1, -1, -1, -1}, - { 2, 4, 10, 12, -1, -1, -1, -1}, - { 0, 2, 4, 10, 12, -1, -1, -1}, - { 6, 10, 12, -1, -1, -1, -1, -1}, - { 0, 6, 10, 12, -1, -1, -1, -1}, - { 2, 6, 10, 12, -1, -1, -1, -1}, - { 0, 2, 6, 10, 12, -1, -1, -1}, - { 4, 6, 10, 12, -1, -1, -1, -1}, - { 0, 4, 6, 10, 12, -1, -1, -1}, - { 2, 4, 6, 10, 12, -1, -1, -1}, - { 0, 2, 4, 6, 10, 12, -1, -1}, - { 8, 10, 12, -1, -1, -1, -1, -1}, - { 0, 8, 10, 12, -1, -1, -1, -1}, - { 2, 8, 10, 12, -1, -1, -1, -1}, - { 0, 2, 8, 10, 12, -1, -1, -1}, - { 4, 8, 10, 12, -1, -1, -1, -1}, - { 0, 4, 8, 10, 12, -1, -1, -1}, - { 2, 4, 8, 10, 12, -1, -1, -1}, - { 0, 2, 4, 8, 10, 12, -1, -1}, - { 6, 8, 10, 12, -1, -1, -1, -1}, - { 0, 6, 8, 10, 12, -1, -1, -1}, - { 2, 6, 8, 10, 12, -1, -1, -1}, - { 0, 2, 6, 8, 10, 12, -1, -1}, - { 4, 6, 8, 10, 12, -1, -1, -1}, - { 0, 4, 6, 8, 10, 12, -1, -1}, - { 2, 4, 6, 8, 10, 12, -1, -1}, - { 0, 2, 4, 6, 8, 10, 12, -1}, - {14, -1, -1, -1, -1, 
-1, -1, -1}, - { 0, 14, -1, -1, -1, -1, -1, -1}, - { 2, 14, -1, -1, -1, -1, -1, -1}, - { 0, 2, 14, -1, -1, -1, -1, -1}, - { 4, 14, -1, -1, -1, -1, -1, -1}, - { 0, 4, 14, -1, -1, -1, -1, -1}, - { 2, 4, 14, -1, -1, -1, -1, -1}, - { 0, 2, 4, 14, -1, -1, -1, -1}, - { 6, 14, -1, -1, -1, -1, -1, -1}, - { 0, 6, 14, -1, -1, -1, -1, -1}, - { 2, 6, 14, -1, -1, -1, -1, -1}, - { 0, 2, 6, 14, -1, -1, -1, -1}, - { 4, 6, 14, -1, -1, -1, -1, -1}, - { 0, 4, 6, 14, -1, -1, -1, -1}, - { 2, 4, 6, 14, -1, -1, -1, -1}, - { 0, 2, 4, 6, 14, -1, -1, -1}, - { 8, 14, -1, -1, -1, -1, -1, -1}, - { 0, 8, 14, -1, -1, -1, -1, -1}, - { 2, 8, 14, -1, -1, -1, -1, -1}, - { 0, 2, 8, 14, -1, -1, -1, -1}, - { 4, 8, 14, -1, -1, -1, -1, -1}, - { 0, 4, 8, 14, -1, -1, -1, -1}, - { 2, 4, 8, 14, -1, -1, -1, -1}, - { 0, 2, 4, 8, 14, -1, -1, -1}, - { 6, 8, 14, -1, -1, -1, -1, -1}, - { 0, 6, 8, 14, -1, -1, -1, -1}, - { 2, 6, 8, 14, -1, -1, -1, -1}, - { 0, 2, 6, 8, 14, -1, -1, -1}, - { 4, 6, 8, 14, -1, -1, -1, -1}, - { 0, 4, 6, 8, 14, -1, -1, -1}, - { 2, 4, 6, 8, 14, -1, -1, -1}, - { 0, 2, 4, 6, 8, 14, -1, -1}, - {10, 14, -1, -1, -1, -1, -1, -1}, - { 0, 10, 14, -1, -1, -1, -1, -1}, - { 2, 10, 14, -1, -1, -1, -1, -1}, - { 0, 2, 10, 14, -1, -1, -1, -1}, - { 4, 10, 14, -1, -1, -1, -1, -1}, - { 0, 4, 10, 14, -1, -1, -1, -1}, - { 2, 4, 10, 14, -1, -1, -1, -1}, - { 0, 2, 4, 10, 14, -1, -1, -1}, - { 6, 10, 14, -1, -1, -1, -1, -1}, - { 0, 6, 10, 14, -1, -1, -1, -1}, - { 2, 6, 10, 14, -1, -1, -1, -1}, - { 0, 2, 6, 10, 14, -1, -1, -1}, - { 4, 6, 10, 14, -1, -1, -1, -1}, - { 0, 4, 6, 10, 14, -1, -1, -1}, - { 2, 4, 6, 10, 14, -1, -1, -1}, - { 0, 2, 4, 6, 10, 14, -1, -1}, - { 8, 10, 14, -1, -1, -1, -1, -1}, - { 0, 8, 10, 14, -1, -1, -1, -1}, - { 2, 8, 10, 14, -1, -1, -1, -1}, - { 0, 2, 8, 10, 14, -1, -1, -1}, - { 4, 8, 10, 14, -1, -1, -1, -1}, - { 0, 4, 8, 10, 14, -1, -1, -1}, - { 2, 4, 8, 10, 14, -1, -1, -1}, - { 0, 2, 4, 8, 10, 14, -1, -1}, - { 6, 8, 10, 14, -1, -1, -1, -1}, - { 0, 6, 8, 10, 14, -1, -1, -1}, - { 2, 6, 8, 
10, 14, -1, -1, -1}, - { 0, 2, 6, 8, 10, 14, -1, -1}, - { 4, 6, 8, 10, 14, -1, -1, -1}, - { 0, 4, 6, 8, 10, 14, -1, -1}, - { 2, 4, 6, 8, 10, 14, -1, -1}, - { 0, 2, 4, 6, 8, 10, 14, -1}, - {12, 14, -1, -1, -1, -1, -1, -1}, - { 0, 12, 14, -1, -1, -1, -1, -1}, - { 2, 12, 14, -1, -1, -1, -1, -1}, - { 0, 2, 12, 14, -1, -1, -1, -1}, - { 4, 12, 14, -1, -1, -1, -1, -1}, - { 0, 4, 12, 14, -1, -1, -1, -1}, - { 2, 4, 12, 14, -1, -1, -1, -1}, - { 0, 2, 4, 12, 14, -1, -1, -1}, - { 6, 12, 14, -1, -1, -1, -1, -1}, - { 0, 6, 12, 14, -1, -1, -1, -1}, - { 2, 6, 12, 14, -1, -1, -1, -1}, - { 0, 2, 6, 12, 14, -1, -1, -1}, - { 4, 6, 12, 14, -1, -1, -1, -1}, - { 0, 4, 6, 12, 14, -1, -1, -1}, - { 2, 4, 6, 12, 14, -1, -1, -1}, - { 0, 2, 4, 6, 12, 14, -1, -1}, - { 8, 12, 14, -1, -1, -1, -1, -1}, - { 0, 8, 12, 14, -1, -1, -1, -1}, - { 2, 8, 12, 14, -1, -1, -1, -1}, - { 0, 2, 8, 12, 14, -1, -1, -1}, - { 4, 8, 12, 14, -1, -1, -1, -1}, - { 0, 4, 8, 12, 14, -1, -1, -1}, - { 2, 4, 8, 12, 14, -1, -1, -1}, - { 0, 2, 4, 8, 12, 14, -1, -1}, - { 6, 8, 12, 14, -1, -1, -1, -1}, - { 0, 6, 8, 12, 14, -1, -1, -1}, - { 2, 6, 8, 12, 14, -1, -1, -1}, - { 0, 2, 6, 8, 12, 14, -1, -1}, - { 4, 6, 8, 12, 14, -1, -1, -1}, - { 0, 4, 6, 8, 12, 14, -1, -1}, - { 2, 4, 6, 8, 12, 14, -1, -1}, - { 0, 2, 4, 6, 8, 12, 14, -1}, - {10, 12, 14, -1, -1, -1, -1, -1}, - { 0, 10, 12, 14, -1, -1, -1, -1}, - { 2, 10, 12, 14, -1, -1, -1, -1}, - { 0, 2, 10, 12, 14, -1, -1, -1}, - { 4, 10, 12, 14, -1, -1, -1, -1}, - { 0, 4, 10, 12, 14, -1, -1, -1}, - { 2, 4, 10, 12, 14, -1, -1, -1}, - { 0, 2, 4, 10, 12, 14, -1, -1}, - { 6, 10, 12, 14, -1, -1, -1, -1}, - { 0, 6, 10, 12, 14, -1, -1, -1}, - { 2, 6, 10, 12, 14, -1, -1, -1}, - { 0, 2, 6, 10, 12, 14, -1, -1}, - { 4, 6, 10, 12, 14, -1, -1, -1}, - { 0, 4, 6, 10, 12, 14, -1, -1}, - { 2, 4, 6, 10, 12, 14, -1, -1}, - { 0, 2, 4, 6, 10, 12, 14, -1}, - { 8, 10, 12, 14, -1, -1, -1, -1}, - { 0, 8, 10, 12, 14, -1, -1, -1}, - { 2, 8, 10, 12, 14, -1, -1, -1}, - { 0, 2, 8, 10, 12, 14, -1, -1}, - { 4, 8, 
10, 12, 14, -1, -1, -1}, - { 0, 4, 8, 10, 12, 14, -1, -1}, - { 2, 4, 8, 10, 12, 14, -1, -1}, - { 0, 2, 4, 8, 10, 12, 14, -1}, - { 6, 8, 10, 12, 14, -1, -1, -1}, - { 0, 6, 8, 10, 12, 14, -1, -1}, - { 2, 6, 8, 10, 12, 14, -1, -1}, - { 0, 2, 6, 8, 10, 12, 14, -1}, - { 4, 6, 8, 10, 12, 14, -1, -1}, - { 0, 4, 6, 8, 10, 12, 14, -1}, - { 2, 4, 6, 8, 10, 12, 14, -1}, - { 0, 2, 4, 6, 8, 10, 12, 14} - } -}; #define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -#define REJ_UNIFORM_BUFLEN 672 -unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, - const uint8_t *restrict buf) { +unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; uint32_t good; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); + uint64_t idx0, idx1, idx2, idx3; + const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XQ]); - const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XV]); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11, 10, 10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); __m256i f0, f1, g0, g1, g2, g3; __m128i f, t, pilo, pihi; - ctr = 0; - for (pos = 0; pos < 2 * KYBER_N; pos += 64) { - f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); - f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); + ctr = pos = 0; + while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { + f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + 
f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; - g0 = _mm256_cmpge_epu16(bound, f0); - g1 = _mm256_cmpge_epu16(bound, f1); + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = _mm256_cmpgt_epi16(bound, f1); g0 = _mm256_packs_epi16(g0, g1); good = _mm256_movemask_epi8(g0); - g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); - g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); - g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); - g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); - //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); - //g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); - - /* Barrett reduction of (still unsigned) values */ - g2 = _mm256_mulhi_epu16(f0, v); - g3 = _mm256_mulhi_epu16(f1, v); - g2 = _mm256_srli_epi16(g2, 10); - g3 = _mm256_srli_epi16(g3, 10); - g2 = _mm256_mullo_epi16(g2, kyberq); - g3 = _mm256_mullo_epi16(g3, kyberq); - f0 = _mm256_sub_epi16(f0, g2); - f1 = _mm256_sub_epi16(f1, g3); + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = 
_mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); g2 = _mm256_add_epi8(g0, ones); g3 = _mm256_add_epi8(g1, ones); @@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { - f = _mm_load_si128((__m128i *)&buf[pos]); - t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { + f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); good = _mm_movemask_epi8(t); - good = _pext_u32(good, 0x5555); - pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); + + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - - /* Barrett reduction */ - t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); - t = _mm_srli_epi16(t, 10); - t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); - f = _mm_sub_epi16(f, t); - f = _mm_shuffle_epi8(f, pilo); _mm_storeu_si128((__m128i *)&r[ctr], f); ctr += _mm_popcnt_u32(good); - pos += 16; } - while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = val; + if (val0 < 
KYBER_Q) { + r[ctr++] = val0; + } + if (val1 < KYBER_Q && ctr < KYBER_N) { + r[ctr++] = val1; } } diff --git a/crypto_kem/kyber1024/avx2/rejsample.h b/crypto_kem/kyber1024/avx2/rejsample.h index 3c3f3aeb..b9517b51 100644 --- a/crypto_kem/kyber1024/avx2/rejsample.h +++ b/crypto_kem/kyber1024/avx2/rejsample.h @@ -1,9 +1,12 @@ #ifndef PQCLEAN_KYBER1024_AVX2_REJSAMPLE_H #define PQCLEAN_KYBER1024_AVX2_REJSAMPLE_H #include "params.h" +#include "symmetric.h" #include -unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, - const unsigned char *buf); +#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) + +unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); #endif diff --git a/crypto_kem/kyber1024/avx2/shuffle.S b/crypto_kem/kyber1024/avx2/shuffle.S index d706ccc1..05c26338 100644 --- a/crypto_kem/kyber1024/avx2/shuffle.S +++ b/crypto_kem/kyber1024/avx2/shuffle.S @@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 #csubq csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,1 +csubq 6,13 +csubq 7,13 +csubq 8,13 csubq 9,13 -csubq 10,14 -csubq 11,15 -csubq 12,1 +csubq 10,13 +csubq 11,13 +csubq 12,13 #bitpack vpsllw $12,%ymm6,%ymm4 diff --git a/crypto_kem/kyber1024/avx2/shuffle.inc b/crypto_kem/kyber1024/avx2/shuffle.inc index d4b092bc..73e9ffe0 100644 --- a/crypto_kem/kyber1024/avx2/shuffle.inc +++ b/crypto_kem/kyber1024/avx2/shuffle.inc @@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm 
.macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_kem/kyber1024/avx2/symmetric-shake.c b/crypto_kem/kyber1024/avx2/symmetric-shake.c index c7c4cd5a..9293481c 100644 --- a/crypto_kem/kyber1024/avx2/symmetric-shake.c +++ b/crypto_kem/kyber1024/avx2/symmetric-shake.c @@ -9,12 +9,10 @@ * * Description: Absorb step of the SHAKE128 specialized for the Kyber context. * -* Arguments: - xof_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *seed: pointer to KYBER_SYMBYTES input -* to be absorbed into state -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input **************************************************/ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, const uint8_t seed[KYBER_SYMBYTES], @@ -26,8 +24,8 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, for (i = 0; i < KYBER_SYMBYTES; i++) { extseed[i] = seed[i]; } - extseed[i++] = x; - extseed[i] = y; + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; shake128_absorb(state, extseed, sizeof(extseed)); } @@ -38,23 +36,19 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input * and then generates outlen bytes of SHAKE256 output * -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t *key: pointer to the key -* (of length KYBER_SYMBYTES) 
-* - uint8_t nonce: single-byte nonce (public PRF input) +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce) { +void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { unsigned int i; uint8_t extkey[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey[i] = key[i]; } - extkey[i] = nonce; + extkey[KYBER_SYMBYTES] = nonce; shake256(out, outlen, extkey, sizeof(extkey)); } diff --git a/crypto_kem/kyber1024/avx2/symmetric.h b/crypto_kem/kyber1024/avx2/symmetric.h index c7b1c5de..a55def59 100644 --- a/crypto_kem/kyber1024/avx2/symmetric.h +++ b/crypto_kem/kyber1024/avx2/symmetric.h @@ -15,21 +15,16 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *s, uint8_t x, uint8_t y); -void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce); +void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) #define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - 
PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) diff --git a/crypto_kem/kyber1024/avx2/verify.c b/crypto_kem/kyber1024/avx2/verify.c index 5ec705f0..f43a1e49 100644 --- a/crypto_kem/kyber1024/avx2/verify.c +++ b/crypto_kem/kyber1024/avx2/verify.c @@ -8,31 +8,31 @@ * * Description: Compare two arrays for equality in constant time. * -* Arguments: const unsigned char *a: pointer to first byte array -* const unsigned char *b: pointer to second byte array -* size_t len: length of the byte arrays +* Arguments: const uint8_t *a: pointer to first byte array +* const uint8_t *b: pointer to second byte array +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; + size_t i; uint64_t r; - __m256i avec, bvec, cvec; + __m256i f, g, h; - cvec = _mm256_setzero_si256(); - for (pos = 0; pos + 32 <= len; pos += 32) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - avec = _mm256_xor_si256(avec, bvec); - cvec = _mm256_or_si256(cvec, avec); + h = _mm256_setzero_si256(); + for (i = 0; i < len / 32; i++) { + f = _mm256_loadu_si256((__m256i *)&a[32 * i]); + g = _mm256_loadu_si256((__m256i *)&b[32 * i]); + f = _mm256_xor_si256(f, g); + h = _mm256_or_si256(h, f); } - r = 1 - _mm256_testz_si256(cvec, cvec); + r = 1 - _mm256_testz_si256(h, h); - if (pos < len) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - cvec = _mm256_cmpeq_epi8(avec, bvec); - r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); + a += 32 * i; + b += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r |= 
a[i] ^ b[i]; } r = (-r) >> 63; @@ -47,29 +47,27 @@ int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len * assumes two's complement representation of negative integers. * Runs in constant time. * -* Arguments: unsigned char *r: pointer to output byte array +* Arguments: unsigned char *r: pointer to output byte array * const unsigned char *x: pointer to input byte array -* size_t len: Amount of bytes to be copied -* unsigned char b: Condition bit; has to be in {0,1} +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; __m256i xvec, rvec, bvec; - b = -b; - bvec = _mm256_set1_epi8(b); - - for (pos = 0; pos + 32 <= len; pos += 32) { - rvec = _mm256_loadu_si256((__m256i *)&r[pos]); - xvec = _mm256_loadu_si256((__m256i *)&x[pos]); - xvec = _mm256_xor_si256(xvec, rvec); - xvec = _mm256_and_si256(xvec, bvec); - rvec = _mm256_xor_si256(rvec, xvec); - _mm256_storeu_si256((__m256i *)&r[pos], rvec); + bvec = _mm256_set1_epi64x(-(uint64_t)b); + for (i = 0; i < len / 32; i++) { + rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); + xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); + rvec = _mm256_blendv_epi8(rvec, xvec, bvec); + _mm256_storeu_si256((__m256i *)&r[32 * i], rvec); } - while (pos < len) { - r[pos] ^= b & (x[pos] ^ r[pos]); - pos += 1; + r += 32 * i; + x += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r[i] ^= -b & (x[i] ^ r[i]); } } diff --git a/crypto_kem/kyber1024/clean/cbd.c b/crypto_kem/kyber1024/clean/cbd.c index 51a040a4..92610fff 100644 --- a/crypto_kem/kyber1024/clean/cbd.c +++ b/crypto_kem/kyber1024/clean/cbd.c @@ -5,7 +5,7 @@ /************************************************* * Name: 
load32_littleendian * -* Description: load bytes into a 32-bit integer +* Description: load 4 bytes into a 32-bit integer * in little-endian order * * Arguments: - const uint8_t *x: pointer to input byte array @@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { } /************************************************* -* Name: PQCLEAN_KYBER1024_CLEAN_cbd +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ + + +/************************************************* +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { +static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; int16_t a, b; @@ -48,3 +61,23 @@ void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N } } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. 
+* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ + +void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber1024/clean/cbd.h b/crypto_kem/kyber1024/clean/cbd.h index dcc44012..a66d259a 100644 --- a/crypto_kem/kyber1024/clean/cbd.h +++ b/crypto_kem/kyber1024/clean/cbd.h @@ -4,6 +4,8 @@ #include "poly.h" #include -void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); + +void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber1024/clean/indcpa.c b/crypto_kem/kyber1024/clean/indcpa.c index 1d86f8d8..a78f9139 100644 --- a/crypto_kem/kyber1024/clean/indcpa.c +++ b/crypto_kem/kyber1024/clean/indcpa.c @@ -15,8 +15,8 @@ * serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key -* polynomial vector -* - uint8_t *seed: pointer to output seed to generate -* matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ static void unpack_pk(polyvec *pk, @@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, * * Description: Serialize the secret key * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of -* polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key 
**************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, * and the compressed and serialized polynomial v * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER1024_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER1024_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER1024_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection 
sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= (val >> 12) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T -* is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) // Not static for benchmarking void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -182,12 +173,17 @@ void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYM } xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); while (ctr < KYBER_N) { - xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + xof_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); } xof_ctx_release(&state); } @@ -220,10 +216,10 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT gen_a(a, publicseed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); + 
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); + PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&skpv); @@ -231,7 +227,7 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER1024_CLEAN_poly_tomont(&pkpv.vec[i]); } @@ -248,16 +244,15 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], @@ -266,7 +261,7 @@ void 
PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], unsigned int i; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; unpack_pk(&pkpv, seed, pk); @@ -274,32 +269,32 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], gen_at(at, seed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); + PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); + PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); } - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&epp, coins, nonce++); + PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&b); PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&v); - PQCLEAN_KYBER1024_CLEAN_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER1024_CLEAN_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &epp); PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &k); - PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(&bp); + PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(&b); PQCLEAN_KYBER1024_CLEAN_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -308,24 +303,24 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption 
function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&b); + PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER1024_CLEAN_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber1024/clean/kem.c b/crypto_kem/kyber1024/clean/kem.c index 436161df..d108f29a 100644 --- a/crypto_kem/kyber1024/clean/kem.c +++ b/crypto_kem/kyber1024/clean/kem.c @@ -14,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char 
*pk, unsigned char *sk) { +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,17 +40,17 @@ int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; @@ -79,19 +80,19 @@ int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an 
already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; uint8_t buf[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber1024/clean/ntt.c b/crypto_kem/kyber1024/clean/ntt.c index 1e612a55..7f7577c4 100644 --- a/crypto_kem/kyber1024/clean/ntt.c +++ b/crypto_kem/kyber1024/clean/ntt.c @@ -3,11 +3,11 @@ #include "reduce.h" #include -/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas and PQCLEAN_KYBER1024_CLEAN_zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas and zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 -static const uint16_t tree[128] = { +static const uint8_t tree[128] = { 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, 4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, 2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, @@ -19,51 +19,41 @@ static const uint16_t tree[128] = { }; void init_ntt() { - unsigned int i, j, k; + unsigned int i; int16_t tmp[128]; tmp[0] = MONT; - for(i = 1; i < 128; ++i) - tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); + for(i=1;i<128;i++) + tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); - for(i = 0; i < 128; ++i) + for(i=0;i<128;i++) { PQCLEAN_KYBER1024_CLEAN_zetas[i] = tmp[tree[i]]; - - k = 0; - for(i = 64; i >= 1; i >>= 1) - for(j = i; j < 2*i; ++j) - PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - - PQCLEAN_KYBER1024_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + 
if(PQCLEAN_KYBER1024_CLEAN_zetas[i] > KYBER_Q/2) + PQCLEAN_KYBER1024_CLEAN_zetas[i] -= KYBER_Q; + if(PQCLEAN_KYBER1024_CLEAN_zetas[i] < -KYBER_Q/2) + PQCLEAN_KYBER1024_CLEAN_zetas[i] += KYBER_Q; + } } - */ const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, - 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, - 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, - 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, - 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, - 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, - 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, - 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, - 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 -}; - -const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, - 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, - 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, - 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, - 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, - 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, - 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, - 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, - 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 -}; + -1044, -758, -359, -1517, 1493, 1422, 287, 202, + -171, 622, 1577, 182, 962, -1202, -1474, 1468, + 573, -1325, 264, 383, -829, 1458, -1602, -130, + -681, 1017, 732, 608, -1542, 
411, -205, -1571, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -1103, 430, 555, 843, -1251, 871, 1550, 105, + 422, 587, 177, -235, -291, -460, 1574, 1653, + -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, + 817, 1097, 603, 610, 1322, -1285, -1465, 384, + -1215, -136, 1218, -1335, -874, 220, -1187, -1659, + -1185, -1530, -1278, 794, -1510, -854, -870, 478, + -108, -308, 996, 991, 958, -1460, 1522, 1628 + }; /************************************************* * Name: fqmul @@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { /************************************************* * Name: PQCLEAN_KYBER1024_CLEAN_ntt * -* Description: Inplace number-theoretic transform (NTT) in Rq +* Description: Inplace number-theoretic transform (NTT) in Rq. * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { unsigned int len, start, j, k; @@ -96,7 +85,7 @@ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { for (len = 128; len >= 2; len >>= 1) { for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { + for (j = start; j < start + len; j++) { t = fqmul(zeta, r[j + len]); r[j + len] = r[j] - t; r[j] = r[j] + t; @@ -112,28 +101,28 @@ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { * multiplication by Montgomery factor 2^16. 
* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) { unsigned int start, len, j, k; int16_t t, zeta; + const int16_t f = 1441; // mont^2/128 - k = 0; + k = 127; for (len = 2; len <= 128; len <<= 1) { for (start = 0; start < 256; start = j + len) { - zeta = PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { + zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k--]; + for (j = start; j < start + len; j++) { t = r[j]; r[j] = PQCLEAN_KYBER1024_CLEAN_barrett_reduce(t + r[j + len]); - r[j + len] = t - r[j + len]; + r[j + len] = r[j + len] - t; r[j + len] = fqmul(zeta, r[j + len]); } } } - for (j = 0; j < 256; ++j) { - r[j] = fqmul(r[j], PQCLEAN_KYBER1024_CLEAN_zetas_inv[127]); + for (j = 0; j < 256; j++) { + r[j] = fqmul(r[j], f); } } @@ -143,19 +132,15 @@ void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) { * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta) { +void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); r[1] += fqmul(a[1], b[0]); 
} diff --git a/crypto_kem/kyber1024/clean/ntt.h b/crypto_kem/kyber1024/clean/ntt.h index d8eaee82..8d17710a 100644 --- a/crypto_kem/kyber1024/clean/ntt.h +++ b/crypto_kem/kyber1024/clean/ntt.h @@ -5,15 +5,10 @@ extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128]; - void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]); void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]); -void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta); +void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber1024/clean/params.h b/crypto_kem/kyber1024/clean/params.h index b604d6d8..210618a7 100644 --- a/crypto_kem/kyber1024/clean/params.h +++ b/crypto_kem/kyber1024/clean/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,20 +14,20 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 160 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) -#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_ETA2 2 + +#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ - + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) /* 32 bytes of additional space to save H(pk) */ -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ - + KYBER_INDCPA_PUBLICKEYBYTES \ - + 2*KYBER_SYMBYTES) -#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES +#define 
KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) +#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) #endif diff --git a/crypto_kem/kyber1024/clean/poly.c b/crypto_kem/kyber1024/clean/poly.c index e9a7b1a3..8eb73f74 100644 --- a/crypto_kem/kyber1024/clean/poly.c +++ b/crypto_kem/kyber1024/clean/poly.c @@ -13,17 +13,19 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { +void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { size_t i, j; + int16_t u; uint8_t t[8]; - PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; + // map to positive standard representatives + u = a->coeffs[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; } r[0] = (t[0] >> 0) | (t[1] << 5); @@ -41,7 +43,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], * Description: De-serialization and subsequent decompression of a polynomial; * approximate inverse of PQCLEAN_KYBER1024_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ @@ -74,20 +76,21 @@ void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLY * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - const poly *a: 
pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { +void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { size_t i; uint16_t t0, t1; - PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 2; i++) { - t0 = a->coeffs[2 * i]; + // map to positive standard representatives + t0 = a->coeffs[2 * i]; + t0 += ((int16_t)t0 >> 15) & KYBER_Q; t1 = a->coeffs[2 * i + 1]; - r[3 * i + 0] = (uint8_t)(t0 >> 0); - r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + t1 += ((int16_t)t1 >> 15) & KYBER_Q; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } @@ -97,7 +100,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER1024_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ @@ -114,7 +117,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYB * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { @@ -135,41 +138,60 @@ void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCP * Description: Convert polynomial to 32-byte message * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial 
**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { +void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { size_t i, j; uint16_t t; - PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { msg[i] = 0; for (j = 0; j < 8; j++) { - t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + t = a->coeffs[8 * i + j]; + t += ((int16_t)t >> 15) & KYBER_Q; + t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; msg[i] |= t << j; } } } /************************************************* -* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise +* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; +void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; prf(buf, sizeof(buf), seed, nonce); - PQCLEAN_KYBER1024_CLEAN_cbd(r, buf); + PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(r, buf); } +/************************************************* +* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly 
*r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; + prf(buf, sizeof(buf), seed, nonce); + PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(r, buf); +} + + /************************************************* * Name: PQCLEAN_KYBER1024_CLEAN_poly_ntt * @@ -202,7 +224,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r) { * * Description: Multiplication of two polynomials in NTT domain * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -210,8 +232,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, con size_t i; for (i = 0; i < KYBER_N / 4; i++) { PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], - -PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); } } @@ -246,28 +267,12 @@ void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER1024_CLEAN_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r) { - size_t i; - for (i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER1024_CLEAN_csubq(r->coeffs[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER1024_CLEAN_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials; no modular reduction is performed * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -281,7 +286,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { /************************************************* * Name: PQCLEAN_KYBER1024_CLEAN_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials; no modular reduction is performed * * Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial diff --git a/crypto_kem/kyber1024/clean/poly.h b/crypto_kem/kyber1024/clean/poly.h index 52d7e7e3..ba9cb704 100644 --- a/crypto_kem/kyber1024/clean/poly.h +++ b/crypto_kem/kyber1024/clean/poly.h @@ -11,16 +11,18 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); -void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); +void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); void 
PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); -void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); +void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); -void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER1024_CLEAN_poly_ntt(poly *r); void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r); @@ -28,7 +30,6 @@ void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, con void PQCLEAN_KYBER1024_CLEAN_poly_tomont(poly *r); void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r); -void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r); void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER1024_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber1024/clean/polyvec.c b/crypto_kem/kyber1024/clean/polyvec.c index dac115b7..b2175f4c 100644 --- a/crypto_kem/kyber1024/clean/polyvec.c +++ b/crypto_kem/kyber1024/clean/polyvec.c @@ -10,19 +10,18 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { unsigned int i, j, k; - PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(a); - uint16_t t[8]; for 
(i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 8; j++) { for (k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) - / KYBER_Q) & 0x7ff; + t[k] = a->vec[i].coeffs[8 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; } r[ 0] = (uint8_t)(t[0] >> 0); @@ -51,8 +50,7 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDB * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; uint16_t t[8]; @@ -82,9 +80,9 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { unsigned int i; for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); @@ -138,18 +136,16 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements of a and b in NTT domain, accumulate into r, * and multiply by 2^-16. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { unsigned int i; poly t; @@ -166,10 +162,10 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, * Name: PQCLEAN_KYBER1024_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) { unsigned int i; @@ -178,29 +174,12 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r) { - unsigned int i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER1024_CLEAN_polyvec_add * * Description: Add vectors of polynomials * -* Arguments: - polyvec *r: pointer to 
output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ diff --git a/crypto_kem/kyber1024/clean/polyvec.h b/crypto_kem/kyber1024/clean/polyvec.h index fc6477c5..31c0e351 100644 --- a/crypto_kem/kyber1024/clean/polyvec.h +++ b/crypto_kem/kyber1024/clean/polyvec.h @@ -8,22 +8,18 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); +void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); +void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); void PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER1024_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber1024/clean/reduce.c b/crypto_kem/kyber1024/clean/reduce.c index 35dadb26..6ddb6a52 100644 --- a/crypto_kem/kyber1024/clean/reduce.c +++ 
b/crypto_kem/kyber1024/clean/reduce.c @@ -6,8 +6,7 @@ * Name: PQCLEAN_KYBER1024_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes -* 16-bit integer congruent to a * R^-1 mod q, -* where R=2^16 +* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 * * Arguments: - int32_t a: input integer to be reduced; * has to be in {-q2^15,...,q2^15-1} @@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a) { * Name: PQCLEAN_KYBER1024_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes -* 16-bit integer congruent to a mod q in {0,...,q} +* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} * * Arguments: - int16_t a: input integer to be reduced * -* Returns: integer in {0,...,q} congruent to a modulo q. +* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. **************************************************/ int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a) { int16_t t; const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = (int32_t)v * a >> 26; + t = ((int32_t)v * a + (1 << 25)) >> 26; t *= KYBER_Q; return a - t; } - -/************************************************* -* Name: PQCLEAN_KYBER1024_CLEAN_csubq -* -* Description: Conditionallly subtract q -* -* Arguments: - int16_t x: input integer -* -* Returns: a - q if a >= q, else a -**************************************************/ -int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a) { - a -= KYBER_Q; - a += (a >> 15) & KYBER_Q; - return a; -} diff --git a/crypto_kem/kyber1024/clean/reduce.h b/crypto_kem/kyber1024/clean/reduce.h index 7bfc2c30..34e86031 100644 --- a/crypto_kem/kyber1024/clean/reduce.h +++ b/crypto_kem/kyber1024/clean/reduce.h @@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a); int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a); -int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a); - #endif diff --git 
a/crypto_kem/kyber1024/clean/symmetric-shake.c b/crypto_kem/kyber1024/clean/symmetric-shake.c index ff85bdd0..40e24a82 100644 --- a/crypto_kem/kyber1024/clean/symmetric-shake.c +++ b/crypto_kem/kyber1024/clean/symmetric-shake.c @@ -9,12 +9,10 @@ * * Description: Absorb step of the SHAKE128 specialized for the Kyber context. * -* Arguments: - xof_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *seed: pointer to KYBER_SYMBYTES input -* to be absorbed into state -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input **************************************************/ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, const uint8_t seed[KYBER_SYMBYTES], @@ -26,8 +24,8 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, for (i = 0; i < KYBER_SYMBYTES; i++) { extseed[i] = seed[i]; } - extseed[i++] = x; - extseed[i] = y; + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; shake128_absorb(state, extseed, sizeof(extseed)); } @@ -38,23 +36,19 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input * and then generates outlen bytes of SHAKE256 output * -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t *key: pointer to the key -* (of length KYBER_SYMBYTES) -* - uint8_t nonce: single-byte nonce (public PRF input) +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) 
**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce) { +void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { unsigned int i; uint8_t extkey[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey[i] = key[i]; } - extkey[i] = nonce; + extkey[KYBER_SYMBYTES] = nonce; shake256(out, outlen, extkey, sizeof(extkey)); } diff --git a/crypto_kem/kyber1024/clean/symmetric.h b/crypto_kem/kyber1024/clean/symmetric.h index 650ce3f0..e7b9ba66 100644 --- a/crypto_kem/kyber1024/clean/symmetric.h +++ b/crypto_kem/kyber1024/clean/symmetric.h @@ -14,21 +14,16 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *s, uint8_t x, uint8_t y); -void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce); +void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) #define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) diff --git a/crypto_kem/kyber512-90s/META.yml 
b/crypto_kem/kyber512-90s/META.yml index e58eac45..14f1639f 100644 --- a/crypto_kem/kyber512-90s/META.yml +++ b/crypto_kem/kyber512-90s/META.yml @@ -3,10 +3,10 @@ type: kem claimed-nist-level: 1 claimed-security: IND-CCA2 length-public-key: 800 -length-ciphertext: 736 +length-ciphertext: 768 length-secret-key: 1632 length-shared-secret: 32 -nistkat-sha256: d081dafce242de5d2a9b1cfe2b304cf5ebaed71b7a91f028fefd569693307d45 +nistkat-sha256: 7bfe0653b63b3fac7ee300a6e4801046c1a3d8d445b271633b6c9d81ed125e5b principal-submitters: - Peter Schwabe auxiliary-submitters: @@ -21,9 +21,9 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/kyber512-90s/avx2/aes256ctr.c b/crypto_kem/kyber512-90s/avx2/aes256ctr.c index 51c625ab..c55c0eb3 100644 --- a/crypto_kem/kyber512-90s/avx2/aes256ctr.c +++ b/crypto_kem/kyber512-90s/avx2/aes256ctr.c @@ -2,52 +2,48 @@ #include #include #include -/* - Based heavily on public-domain code by Romain Dolbeau - Different handling of nonce+counter than original version - using separated 64-bit nonce and internal 64-bit counter, starting from zero - Public Domain -*/ +/* Based heavily on public-domain code by Romain Dolbeau + * Different handling of nonce+counter than original version using + 
* separated 64-bit nonce and internal 64-bit counter, starting from zero + * Public Domain */ -static inline void aesni_encrypt4(uint8_t out[64], - __m128i *n, - const __m128i rkeys[16]) { - __m128i f, f0, f1, f2, f3, t; +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { + __m128i f, f0, f1, f2, f3; + const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); /* Load current counter value */ f = _mm_load_si128(n); /* Increase counter in 4 consecutive blocks */ - t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); - f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); - f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); - f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); + f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); /* Write counter for next iteration, increased by 4 */ _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); /* Actual AES encryption, 4x interleaved */ - t = _mm_load_si128(&rkeys[0]); - f0 = _mm_xor_si128(f0, t); - f1 = _mm_xor_si128(f1, t); - f2 = _mm_xor_si128(f2, t); - f3 = _mm_xor_si128(f3, t); + f = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, f); + f1 = _mm_xor_si128(f1, f); + f2 = _mm_xor_si128(f2, f); + f3 = _mm_xor_si128(f3, f); for (int i = 1; i < 14; i++) { - t = _mm_load_si128(&rkeys[i]); - f0 = _mm_aesenc_si128(f0, t); - f1 = _mm_aesenc_si128(f1, t); - f2 = _mm_aesenc_si128(f2, t); - f3 = _mm_aesenc_si128(f3, t); + f = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, f); + f1 = _mm_aesenc_si128(f1, f); + f2 = _mm_aesenc_si128(f2, f); + f3 = _mm_aesenc_si128(f3, f); } - t = 
_mm_load_si128(&rkeys[14]); - f0 = _mm_aesenclast_si128(f0, t); - f1 = _mm_aesenclast_si128(f1, t); - f2 = _mm_aesenclast_si128(f2, t); - f3 = _mm_aesenclast_si128(f3, t); + f = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, f); + f1 = _mm_aesenclast_si128(f1, f); + f2 = _mm_aesenclast_si128(f2, f); + f3 = _mm_aesenclast_si128(f3, f); /* Write results */ _mm_storeu_si128((__m128i *)(out + 0), f0); @@ -134,6 +130,7 @@ void PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(uint8_t *out, while (outlen >= 64) { aesni_encrypt4(out, &state.n, state.rkeys); outlen -= 64; + out += 64; } if (outlen) { diff --git a/crypto_kem/kyber512-90s/avx2/align.h b/crypto_kem/kyber512-90s/avx2/align.h index c6e88d68..94413cc3 100644 --- a/crypto_kem/kyber512-90s/avx2/align.h +++ b/crypto_kem/kyber512-90s/avx2/align.h @@ -2,22 +2,18 @@ #define PQCLEAN_KYBER51290S_AVX2_ALIGN_H #include +#include -#define ALIGN16_TYPE(t) \ - union { \ - __m128i vec; \ - t orig; \ +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[(N)]; \ + __m256i vec[((N)+31)/32]; \ } -#define ALIGN32_ARRAY(t, s) \ - union { \ - __m256i vec; \ - t arr[(s)]; \ +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[(N)]; \ + __m256i vec[((N)+15)/16]; \ } -#define ALIGN32_ARRAY_2D(t, n, m) \ - union { \ - __m256i vec; \ - t arr[(n)][(m)]; \ - } #endif diff --git a/crypto_kem/kyber512-90s/avx2/api.h b/crypto_kem/kyber512-90s/avx2/api.h index 9fe45e18..2e7ce509 100644 --- a/crypto_kem/kyber512-90s/avx2/api.h +++ b/crypto_kem/kyber512-90s/avx2/api.h @@ -5,7 +5,7 @@ #define PQCLEAN_KYBER51290S_AVX2_CRYPTO_SECRETKEYBYTES 1632 #define PQCLEAN_KYBER51290S_AVX2_CRYPTO_PUBLICKEYBYTES 800 -#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_CIPHERTEXTBYTES 768 #define PQCLEAN_KYBER51290S_AVX2_CRYPTO_BYTES 32 #define PQCLEAN_KYBER51290S_AVX2_CRYPTO_ALGNAME "Kyber512-90s" diff --git a/crypto_kem/kyber512-90s/avx2/basemul.S b/crypto_kem/kyber512-90s/avx2/basemul.S index 
a3f2ec5e..ffd8d413 100644 --- a/crypto_kem/kyber512-90s/avx2/basemul.S +++ b/crypto_kem/kyber512-90s/avx2/basemul.S @@ -1,216 +1,107 @@ #include "cdecl.h" -#include "params.h" -.macro schoolbook off,sign -#load -vmovdqa \off+32(%rsi),%ymm7 # b -vmovdqa \off+32(%rdx),%ymm8 # d -vmovdqa \off(%rsi),%ymm9 # a -vmovdqa \off(%rdx),%ymm10 # c +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 -#mul -vpmullw %ymm7,%ymm8,%ymm11 # bd.lo -vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi -vpmullw %ymm7,%ymm10,%ymm13 # bc.lo -vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi -vpmullw %ymm9,%ymm8,%ymm14 # ad.lo -vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi -vpmullw %ymm9,%ymm10,%ymm15 # ac.lo -vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo -#reduce -vpmullw %ymm1,%ymm11,%ymm11 -vpmulhw %ymm0,%ymm11,%ymm11 -vpsubw %ymm11,%ymm12,%ymm11 # bd +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 -#mul -vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo -vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi -#unpack -vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 -vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 -vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 -vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 -vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 -vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 -vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 -vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 -#add -.ifeq \sign -vpaddd %ymm14,%ymm15,%ymm14 # x0 -vpaddd %ymm9,%ymm10,%ymm9 # x1 -.else -vpsubd %ymm15,%ymm14,%ymm14 # x0 -vpsubd %ymm10,%ymm9,%ymm9 # x1 -.endif -vpaddd 
%ymm12,%ymm13,%ymm12 # y0 -vpaddd %ymm7,%ymm8,%ymm7 # y1 -.endm +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi -.macro red a0,a1,b0,b1,x,y,z -#pack -vpxor %ymm\x,%ymm\x,%ymm\x -vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z -vpsrld $16,%ymm\a0,%ymm\a0 -vpsrld $16,%ymm\a1,%ymm\a1 -vpackusdw %ymm\z,%ymm\y,%ymm\z -vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 -vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x -vpsrld $16,%ymm\b0,%ymm\b0 -vpsrld $16,%ymm\b1,%ymm\b1 -vpackusdw %ymm\x,%ymm\y,%ymm\y -vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 +vmovdqa %ymm13,(%rsp) -#reduce -vpmullw %ymm1,%ymm\z,%ymm\z -vpmullw %ymm1,%ymm\y,%ymm\y -vpmulhw %ymm0,%ymm\z,%ymm\z -vpmulhw %ymm0,%ymm\y,%ymm\y -vpsubw %ymm\z,%ymm\a0,%ymm\a0 -vpsubw %ymm\y,%ymm\b0,%ymm\b0 +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 +vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw 
%ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -basemul64_acc_avx: -poly0.0: -schoolbook 0,0 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.0: -schoolbook 512,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm5,32(%rdi) - -poly0.1: -schoolbook 64,1 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.1: -schoolbook 576,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm5,96(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx) -.global _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx) -cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx): -_cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 - -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -ret - -basemul64_avx: -schoolbook 0,0 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,(%rdi) -vmovdqa %ymm12,32(%rdi) - -schoolbook 64,1 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,64(%rdi) -vmovdqa %ymm12,96(%rdi) - -ret - .global 
cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx) .global _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx) cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx): _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_avx +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 1 -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $192*2,%r9 +schoolbook 2 -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 3 +mov %r8,%rsp ret diff --git a/crypto_kem/kyber512-90s/avx2/cbd.c b/crypto_kem/kyber512-90s/avx2/cbd.c index 33a14f63..8898cbe2 100644 --- a/crypto_kem/kyber512-90s/avx2/cbd.c +++ b/crypto_kem/kyber512-90s/avx2/cbd.c @@ -4,66 +4,125 @@ #include /************************************************* -* Name: PQCLEAN_KYBER51290S_AVX2_cbd +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *buf: pointer to input byte array +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { +static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { unsigned int i; - __m256i vec0, vec1, vec2, vec3, tmp; + __m256i f0, f1, f2, f3; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 
= _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); for (i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); + f0 = _mm256_load_si256(&buf[i]); - vec1 = _mm256_srli_epi32(vec0, 1); - vec0 = _mm256_and_si256(mask55, vec0); - vec1 = _mm256_and_si256(mask55, vec1); - vec0 = _mm256_add_epi32(vec0, vec1); + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); - vec1 = _mm256_srli_epi32(vec0, 2); - vec0 = _mm256_and_si256(mask33, vec0); - vec1 = _mm256_and_si256(mask33, vec1); + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); - vec2 = _mm256_srli_epi32(vec0, 4); - vec3 = _mm256_srli_epi32(vec1, 4); - vec0 = _mm256_and_si256(mask03, vec0); - vec1 = _mm256_and_si256(mask03, vec1); - vec2 = _mm256_and_si256(mask03, vec2); - vec3 = _mm256_and_si256(mask03, vec3); + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); - vec1 = _mm256_sub_epi8(vec0, vec1); - vec3 = _mm256_sub_epi8(vec2, vec3); + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); - vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); - vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); - vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); - vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); - tmp = _mm256_unpacklo_epi16(vec0, vec2); - vec2 = _mm256_unpackhi_epi16(vec0, 
vec2); - vec0 = tmp; - tmp = _mm256_unpacklo_epi16(vec1, vec3); - vec3 = _mm256_unpackhi_epi16(vec1, vec3); - vec1 = tmp; - - tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); - vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); - vec0 = tmp; - tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); - vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); - vec1 = tmp; - - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); + _mm256_store_si256(&r->vec[4 * i + 0], f0); + _mm256_store_si256(&r->vec[4 * i + 1], f2); + _mm256_store_si256(&r->vec[4 * i + 2], f1); + _mm256_store_si256(&r->vec[4 * i + 3], f3); } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3 +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array +**************************************************/ +static void cbd3(poly *restrict r, const uint8_t buf[3 * KYBER_N / 4 + 8]) { + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i mask249 = _mm256_set1_epi32(0x249249); + const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB); + const __m256i mask07 = _mm256_set1_epi32(7); + const __m256i mask70 = _mm256_set1_epi32(7 << 16); + const __m256i mask3 = _mm256_set1_epi16(3); + const __m256i shufbidx = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_loadu_si256((__m256i *)&buf[24 * i]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + + f1 
= _mm256_srli_epi32(f0, 1); + f2 = _mm256_srli_epi32(f0, 2); + f0 = _mm256_and_si256(mask249, f0); + f1 = _mm256_and_si256(mask249, f1); + f2 = _mm256_and_si256(mask249, f2); + f0 = _mm256_add_epi32(f0, f1); + f0 = _mm256_add_epi32(f0, f2); + + f1 = _mm256_srli_epi32(f0, 3); + f0 = _mm256_add_epi32(f0, mask6DB); + f0 = _mm256_sub_epi32(f0, f1); + + f1 = _mm256_slli_epi32(f0, 10); + f2 = _mm256_srli_epi32(f0, 12); + f3 = _mm256_srli_epi32(f0, 2); + f0 = _mm256_and_si256(f0, mask07); + f1 = _mm256_and_si256(f1, mask70); + f2 = _mm256_and_si256(f2, mask07); + f3 = _mm256_and_si256(f3, mask70); + f0 = _mm256_add_epi16(f0, f1); + f1 = _mm256_add_epi16(f2, f3); + f0 = _mm256_sub_epi16(f0, mask3); + f1 = _mm256_sub_epi16(f1, mask3); + + f2 = _mm256_unpacklo_epi32(f0, f1); + f3 = _mm256_unpackhi_epi32(f0, f1); + + f0 = _mm256_permute2x128_si256(f2, f3, 0x20); + f1 = _mm256_permute2x128_si256(f2, f3, 0x31); + + _mm256_store_si256(&r->vec[2 * i + 0], f0); + _mm256_store_si256(&r->vec[2 * i + 1], f1); + } +} + +/* buf 32 bytes longer for cbd3 */ +void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { + cbd3(r, (uint8_t *)buf); +} + +void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber512-90s/avx2/cbd.h b/crypto_kem/kyber512-90s/avx2/cbd.h index 2415f5fd..72086475 100644 --- a/crypto_kem/kyber512-90s/avx2/cbd.h +++ b/crypto_kem/kyber512-90s/avx2/cbd.h @@ -2,8 +2,11 @@ #define PQCLEAN_KYBER51290S_AVX2_CBD_H #include "params.h" #include "poly.h" +#include #include -void PQCLEAN_KYBER51290S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); + +void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); #endif diff --git a/crypto_kem/kyber512-90s/avx2/cdecl.h 
b/crypto_kem/kyber512-90s/avx2/cdecl.h index 79fd29ba..4a2a0a54 100644 --- a/crypto_kem/kyber512-90s/avx2/cdecl.h +++ b/crypto_kem/kyber512-90s/avx2/cdecl.h @@ -1,6 +1,8 @@ #ifndef PQCLEAN_KYBER51290S_AVX2_CDECL_H #define PQCLEAN_KYBER51290S_AVX2_CDECL_H + + #define _16XQ 0 #define _16XQINV 16 #define _16XV 32 @@ -9,9 +11,10 @@ #define _16XMONTSQLO 80 #define _16XMONTSQHI 96 #define _16XMASK 112 -#define _ZETAS_EXP 128 -#define _ZETAS_INV_EXP 528 - +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 /* The C ABI on MacOS exports all symbols with a leading * underscore. This means that any symbols we refer to from @@ -23,4 +26,5 @@ #define _cdecl(s) _##s #define cdecl(s) s + #endif diff --git a/crypto_kem/kyber512-90s/avx2/consts.c b/crypto_kem/kyber512-90s/avx2/consts.c index 05e4122d..77f7ed73 100644 --- a/crypto_kem/kyber512-90s/avx2/consts.c +++ b/crypto_kem/kyber512-90s/avx2/consts.c @@ -1,155 +1,123 @@ +#include "align.h" #include "consts.h" #include "params.h" -#include + #define Q KYBER_Q -#define MONT ((1U << 16) % Q) -#define QINV 62209 // q^-1 mod 2^16 -#define V (((1U << 26) + Q/2)/Q) -#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) -#define FLO (FHI*QINV % 65536) -#define MONTSQHI (MONT*MONT % Q) -#define MONTSQLO (MONTSQHI*QINV % 65536) +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO (-10079) // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI #define MASK 4095 +#define SHIFT 32 - -const qdata_t PQCLEAN_KYBER51290S_AVX2_qdata = {.as_arr = { -#define _16XQ 0 +const qdata_t PQCLEAN_KYBER51290S_AVX2_qdata = {.coeffs = { +//#define _16XQ 0 Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, -#define _16XQINV 16 +//#define _16XQINV 16 QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, -#define _16XV 32 +//#define _16XV 32 V, V, V, 
V, V, V, V, V, V, V, V, V, V, V, V, V, -#define _16XFLO 48 +//#define _16XFLO 48 FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, -#define _16XFHI 64 +//#define _16XFHI 64 FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, -#define _16XMONTSQLO 80 +//#define _16XMONTSQLO 80 MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, -#define _16XMONTSQHI 96 +//#define _16XMONTSQHI 96 MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, -#define _16XMASK 112 +//#define _16XMASK 112 MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, -#define _ZETAS_EXP 128 - 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, - 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, - 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, - 3158, 3158, 3158, 3158, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, - 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, - 573, 573, 2004, 2004, 264, 264, 383, 383, - 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, - 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, - 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, - 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, - 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, - 2226, 555, 2078, 1550, 422, 177, 3038, 1574, - 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, - 11182, 13387, 
51303, 43881, 13131, 60950, 23093, 5493, - 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, - 430, 843, 871, 105, 587, 3094, 2869, 1653, - 778, 3182, 1483, 1119, 644, 349, 329, 3254, - 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, - 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, - 48842, 48842, 48842, 48842, 287, 287, 287, 287, - 287, 287, 287, 287, 202, 202, 202, 202, - 202, 202, 202, 202, 10690, 10690, 10690, 10690, - 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, - 31164, 31164, 31164, 31164, 962, 962, 962, 962, - 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, - 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, - 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, - 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, - 732, 732, 608, 608, 1787, 1787, 411, 411, - 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, - 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, - 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, - 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, - 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, - 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, - 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, - 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, - 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, - 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, - 3193, 1994, 220, 1670, 1799, 794, 2475, 478, - 3021, 991, 1869, 1628, 0, 0, 0, 0, +//#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, -#define _ZETAS_INV_EXP 528 - 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, - 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, - 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, - 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, - 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, - 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, - 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, - 1187, 
874, 2111, 1215, 1465, 2007, 2726, 2512, - 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, - 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, - 951, 247, 1421, 3222, 2499, 271, 90, 853, - 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, - 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, - 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, - 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, - 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, - 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, - 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, - 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, - 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, - 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, - 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, - 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, - 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, - 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, - 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, - 2210, 1846, 147, 2551, 1676, 460, 235, 2742, - 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, - 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, - 45043, 32227, 11478, 335, 156, 2911, 872, 1590, - 602, 777, 2170, 246, 1755, 291, 3152, 2907, - 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, - 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, - 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, - 666, 320, 8, 2813, 1544, 282, 1838, 1293, - 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, - 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, - 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, - 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, - 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, - 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, - 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, - 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, - 171, 171, 171, 171, 12403, 12403, 12403, 12403, - 12403, 
12403, 12403, 12403, 52012, 52012, 52012, 52012, - 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, - 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, - 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, - 60300, 60300, 1932, 1932, 0, 0, 0, 0 +//#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +//#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 
787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +//#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT } }; diff --git a/crypto_kem/kyber512-90s/avx2/consts.h b/crypto_kem/kyber512-90s/avx2/consts.h index 49272952..c09524cd 100644 --- a/crypto_kem/kyber512-90s/avx2/consts.h +++ b/crypto_kem/kyber512-90s/avx2/consts.h @@ -1,19 +1,10 @@ #ifndef PQCLEAN_KYBER51290S_AVX2_CONSTS_H #define PQCLEAN_KYBER51290S_AVX2_CONSTS_H +#include "align.h" #include "cdecl.h" -#include "params.h" 
-#include -#include -#define ALIGNED_UINT16_T(N) \ - union { \ - __m256i as_vec; \ - uint16_t as_arr[(N)]; \ - } - -typedef ALIGNED_UINT16_T(928) qdata_t; - +typedef ALIGNED_INT16(640) qdata_t; extern const qdata_t PQCLEAN_KYBER51290S_AVX2_qdata; #endif diff --git a/crypto_kem/kyber512-90s/avx2/fq.S b/crypto_kem/kyber512-90s/avx2/fq.S index 2655380c..7d47f8b8 100644 --- a/crypto_kem/kyber512-90s/avx2/fq.S +++ b/crypto_kem/kyber512-90s/avx2/fq.S @@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2,10 -red16 3,11 -red16 4,12 -red16 5,13 -red16 6,14 -red16 7,15 -red16 8,10 -red16 9,11 +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 #store vmovdqa %ymm2,(%rdi) @@ -46,49 +46,6 @@ add $256,%rdi call reduce128_avx ret -csubq128_avx: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1,9 -csubq 2,10 -csubq 3,11 -csubq 4,12 -csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx) -.global _cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx) -cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx): -_cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx): -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -call csubq128_avx -add $256,%rdi -call csubq128_avx -ret - tomont128_avx: #load vmovdqa (%rdi),%ymm3 diff --git a/crypto_kem/kyber512-90s/avx2/fq.inc b/crypto_kem/kyber512-90s/avx2/fq.inc index 75df098a..4b7afc31 100644 --- a/crypto_kem/kyber512-90s/avx2/fq.inc +++ b/crypto_kem/kyber512-90s/avx2/fq.inc @@ -1,6 +1,10 @@ -.macro red16 r,x=12 +.macro red16 r,rs=0,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else 
vpsraw $10,%ymm\x,%ymm\x +.endif vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm @@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r -#vpcmpgtw %ymm0,%ymm\r,%ymm\x -#vpand %ymm0,%ymm\x,%ymm\x -#vpsubw %ymm\x,%ymm\r,%ymm\r .endm .macro caddq r,x=12 diff --git a/crypto_kem/kyber512-90s/avx2/indcpa.c b/crypto_kem/kyber512-90s/avx2/indcpa.c index a0756386..92490a97 100644 --- a/crypto_kem/kyber512-90s/avx2/indcpa.c +++ b/crypto_kem/kyber512-90s/avx2/indcpa.c @@ -8,6 +8,7 @@ #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include #include #include @@ -15,11 +16,14 @@ * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* serialized vector of polynomials pk -* and the public seed used to generate the matrix A. +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(). * -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, /************************************************* * Name: pack_sk * -* Description: Serialize the secret key +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(). 
* -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials -* (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(sk, packedsk); } @@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, * * Description: Serialize the ciphertext as concatenation of the * compressed and serialized vector of polynomials b -* and the compressed and serialized polynomial v +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by PQCLEAN_KYBER51290S_AVX2_polyvec_reduce() and PQCLEAN_KYBER51290S_AVX2_poly_reduce(), respectively. 
* * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER51290S_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER51290S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER51290S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit 
integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -165,12 +169,11 @@ static unsigned int rej_uniform(int16_t *r, * - const uint8_t *seed: pointer to input seed * - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) -void PQCLEAN_KYBER51290S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; - ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; +void PQCLEAN_KYBER51290S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint64_t nonce = 0; + ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES) buf; aes256ctr_ctx state; PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, seed, 0); @@ -178,19 +181,24 @@ void PQCLEAN_KYBER51290S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SY for (i = 0; i < KYBER_K; i++) { for (j 
= 0; j < KYBER_K; j++) { if (transposed) { - nonce.orig = (j << 8) | i; + nonce = (j << 8) | i; } else { - nonce.orig = (i << 8) | j; + nonce = (i << 8) | j; } - state.n = _mm_loadl_epi64(&nonce.vec); - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); - ctr = PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); + buflen = REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES; + ctr = PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.coeffs); while (ctr < KYBER_N) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf.coeffs[k] = buf.coeffs[buflen - off + k]; + } + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.coeffs + off, 1, &state); + buflen = off + AES256CTR_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.coeffs, buflen); } PQCLEAN_KYBER51290S_AVX2_poly_nttunpack(&a[i].vec[j]); @@ -212,39 +220,41 @@ void PQCLEAN_KYBER51290S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SY void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; - const uint8_t *publicseed = buf.arr; - const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + uint8_t buf[2 * KYBER_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf.arr, KYBER_SYMBYTES); - hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); + hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; +#define NOISE_NBLOCKS 
((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ + uint64_t nonce = 0; + ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) coins; // +32 bytes as required by PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1 aes256ctr_ctx state; - ALIGN32_ARRAY(uint8_t, 128) coins; - PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, noiseseed, nonce++); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER51290S_AVX2_cbd(&skpv.vec[i], coins.arr); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(&skpv.vec[i], coins.vec); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER51290S_AVX2_cbd(&e.vec[i], coins.arr); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(&e.vec[i], coins.vec); } PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&skpv); + PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(&skpv); PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&e); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER51290S_AVX2_poly_tomont(&pkpv.vec[i]); } @@ -261,70 +271,70 @@ void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBY * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER51290S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + uint8_t seed[KYBER_SYMBYTES]; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; - unpack_pk(&pkpv, seed.arr, pk); + unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER51290S_AVX2_poly_frommsg(&k, m); - gen_at(at, seed.arr); + gen_at(at, seed); - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; +#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ +#define CIPHERTEXTNOISE_NBLOCKS ((KYBER_ETA2*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ + uint64_t nonce = 0; + ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) buf; /* +32 bytes as required by PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1 */ aes256ctr_ctx state; - ALIGN32_ARRAY(uint8_t, 128) buf; - PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); + 
PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, coins, nonce++); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER51290S_AVX2_cbd(&sp.vec[i], buf.arr); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(&sp.vec[i], buf.vec); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER51290S_AVX2_cbd(&ep.vec[i], buf.arr); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(&ep.vec[i], buf.vec); } - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER51290S_AVX2_cbd(&epp, buf.arr); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(&epp, buf.vec); PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } + PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - - PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(&b); PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(&v); - PQCLEAN_KYBER51290S_AVX2_polyvec_add(&bp, &bp, &ep); + 
PQCLEAN_KYBER51290S_AVX2_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER51290S_AVX2_poly_add(&v, &v, &epp); PQCLEAN_KYBER51290S_AVX2_poly_add(&v, &v, &k); - PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(&bp); + PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(&b); PQCLEAN_KYBER51290S_AVX2_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -333,24 +343,24 @@ void PQCLEAN_KYBER51290S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER51290S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&b); + PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER51290S_AVX2_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber512-90s/avx2/invntt.S b/crypto_kem/kyber512-90s/avx2/invntt.S index 0a1f28e1..d49bb282 100644 --- a/crypto_kem/kyber512-90s/avx2/invntt.S +++ b/crypto_kem/kyber512-90s/avx2/invntt.S @@ -2,22 +2,21 @@ .include 
"shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm15,%ymm\rh3 vpmulhw %ymm\zh0,%ymm12,%ymm12 @@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 vpmulhw %ymm\zh1,%ymm14,%ymm14 vpmulhw %ymm\zh1,%ymm15,%ymm15 -#reduce vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.text -invntt_levels0t5_avx: -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 -butterfly 4,5,8,9,6,7,10,11,15,3,1,2 +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 
-level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 -butterfly 4,5,6,7,8,9,10,11,3,3,2,2 +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 +vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 shuffle1 4,5,3,5 shuffle1 6,7,4,7 shuffle1 8,9,6,9 shuffle1 10,11,8,11 -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 -#consts -vmovdqa _16XV*2(%rdx),%ymm1 - -butterfly 3,4,6,8,5,7,9,11,10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 +vmovdqa _16XV*2(%rsi),%ymm1 red16 3 shuffle2 3,4,10,4 @@ -87,26 +110,22 @@ shuffle2 6,8,3,8 shuffle2 5,7,6,7 shuffle2 9,11,5,11 -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 -butterfly 10,3,6,5,4,8,7,11,9,9,2,2 - -red16 10 +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 shuffle4 10,3,9,3 shuffle4 6,5,10,5 shuffle4 4,8,6,8 shuffle4 7,11,4,11 -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 +/* 
level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 -butterfly 9,10,6,4,3,5,8,11,7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,2,2,7,7 red16 9 @@ -115,113 +134,62 @@ shuffle8 6,4,9,4 shuffle8 3,5,6,5 shuffle8 8,11,3,11 -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 +/* level5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 -butterfly 7,9,6,3,10,4,5,11,8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 -red16 7 +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 -ret - -invntt_level6_avx: -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 butterfly 4,5,6,7,8,9,10,11 -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 +.if \off == 0 +red16 4 +.endif -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa 
%ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa %ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm +.text .global cdecl(PQCLEAN_KYBER51290S_AVX2_invntt_avx) .global _cdecl(PQCLEAN_KYBER51290S_AVX2_invntt_avx) cdecl(PQCLEAN_KYBER51290S_AVX2_invntt_avx): _cdecl(PQCLEAN_KYBER51290S_AVX2_invntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_INV_EXP*2,%rsi -call invntt_levels0t5_avx -add $256,%rdi -add $392,%rsi -call invntt_levels0t5_avx -sub $256,%rdi -add $392,%rsi -call invntt_level6_avx + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 ret diff --git a/crypto_kem/kyber512-90s/avx2/kem.c b/crypto_kem/kyber512-90s/avx2/kem.c index 890a6206..a33b2d33 100644 --- a/crypto_kem/kyber512-90s/avx2/kem.c +++ b/crypto_kem/kyber512-90s/avx2/kem.c @@ -1,4 +1,3 @@ -#include "align.h" #include "indcpa.h" #include "kem.h" #include "params.h" @@ -15,13 +14,14 @@ 
* for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -40,36 +40,36 @@ int PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 
2 * KYBER_SYMBYTES) kr; + uint8_t kr[2 * KYBER_SYMBYTES]; - randombytes(buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); /* Don't release system RNG output */ - hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); + hash_h(buf, buf, KYBER_SYMBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER51290S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER51290S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } @@ -80,47 +80,47 @@ int PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER51290S_AVX2_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; + uint8_t kr[2 * KYBER_SYMBYTES]; + ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER51290S_AVX2_indcpa_dec(buf.arr, ct, sk); + PQCLEAN_KYBER51290S_AVX2_indcpa_dec(buf, ct, sk); /* Multitarget countermeasure for coins + contributory KEM */ for (i = 0; i < KYBER_SYMBYTES; i++) { - buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER51290S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER51290S_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); - fail = PQCLEAN_KYBER51290S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); + fail = PQCLEAN_KYBER51290S_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* Overwrite pre-k with z on re-encryption failure */ - PQCLEAN_KYBER51290S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); + PQCLEAN_KYBER51290S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* hash concatenation of pre-k and H(c) to k */ - 
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber512-90s/avx2/ntt.S b/crypto_kem/kyber512-90s/avx2/ntt.S index 51510563..1de466bc 100644 --- a/crypto_kem/kyber512-90s/avx2/ntt.S +++ b/crypto_kem/kyber512-90s/avx2/ntt.S @@ -1,222 +1,191 @@ #include "cdecl.h" .include "shuffle.inc" -.include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 -#mul +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + vpmullw %ymm\zl1,%ymm\rh2,%ymm14 vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 .endm -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce +.macro reduce vpmulhw %ymm0,%ymm12,%ymm12 vpmulhw %ymm0,%ymm13,%ymm13 + vpmulhw %ymm0,%ymm14,%ymm14 vpmulhw %ymm0,%ymm15,%ymm15 +.endm -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 -#update +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 
16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -ntt_level0_avx: -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -ntt_levels1t6_avx: -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11,3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11,7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11,9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11,10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 
-vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11,6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 - -vmovdqa _16XV*2(%rdx),%ymm1 -red16 10,12 -red16 5,13 -red16 9,14 -red16 4,15 -red16 8,2 -red16 3,6 -red16 7,12 -red16 11,13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER51290S_AVX2_ntt_avx) .global _cdecl(PQCLEAN_KYBER51290S_AVX2_ntt_avx) cdecl(PQCLEAN_KYBER51290S_AVX2_ntt_avx): _cdecl(PQCLEAN_KYBER51290S_AVX2_ntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_EXP*2,%rsi -call ntt_level0_avx -add $128,%rdi -call ntt_level0_avx -sub $128,%rdi -add $8,%rsi -call ntt_levels1t6_avx -add $256,%rdi -add $392,%rsi -call ntt_levels1t6_avx + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + ret diff --git a/crypto_kem/kyber512-90s/avx2/ntt.h b/crypto_kem/kyber512-90s/avx2/ntt.h index 9b8698a4..90e23376 100644 --- a/crypto_kem/kyber512-90s/avx2/ntt.h +++ b/crypto_kem/kyber512-90s/avx2/ntt.h @@ -1,24 +1,21 @@ #ifndef PQCLEAN_KYBER51290S_AVX2_NTT_H #define PQCLEAN_KYBER51290S_AVX2_NTT_H -#include "consts.h" + +#include #include -void PQCLEAN_KYBER51290S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); -void PQCLEAN_KYBER51290S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); -void PQCLEAN_KYBER51290S_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); -void 
PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); -void PQCLEAN_KYBER51290S_AVX2_basemul_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); -void PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_basemul_avx(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); -void PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); -void PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512-90s/avx2/params.h b/crypto_kem/kyber512-90s/avx2/params.h index 58767b6c..78712efe 100644 --- a/crypto_kem/kyber512-90s/avx2/params.h +++ b/crypto_kem/kyber512-90s/avx2/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,9 +14,12 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 -#define KYBER_POLYCOMPRESSEDBYTES 96 +#define KYBER_ETA1 3 +#define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) +#define KYBER_ETA2 2 + #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define 
KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) diff --git a/crypto_kem/kyber512-90s/avx2/poly.c b/crypto_kem/kyber512-90s/avx2/poly.c index 93bfdfeb..ce936add 100644 --- a/crypto_kem/kyber512-90s/avx2/poly.c +++ b/crypto_kem/kyber512-90s/avx2/poly.c @@ -12,74 +12,89 @@ /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_poly_compress * -* Description: Compression and subsequent serialization of a polynomial +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the interval [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER51290S_AVX2_poly_reduce(). * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { - unsigned int i, j; - uint8_t t[8]; - - PQCLEAN_KYBER51290S_AVX2_poly_csubq(a); - - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; - } - - r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); - r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); - r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); - r += 3; - } -} - -/************************************************* -* Name: PQCLEAN_KYBER51290S_AVX2_poly_decompress -* -* Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of PQCLEAN_KYBER51290S_AVX2_poly_compress -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array -* (of length KYBER_POLYCOMPRESSEDBYTES bytes) -**************************************************/ -void 
PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *restrict r, - const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t r[128], const poly *restrict a) { unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER51290S_AVX2_qdata.vec[_16XV / 16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 9); + const __m256i mask = _mm256_set1_epi16(15); + const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1); + const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - unsigned int j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 3); - t[2] = (a[0] >> 6) | (a[1] << 2); - t[3] = (a[1] >> 1); - t[4] = (a[1] >> 4); - t[5] = (a[1] >> 7) | (a[2] << 1); - t[6] = (a[2] >> 2); - t[7] = (a[2] >> 5); - a += 3; - - for (j = 0; j < 8; j++) { - r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; - } + for (i = 0; i < KYBER_N / 64; i++) { + f0 = _mm256_load_si256(&a->vec[4 * i + 0]); + f1 = _mm256_load_si256(&a->vec[4 * i + 1]); + f2 = _mm256_load_si256(&a->vec[4 * i + 2]); + f3 = _mm256_load_si256(&a->vec[4 * i + 3]); + f0 = _mm256_mulhi_epi16(f0, v); + f1 = _mm256_mulhi_epi16(f1, v); + f2 = _mm256_mulhi_epi16(f2, v); + f3 = _mm256_mulhi_epi16(f3, v); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f1 = _mm256_mulhrs_epi16(f1, shift1); + f2 = _mm256_mulhrs_epi16(f2, shift1); + f3 = _mm256_mulhrs_epi16(f3, shift1); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + f2 = _mm256_and_si256(f2, mask); + f3 = _mm256_and_si256(f3, mask); + f0 = _mm256_packus_epi16(f0, f1); + f2 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift2); + f2 = _mm256_maddubs_epi16(f2, shift2); + f0 = _mm256_packus_epi16(f0, f2); + f0 = _mm256_permutevar8x32_epi32(f0, permdidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); } } +void PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *restrict r, const uint8_t a[128]) { + unsigned int i; + 
__m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER51290S_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, + 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); + const __m256i mask = _mm256_set1_epi32(0x00F0000F); + const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); + + for (i = 0; i < KYBER_N / 16; i++) { + t = _mm_loadl_epi64((__m128i *)&a[8 * i]); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_and_si256(f, mask); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_poly_tobytes * -* Description: Serialization of a polynomial +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the interval [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER51290S_AVX2_poly_reduce(). The coefficients are ordered as output by +* PQCLEAN_KYBER51290S_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed +* order. 
* * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { - PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* @@ -88,12 +103,12 @@ void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER51290S_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { - PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* @@ -101,11 +116,10 @@ void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLY * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *restrict r, - const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { +void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; const __m256i shift = 
_mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); @@ -134,12 +148,12 @@ void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *restrict r, g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + _mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ + _mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ + _mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ + _mm256_store_si256(&r->vec[8+2*(i)+1],g3) - f = _mm256_load_si256((__m256i *)msg); + f = _mm256_loadu_si256((__m256i *)msg); FROMMSG64(0); FROMMSG64(1); FROMMSG64(2); @@ -149,32 +163,34 @@ void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *restrict r, /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_poly_tomsg * -* Description: Convert polynomial to 32-byte message +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the interval [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER51290S_AVX2_poly_reduce(). 
* * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { unsigned int i; uint32_t small; __m256i f0, f1, g0, g1; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); for (i = 0; i < KYBER_N / 32; i++) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); - f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); - f0 = _mm256_sub_epi16(hqs, f0); - f1 = _mm256_sub_epi16(hqs, f1); + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); g0 = _mm256_srai_epi16(f0, 15); g1 = _mm256_srai_epi16(f1, 15); f0 = _mm256_xor_si256(f0, g0); f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(hhqs, f0); - f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); f0 = _mm256_packs_epi16(f0, f1); small = _mm256_movemask_epi8(f0); - small = ~small; msg[4 * i + 0] = small; msg[4 * i + 1] = small >> 16; msg[4 * i + 2] = small >> 8; @@ -183,21 +199,39 @@ void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], pol } /************************************************* -* Name: PQCLEAN_KYBER51290S_AVX2_poly_getnoise +* Name: PQCLEAN_KYBER51290S_AVX2_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: 
pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; - prf(buf.arr, sizeof(buf.arr), seed, nonce); - PQCLEAN_KYBER51290S_AVX2_cbd(r, buf.arr); +void PQCLEAN_KYBER51290S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1 + prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(r, buf.vec); +} + +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER51290S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; + prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(r, buf.vec); } @@ -205,13 +239,17 @@ void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SY * Name: PQCLEAN_KYBER51290S_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of -* a polynomial in place; -* inputs assumed to be in normal order, output in bitreversed order +* a polynomial in place. 
+* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. * -* Arguments: - uint16_t *r: pointer to in/output polynomial +* Arguments: - poly *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER51290S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* @@ -219,29 +257,35 @@ void PQCLEAN_KYBER51290S_AVX2_poly_ntt(poly *r) { * * Description: Computes inverse of negacyclic number-theoretic transform (NTT) * of a polynomial in place; -* inputs assumed to be in bitreversed order, output in normal order +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. 
* -* Arguments: - uint16_t *a: pointer to in/output polynomial +* Arguments: - poly *a: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(poly *r) { - PQCLEAN_KYBER51290S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } void PQCLEAN_KYBER51290S_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery * -* Description: Multiplication of two polynomials in NTT domain +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* @@ -253,7 +297,7 @@ void PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, co * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_tomont(poly *r) { - PQCLEAN_KYBER51290S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* @@ -265,28 +309,16 @@ void PQCLEAN_KYBER51290S_AVX2_poly_tomont(poly *r) { * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER51290S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); -} - -/************************************************* -* Name: PQCLEAN_KYBER51290S_AVX2_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER51290S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); + PQCLEAN_KYBER51290S_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER51290S_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials. No modular reduction +* is performed. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -294,20 +326,21 @@ void PQCLEAN_KYBER51290S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials. No modular reduction +* is performed. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -315,10 +348,10 @@ void PQCLEAN_KYBER51290S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } diff --git a/crypto_kem/kyber512-90s/avx2/poly.h b/crypto_kem/kyber512-90s/avx2/poly.h index 0638a97c..152e6644 100644 --- a/crypto_kem/kyber512-90s/avx2/poly.h +++ b/crypto_kem/kyber512-90s/avx2/poly.h @@ -1,19 +1,13 @@ #ifndef PQCLEAN_KYBER51290S_AVX2_POLY_H #define PQCLEAN_KYBER51290S_AVX2_POLY_H +#include "align.h" #include "params.h" #include #include -/* - * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial - * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] - */ -typedef union { - __m256i dummy; - int16_t coeffs[KYBER_N]; -} poly; +typedef ALIGNED_INT16(KYBER_N) poly; -void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); @@ -22,7 +16,11 @@ void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLY void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER51290S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER51290S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + + void PQCLEAN_KYBER51290S_AVX2_poly_ntt(poly *r); void PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(poly *r); @@ -31,7 +29,6 @@ void PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, co void PQCLEAN_KYBER51290S_AVX2_poly_tomont(poly *r); void PQCLEAN_KYBER51290S_AVX2_poly_reduce(poly *r); -void PQCLEAN_KYBER51290S_AVX2_poly_csubq(poly *r); void PQCLEAN_KYBER51290S_AVX2_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER51290S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber512-90s/avx2/polyvec.c b/crypto_kem/kyber512-90s/avx2/polyvec.c index 698b8803..d3a369e8 100644 --- a/crypto_kem/kyber512-90s/avx2/polyvec.c +++ b/crypto_kem/kyber512-90s/avx2/polyvec.c @@ -3,8 +3,76 @@ #include "params.h" #include "poly.h" #include "polyvec.h" +#include #include +static void poly_compress10(uint8_t r[320], 
const poly *restrict a) { + size_t i; + uint32_t low; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER51290S_AVX2_qdata.vec[_16XV / 16]); + const __m256i v8 = _mm256_slli_epi16(v, 3); + const __m256i off = _mm256_set1_epi16(15); + const __m256i shift1 = _mm256_set1_epi16(1 << 12); + const __m256i mask = _mm256_set1_epi16(1023); + const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, + -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0); + + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0, v8); + f2 = _mm256_add_epi16(f0, off); + f0 = _mm256_slli_epi16(f0, 3); + f0 = _mm256_mulhi_epi16(f0, v); + f2 = _mm256_sub_epi16(f1, f2); + f1 = _mm256_andnot_si256(f1, f2); + f1 = _mm256_srli_epi16(f1, 15); + f0 = _mm256_sub_epi16(f0, f1); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f0 = _mm256_and_si256(f0, mask); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f0 = _mm256_srli_epi64(f0, 12); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blend_epi16(t0, t1, 0xE0); + _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); + _mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); + r[20 * i + 16] = (uint8_t)low; + r[20 * i + 17] = (uint8_t)(low >> 0x08); + r[20 * i + 18] = (uint8_t)(low >> 0x10); + r[20 * i + 19] = (uint8_t)(low >> 0x18); + } +} + +static void poly_decompress10(poly *restrict r, const uint8_t a[320 + 12]) { + size_t i; + __m256i f; + const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4 * KYBER_Q); + const __m256i shufbidx = _mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, + 6, 5, 5, 4, 4, 3, 3, 2, + 9, 8, 8, 7, 7, 6, 6, 5, + 4, 3, 3, 2, 2, 1, 1, 0); + const __m256i sllvdidx = 
_mm256_set1_epi64x(4); + const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184); + + for (i = 0; i < KYBER_N / 16; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_sllv_epi32(f, sllvdidx); + f = _mm256_srli_epi16(f, 1); + f = _mm256_and_si256(f, mask); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_polyvec_compress * @@ -14,27 +82,11 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], - polyvec *restrict a) { - size_t i, j, k; +void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { + size_t i; - PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(a); - - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; - } - - r[0] = (t[0] >> 0); - r[1] = (t[0] >> 8) | (t[1] << 2); - r[2] = (t[1] >> 6) | (t[2] << 4); - r[3] = (t[2] >> 4) | (t[3] << 6); - r[4] = (t[3] >> 2); - r += 5; - } + poly_compress10(&r[320 * i], &a->vec[i]); } } @@ -44,27 +96,15 @@ void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSED * Description: De-serialize and decompress vector of polynomials; * approximate inverse of PQCLEAN_KYBER51290S_AVX2_polyvec_compress * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void 
PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *restrict r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { - size_t i, j, k; +void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { + size_t i; - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); - t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); - t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); - t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); - a += 5; - - for (k = 0; k < 4; k++) { - r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; - } - } + poly_decompress10(&r->vec[i], &a[320 * i]); } } @@ -90,7 +130,7 @@ void PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], pol * Description: De-serialize vector of polynomials; * inverse of PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials * (of length KYBER_POLYVECBYTES) **************************************************/ @@ -131,29 +171,34 @@ void PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements in a and b in NTT domain, accumulate into r, * and multiply by 2^-16. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { - PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + size_t i; + poly tmp; + + PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER51290S_AVX2_poly_add(r, r, &tmp); + } } /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(polyvec *r) { size_t i; @@ -162,23 +207,6 @@ void PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void 
PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(polyvec *r) { - size_t i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER51290S_AVX2_polyvec_add * diff --git a/crypto_kem/kyber512-90s/avx2/polyvec.h b/crypto_kem/kyber512-90s/avx2/polyvec.h index ab3004e5..404e6e8f 100644 --- a/crypto_kem/kyber512-90s/avx2/polyvec.h +++ b/crypto_kem/kyber512-90s/avx2/polyvec.h @@ -8,9 +8,8 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); +void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); void PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); void PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); @@ -18,12 +17,9 @@ void PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBE void PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER51290S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER51290S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber512-90s/avx2/reduce.h b/crypto_kem/kyber512-90s/avx2/reduce.h index d91d158b..38bcb00e 100644 --- a/crypto_kem/kyber512-90s/avx2/reduce.h +++ 
b/crypto_kem/kyber512-90s/avx2/reduce.h @@ -1,10 +1,9 @@ #ifndef PQCLEAN_KYBER51290S_AVX2_REDUCE_H #define PQCLEAN_KYBER51290S_AVX2_REDUCE_H -#include "consts.h" -#include +#include "params.h" +#include -int16_t PQCLEAN_KYBER51290S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); -int16_t PQCLEAN_KYBER51290S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); -int16_t PQCLEAN_KYBER51290S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); +void PQCLEAN_KYBER51290S_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER51290S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512-90s/avx2/rejsample.c b/crypto_kem/kyber512-90s/avx2/rejsample.c index 8735b944..f5cd0d0e 100644 --- a/crypto_kem/kyber512-90s/avx2/rejsample.c +++ b/crypto_kem/kyber512-90s/avx2/rejsample.c @@ -4,311 +4,68 @@ #include "rejsample.h" #include #include +#include + +//#define BMI -static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { - {-1, -1, -1, -1, -1, -1, -1, -1}, - { 0, -1, -1, -1, -1, -1, -1, -1}, - { 2, -1, -1, -1, -1, -1, -1, -1}, - { 0, 2, -1, -1, -1, -1, -1, -1}, - { 4, -1, -1, -1, -1, -1, -1, -1}, - { 0, 4, -1, -1, -1, -1, -1, -1}, - { 2, 4, -1, -1, -1, -1, -1, -1}, - { 0, 2, 4, -1, -1, -1, -1, -1}, - { 6, -1, -1, -1, -1, -1, -1, -1}, - { 0, 6, -1, -1, -1, -1, -1, -1}, - { 2, 6, -1, -1, -1, -1, -1, -1}, - { 0, 2, 6, -1, -1, -1, -1, -1}, - { 4, 6, -1, -1, -1, -1, -1, -1}, - { 0, 4, 6, -1, -1, -1, -1, -1}, - { 2, 4, 6, -1, -1, -1, -1, -1}, - { 0, 2, 4, 6, -1, -1, -1, -1}, - { 8, -1, -1, -1, -1, -1, -1, -1}, - { 0, 8, -1, -1, -1, -1, -1, -1}, - { 2, 8, -1, -1, -1, -1, -1, -1}, - { 0, 2, 8, -1, -1, -1, -1, -1}, - { 4, 8, -1, -1, -1, -1, -1, -1}, - { 0, 4, 8, -1, -1, -1, -1, -1}, - { 2, 4, 8, -1, -1, -1, -1, -1}, - { 0, 2, 4, 8, -1, -1, -1, -1}, - { 6, 8, -1, -1, -1, -1, -1, -1}, - { 0, 6, 8, -1, -1, -1, 
-1, -1}, - { 2, 6, 8, -1, -1, -1, -1, -1}, - { 0, 2, 6, 8, -1, -1, -1, -1}, - { 4, 6, 8, -1, -1, -1, -1, -1}, - { 0, 4, 6, 8, -1, -1, -1, -1}, - { 2, 4, 6, 8, -1, -1, -1, -1}, - { 0, 2, 4, 6, 8, -1, -1, -1}, - {10, -1, -1, -1, -1, -1, -1, -1}, - { 0, 10, -1, -1, -1, -1, -1, -1}, - { 2, 10, -1, -1, -1, -1, -1, -1}, - { 0, 2, 10, -1, -1, -1, -1, -1}, - { 4, 10, -1, -1, -1, -1, -1, -1}, - { 0, 4, 10, -1, -1, -1, -1, -1}, - { 2, 4, 10, -1, -1, -1, -1, -1}, - { 0, 2, 4, 10, -1, -1, -1, -1}, - { 6, 10, -1, -1, -1, -1, -1, -1}, - { 0, 6, 10, -1, -1, -1, -1, -1}, - { 2, 6, 10, -1, -1, -1, -1, -1}, - { 0, 2, 6, 10, -1, -1, -1, -1}, - { 4, 6, 10, -1, -1, -1, -1, -1}, - { 0, 4, 6, 10, -1, -1, -1, -1}, - { 2, 4, 6, 10, -1, -1, -1, -1}, - { 0, 2, 4, 6, 10, -1, -1, -1}, - { 8, 10, -1, -1, -1, -1, -1, -1}, - { 0, 8, 10, -1, -1, -1, -1, -1}, - { 2, 8, 10, -1, -1, -1, -1, -1}, - { 0, 2, 8, 10, -1, -1, -1, -1}, - { 4, 8, 10, -1, -1, -1, -1, -1}, - { 0, 4, 8, 10, -1, -1, -1, -1}, - { 2, 4, 8, 10, -1, -1, -1, -1}, - { 0, 2, 4, 8, 10, -1, -1, -1}, - { 6, 8, 10, -1, -1, -1, -1, -1}, - { 0, 6, 8, 10, -1, -1, -1, -1}, - { 2, 6, 8, 10, -1, -1, -1, -1}, - { 0, 2, 6, 8, 10, -1, -1, -1}, - { 4, 6, 8, 10, -1, -1, -1, -1}, - { 0, 4, 6, 8, 10, -1, -1, -1}, - { 2, 4, 6, 8, 10, -1, -1, -1}, - { 0, 2, 4, 6, 8, 10, -1, -1}, - {12, -1, -1, -1, -1, -1, -1, -1}, - { 0, 12, -1, -1, -1, -1, -1, -1}, - { 2, 12, -1, -1, -1, -1, -1, -1}, - { 0, 2, 12, -1, -1, -1, -1, -1}, - { 4, 12, -1, -1, -1, -1, -1, -1}, - { 0, 4, 12, -1, -1, -1, -1, -1}, - { 2, 4, 12, -1, -1, -1, -1, -1}, - { 0, 2, 4, 12, -1, -1, -1, -1}, - { 6, 12, -1, -1, -1, -1, -1, -1}, - { 0, 6, 12, -1, -1, -1, -1, -1}, - { 2, 6, 12, -1, -1, -1, -1, -1}, - { 0, 2, 6, 12, -1, -1, -1, -1}, - { 4, 6, 12, -1, -1, -1, -1, -1}, - { 0, 4, 6, 12, -1, -1, -1, -1}, - { 2, 4, 6, 12, -1, -1, -1, -1}, - { 0, 2, 4, 6, 12, -1, -1, -1}, - { 8, 12, -1, -1, -1, -1, -1, -1}, - { 0, 8, 12, -1, -1, -1, -1, -1}, - { 2, 8, 12, -1, -1, -1, -1, -1}, - { 0, 2, 8, 12, -1, 
-1, -1, -1}, - { 4, 8, 12, -1, -1, -1, -1, -1}, - { 0, 4, 8, 12, -1, -1, -1, -1}, - { 2, 4, 8, 12, -1, -1, -1, -1}, - { 0, 2, 4, 8, 12, -1, -1, -1}, - { 6, 8, 12, -1, -1, -1, -1, -1}, - { 0, 6, 8, 12, -1, -1, -1, -1}, - { 2, 6, 8, 12, -1, -1, -1, -1}, - { 0, 2, 6, 8, 12, -1, -1, -1}, - { 4, 6, 8, 12, -1, -1, -1, -1}, - { 0, 4, 6, 8, 12, -1, -1, -1}, - { 2, 4, 6, 8, 12, -1, -1, -1}, - { 0, 2, 4, 6, 8, 12, -1, -1}, - {10, 12, -1, -1, -1, -1, -1, -1}, - { 0, 10, 12, -1, -1, -1, -1, -1}, - { 2, 10, 12, -1, -1, -1, -1, -1}, - { 0, 2, 10, 12, -1, -1, -1, -1}, - { 4, 10, 12, -1, -1, -1, -1, -1}, - { 0, 4, 10, 12, -1, -1, -1, -1}, - { 2, 4, 10, 12, -1, -1, -1, -1}, - { 0, 2, 4, 10, 12, -1, -1, -1}, - { 6, 10, 12, -1, -1, -1, -1, -1}, - { 0, 6, 10, 12, -1, -1, -1, -1}, - { 2, 6, 10, 12, -1, -1, -1, -1}, - { 0, 2, 6, 10, 12, -1, -1, -1}, - { 4, 6, 10, 12, -1, -1, -1, -1}, - { 0, 4, 6, 10, 12, -1, -1, -1}, - { 2, 4, 6, 10, 12, -1, -1, -1}, - { 0, 2, 4, 6, 10, 12, -1, -1}, - { 8, 10, 12, -1, -1, -1, -1, -1}, - { 0, 8, 10, 12, -1, -1, -1, -1}, - { 2, 8, 10, 12, -1, -1, -1, -1}, - { 0, 2, 8, 10, 12, -1, -1, -1}, - { 4, 8, 10, 12, -1, -1, -1, -1}, - { 0, 4, 8, 10, 12, -1, -1, -1}, - { 2, 4, 8, 10, 12, -1, -1, -1}, - { 0, 2, 4, 8, 10, 12, -1, -1}, - { 6, 8, 10, 12, -1, -1, -1, -1}, - { 0, 6, 8, 10, 12, -1, -1, -1}, - { 2, 6, 8, 10, 12, -1, -1, -1}, - { 0, 2, 6, 8, 10, 12, -1, -1}, - { 4, 6, 8, 10, 12, -1, -1, -1}, - { 0, 4, 6, 8, 10, 12, -1, -1}, - { 2, 4, 6, 8, 10, 12, -1, -1}, - { 0, 2, 4, 6, 8, 10, 12, -1}, - {14, -1, -1, -1, -1, -1, -1, -1}, - { 0, 14, -1, -1, -1, -1, -1, -1}, - { 2, 14, -1, -1, -1, -1, -1, -1}, - { 0, 2, 14, -1, -1, -1, -1, -1}, - { 4, 14, -1, -1, -1, -1, -1, -1}, - { 0, 4, 14, -1, -1, -1, -1, -1}, - { 2, 4, 14, -1, -1, -1, -1, -1}, - { 0, 2, 4, 14, -1, -1, -1, -1}, - { 6, 14, -1, -1, -1, -1, -1, -1}, - { 0, 6, 14, -1, -1, -1, -1, -1}, - { 2, 6, 14, -1, -1, -1, -1, -1}, - { 0, 2, 6, 14, -1, -1, -1, -1}, - { 4, 6, 14, -1, -1, -1, -1, -1}, - { 0, 4, 6, 14, -1, 
-1, -1, -1}, - { 2, 4, 6, 14, -1, -1, -1, -1}, - { 0, 2, 4, 6, 14, -1, -1, -1}, - { 8, 14, -1, -1, -1, -1, -1, -1}, - { 0, 8, 14, -1, -1, -1, -1, -1}, - { 2, 8, 14, -1, -1, -1, -1, -1}, - { 0, 2, 8, 14, -1, -1, -1, -1}, - { 4, 8, 14, -1, -1, -1, -1, -1}, - { 0, 4, 8, 14, -1, -1, -1, -1}, - { 2, 4, 8, 14, -1, -1, -1, -1}, - { 0, 2, 4, 8, 14, -1, -1, -1}, - { 6, 8, 14, -1, -1, -1, -1, -1}, - { 0, 6, 8, 14, -1, -1, -1, -1}, - { 2, 6, 8, 14, -1, -1, -1, -1}, - { 0, 2, 6, 8, 14, -1, -1, -1}, - { 4, 6, 8, 14, -1, -1, -1, -1}, - { 0, 4, 6, 8, 14, -1, -1, -1}, - { 2, 4, 6, 8, 14, -1, -1, -1}, - { 0, 2, 4, 6, 8, 14, -1, -1}, - {10, 14, -1, -1, -1, -1, -1, -1}, - { 0, 10, 14, -1, -1, -1, -1, -1}, - { 2, 10, 14, -1, -1, -1, -1, -1}, - { 0, 2, 10, 14, -1, -1, -1, -1}, - { 4, 10, 14, -1, -1, -1, -1, -1}, - { 0, 4, 10, 14, -1, -1, -1, -1}, - { 2, 4, 10, 14, -1, -1, -1, -1}, - { 0, 2, 4, 10, 14, -1, -1, -1}, - { 6, 10, 14, -1, -1, -1, -1, -1}, - { 0, 6, 10, 14, -1, -1, -1, -1}, - { 2, 6, 10, 14, -1, -1, -1, -1}, - { 0, 2, 6, 10, 14, -1, -1, -1}, - { 4, 6, 10, 14, -1, -1, -1, -1}, - { 0, 4, 6, 10, 14, -1, -1, -1}, - { 2, 4, 6, 10, 14, -1, -1, -1}, - { 0, 2, 4, 6, 10, 14, -1, -1}, - { 8, 10, 14, -1, -1, -1, -1, -1}, - { 0, 8, 10, 14, -1, -1, -1, -1}, - { 2, 8, 10, 14, -1, -1, -1, -1}, - { 0, 2, 8, 10, 14, -1, -1, -1}, - { 4, 8, 10, 14, -1, -1, -1, -1}, - { 0, 4, 8, 10, 14, -1, -1, -1}, - { 2, 4, 8, 10, 14, -1, -1, -1}, - { 0, 2, 4, 8, 10, 14, -1, -1}, - { 6, 8, 10, 14, -1, -1, -1, -1}, - { 0, 6, 8, 10, 14, -1, -1, -1}, - { 2, 6, 8, 10, 14, -1, -1, -1}, - { 0, 2, 6, 8, 10, 14, -1, -1}, - { 4, 6, 8, 10, 14, -1, -1, -1}, - { 0, 4, 6, 8, 10, 14, -1, -1}, - { 2, 4, 6, 8, 10, 14, -1, -1}, - { 0, 2, 4, 6, 8, 10, 14, -1}, - {12, 14, -1, -1, -1, -1, -1, -1}, - { 0, 12, 14, -1, -1, -1, -1, -1}, - { 2, 12, 14, -1, -1, -1, -1, -1}, - { 0, 2, 12, 14, -1, -1, -1, -1}, - { 4, 12, 14, -1, -1, -1, -1, -1}, - { 0, 4, 12, 14, -1, -1, -1, -1}, - { 2, 4, 12, 14, -1, -1, -1, -1}, - { 0, 2, 4, 12, 14, 
-1, -1, -1}, - { 6, 12, 14, -1, -1, -1, -1, -1}, - { 0, 6, 12, 14, -1, -1, -1, -1}, - { 2, 6, 12, 14, -1, -1, -1, -1}, - { 0, 2, 6, 12, 14, -1, -1, -1}, - { 4, 6, 12, 14, -1, -1, -1, -1}, - { 0, 4, 6, 12, 14, -1, -1, -1}, - { 2, 4, 6, 12, 14, -1, -1, -1}, - { 0, 2, 4, 6, 12, 14, -1, -1}, - { 8, 12, 14, -1, -1, -1, -1, -1}, - { 0, 8, 12, 14, -1, -1, -1, -1}, - { 2, 8, 12, 14, -1, -1, -1, -1}, - { 0, 2, 8, 12, 14, -1, -1, -1}, - { 4, 8, 12, 14, -1, -1, -1, -1}, - { 0, 4, 8, 12, 14, -1, -1, -1}, - { 2, 4, 8, 12, 14, -1, -1, -1}, - { 0, 2, 4, 8, 12, 14, -1, -1}, - { 6, 8, 12, 14, -1, -1, -1, -1}, - { 0, 6, 8, 12, 14, -1, -1, -1}, - { 2, 6, 8, 12, 14, -1, -1, -1}, - { 0, 2, 6, 8, 12, 14, -1, -1}, - { 4, 6, 8, 12, 14, -1, -1, -1}, - { 0, 4, 6, 8, 12, 14, -1, -1}, - { 2, 4, 6, 8, 12, 14, -1, -1}, - { 0, 2, 4, 6, 8, 12, 14, -1}, - {10, 12, 14, -1, -1, -1, -1, -1}, - { 0, 10, 12, 14, -1, -1, -1, -1}, - { 2, 10, 12, 14, -1, -1, -1, -1}, - { 0, 2, 10, 12, 14, -1, -1, -1}, - { 4, 10, 12, 14, -1, -1, -1, -1}, - { 0, 4, 10, 12, 14, -1, -1, -1}, - { 2, 4, 10, 12, 14, -1, -1, -1}, - { 0, 2, 4, 10, 12, 14, -1, -1}, - { 6, 10, 12, 14, -1, -1, -1, -1}, - { 0, 6, 10, 12, 14, -1, -1, -1}, - { 2, 6, 10, 12, 14, -1, -1, -1}, - { 0, 2, 6, 10, 12, 14, -1, -1}, - { 4, 6, 10, 12, 14, -1, -1, -1}, - { 0, 4, 6, 10, 12, 14, -1, -1}, - { 2, 4, 6, 10, 12, 14, -1, -1}, - { 0, 2, 4, 6, 10, 12, 14, -1}, - { 8, 10, 12, 14, -1, -1, -1, -1}, - { 0, 8, 10, 12, 14, -1, -1, -1}, - { 2, 8, 10, 12, 14, -1, -1, -1}, - { 0, 2, 8, 10, 12, 14, -1, -1}, - { 4, 8, 10, 12, 14, -1, -1, -1}, - { 0, 4, 8, 10, 12, 14, -1, -1}, - { 2, 4, 8, 10, 12, 14, -1, -1}, - { 0, 2, 4, 8, 10, 12, 14, -1}, - { 6, 8, 10, 12, 14, -1, -1, -1}, - { 0, 6, 8, 10, 12, 14, -1, -1}, - { 2, 6, 8, 10, 12, 14, -1, -1}, - { 0, 2, 6, 8, 10, 12, 14, -1}, - { 4, 6, 8, 10, 12, 14, -1, -1}, - { 0, 4, 6, 8, 10, 12, 14, -1}, - { 2, 4, 6, 8, 10, 12, 14, -1}, - { 0, 2, 4, 6, 8, 10, 12, 14} - } -}; #define _mm256_cmpge_epu16(a, b) 
_mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -#define REJ_UNIFORM_BUFLEN 576 -unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *restrict r, - const uint8_t *restrict buf) { +unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; uint32_t good; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); + uint64_t idx0, idx1, idx2, idx3; + const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER51290S_AVX2_qdata.vec[_16XQ / 16]); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER51290S_AVX2_qdata.as_arr[_16XQ]); - const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER51290S_AVX2_qdata.as_arr[_16XV]); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11, 10, 10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); __m256i f0, f1, g0, g1, g2, g3; __m128i f, t, pilo, pihi; - ctr = 0; - for (pos = 0; pos < 2 * KYBER_N; pos += 64) { - f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); - f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); + ctr = pos = 0; + while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { + f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; - g0 = _mm256_cmpge_epu16(bound, f0); - g1 = _mm256_cmpge_epu16(bound, f1); + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = 
_mm256_cmpgt_epi16(bound, f1); g0 = _mm256_packs_epi16(g0, g1); good = _mm256_movemask_epi8(g0); - g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); - g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); - g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); - g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); - //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); - //g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); - - /* Barrett reduction of (still unsigned) values */ - g2 = _mm256_mulhi_epu16(f0, v); - g3 = _mm256_mulhi_epu16(f1, v); - g2 = _mm256_srli_epi16(g2, 10); - g3 = _mm256_srli_epi16(g3, 10); - g2 = _mm256_mullo_epi16(g2, kyberq); - g3 = _mm256_mullo_epi16(g3, kyberq); - f0 = _mm256_sub_epi16(f0, g2); - f1 = _mm256_sub_epi16(f1, g3); + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); g2 = _mm256_add_epi8(g0, ones); g3 = _mm256_add_epi8(g1, ones); @@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *restrict r, ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { - f = _mm_load_si128((__m128i *)&buf[pos]); - t 
= _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { + f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); good = _mm_movemask_epi8(t); - good = _pext_u32(good, 0x5555); - pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); + + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - - /* Barrett reduction */ - t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); - t = _mm_srli_epi16(t, 10); - t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); - f = _mm_sub_epi16(f, t); - f = _mm_shuffle_epi8(f, pilo); _mm_storeu_si128((__m128i *)&r[ctr], f); ctr += _mm_popcnt_u32(good); - pos += 16; } - while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (val1 < KYBER_Q && ctr < KYBER_N) { + r[ctr++] = val1; } } diff --git a/crypto_kem/kyber512-90s/avx2/rejsample.h b/crypto_kem/kyber512-90s/avx2/rejsample.h index 5c981845..972fbe27 100644 --- a/crypto_kem/kyber512-90s/avx2/rejsample.h +++ b/crypto_kem/kyber512-90s/avx2/rejsample.h @@ -1,9 +1,12 @@ #ifndef PQCLEAN_KYBER51290S_AVX2_REJSAMPLE_H #define PQCLEAN_KYBER51290S_AVX2_REJSAMPLE_H #include 
"params.h" +#include "symmetric.h" #include -unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *r, - const unsigned char *buf); +#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) + +unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); #endif diff --git a/crypto_kem/kyber512-90s/avx2/shuffle.S b/crypto_kem/kyber512-90s/avx2/shuffle.S index 01eeea32..4385e096 100644 --- a/crypto_kem/kyber512-90s/avx2/shuffle.S +++ b/crypto_kem/kyber512-90s/avx2/shuffle.S @@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 #csubq csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,1 +csubq 6,13 +csubq 7,13 +csubq 8,13 csubq 9,13 -csubq 10,14 -csubq 11,15 -csubq 12,1 +csubq 10,13 +csubq 11,13 +csubq 12,13 #bitpack vpsllw $12,%ymm6,%ymm4 diff --git a/crypto_kem/kyber512-90s/avx2/shuffle.inc b/crypto_kem/kyber512-90s/avx2/shuffle.inc index d4b092bc..73e9ffe0 100644 --- a/crypto_kem/kyber512-90s/avx2/shuffle.inc +++ b/crypto_kem/kyber512-90s/avx2/shuffle.inc @@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_kem/kyber512-90s/avx2/symmetric.h 
b/crypto_kem/kyber512-90s/avx2/symmetric.h index 93af0198..2cae9245 100644 --- a/crypto_kem/kyber512-90s/avx2/symmetric.h +++ b/crypto_kem/kyber512-90s/avx2/symmetric.h @@ -14,12 +14,10 @@ typedef aes256ctr_ctx xof_state; #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, SEED, X, Y) \ - PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) diff --git a/crypto_kem/kyber512-90s/avx2/verify.c b/crypto_kem/kyber512-90s/avx2/verify.c index bec349b4..2673d8e8 100644 --- a/crypto_kem/kyber512-90s/avx2/verify.c +++ b/crypto_kem/kyber512-90s/avx2/verify.c @@ -8,31 +8,31 @@ * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const unsigned char *a: pointer to first byte array -* const unsigned char *b: pointer to second byte array -* size_t len: length of the byte arrays +* Arguments: const uint8_t *a: pointer to first byte array +* const uint8_t *b: pointer to second byte array +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ int PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; + size_t i; uint64_t r; - __m256i avec, bvec, cvec; + __m256i f, g, h; - cvec = _mm256_setzero_si256(); - for (pos = 0; pos + 32 <= len; pos += 32) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - avec = _mm256_xor_si256(avec, bvec); - cvec = _mm256_or_si256(cvec, avec); + h = _mm256_setzero_si256(); + for (i = 0; i < len / 32; i++) { + f = _mm256_loadu_si256((__m256i *)&a[32 * i]); + g = _mm256_loadu_si256((__m256i *)&b[32 * i]); + f = _mm256_xor_si256(f, g); + h = _mm256_or_si256(h, f); } - r = 1 - _mm256_testz_si256(cvec, cvec); + r = 1 - _mm256_testz_si256(h, h); - if (pos < len) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - cvec = _mm256_cmpeq_epi8(avec, bvec); - r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); + a += 32 * i; + b += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; } r = (-r) >> 63; @@ -47,29 +47,27 @@ int PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t l * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: unsigned char *r: pointer to output byte array +* Arguments: unsigned char *r: pointer to output byte array * const unsigned char *x: pointer to input byte array -* size_t len: Amount of bytes to be copied -* unsigned char b: Condition bit; has to be in {0,1} +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER51290S_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; __m256i xvec, rvec, bvec; - b = -b; - bvec = _mm256_set1_epi8(b); - - for (pos = 0; pos + 32 <= len; pos += 32) { - rvec = _mm256_loadu_si256((__m256i *)&r[pos]); - xvec = _mm256_loadu_si256((__m256i *)&x[pos]); - xvec = _mm256_xor_si256(xvec, rvec); - xvec = _mm256_and_si256(xvec, bvec); - rvec = _mm256_xor_si256(rvec, xvec); - _mm256_storeu_si256((__m256i *)&r[pos], rvec); + bvec = _mm256_set1_epi64x(-(uint64_t)b); + for (i = 0; i < len / 32; i++) { + rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); + xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); + rvec = _mm256_blendv_epi8(rvec, xvec, bvec); + _mm256_storeu_si256((__m256i *)&r[32 * i], rvec); } - while (pos < len) { - r[pos] ^= b & (x[pos] ^ r[pos]); - pos += 1; + r += 32 * i; + x += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r[i] ^= -b & (x[i] ^ r[i]); } } diff --git a/crypto_kem/kyber512-90s/clean/Makefile b/crypto_kem/kyber512-90s/clean/Makefile index 192b7758..41c71729 100644 --- a/crypto_kem/kyber512-90s/clean/Makefile +++ b/crypto_kem/kyber512-90s/clean/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512-90s_clean.a -HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric-aes.h symmetric.h verify.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o 
symmetric-aes.o verify.o +HEADERS=aes256ctr.h api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric.h verify.h +OBJECTS=aes256ctr.o cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake index c2ecfa3f..1b332771 100644 --- a/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber512-90s_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj +OBJECTS=aes256ctr.obj cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber512-90s/clean/aes256ctr.c b/crypto_kem/kyber512-90s/clean/aes256ctr.c new file mode 100644 index 00000000..f6139df3 --- /dev/null +++ b/crypto_kem/kyber512-90s/clean/aes256ctr.c @@ -0,0 +1,564 @@ +#include "aes256ctr.h" +#include +#include +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +static inline uint32_t br_dec32le(const uint8_t *src) { + return (uint32_t)src[0] + | ((uint32_t)src[1] << 8) + | ((uint32_t)src[2] << 16) + | ((uint32_t)src[3] << 24); +} + +static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) { + while (num-- > 0) { + *v ++ = br_dec32le(src); + src += 4; + } +} + +static inline uint32_t br_swap32(uint32_t x) { + x = ((x & (uint32_t)0x00FF00FF) << 8) + | ((x >> 8) & (uint32_t)0x00FF00FF); + return (x << 16) | (x >> 16); +} + +static inline void br_enc32le(uint8_t *dst, uint32_t x) { + dst[0] = (uint8_t)x; + dst[1] = (uint8_t)(x >> 8); + dst[2] = (uint8_t)(x >> 16); + dst[3] = (uint8_t)(x >> 24); +} + +static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) { + while (num-- > 0) { + br_enc32le(dst, *v ++); + dst += 4; + } +} + +static void br_aes_ct64_bitslice_Sbox(uint64_t *q) { + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. 
+ */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +static void br_aes_ct64_ortho(uint64_t *q) { +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \ + (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) { + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= 
(uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) { + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const uint8_t Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t sub_word(uint32_t x) { + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) { + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + int key_len = 32; + + nk = (int)(key_len >> 2); + nkf = (int)((14 + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & 
(uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } +} + +static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) { + unsigned u, v, n; + + n = (14 + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} + +static inline void add_round_key(uint64_t *q, const uint64_t *sk) { + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void shift_rows(uint64_t *q) { + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t rotr32(uint64_t x) { + return (x << 32) | (x >> 32); +} + +static inline void mix_columns(uint64_t *q) { + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 
16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +static void inc4_be(uint32_t *x) { + *x = br_swap32(*x) + 4; + *x = br_swap32(*x); +} + +static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], uint64_t sk_exp[64]) { + uint32_t w[16]; + uint64_t q[8]; + int i; + + memcpy(w, ivw, sizeof(w)); + for (i = 0; i < 4; i++) { + br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + + add_round_key(q, sk_exp); + for (i = 1; i < 14; i++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, sk_exp + (i << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, sk_exp + 112); + + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]); + } + br_range_enc32le(out, w, 16); + + /* Increase counter for next 4 blocks */ + inc4_be(ivw + 3); + inc4_be(ivw + 7); + inc4_be(ivw + 11); + inc4_be(ivw + 15); +} + +static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) { + uint64_t skey[30]; + + br_aes_ct64_keysched(skey, key); + br_aes_ct64_skey_expand(sk_exp, skey); +} + +static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) { + uint32_t ivw[16]; + size_t i; + + br_range_dec32le(ivw, 3, iv); + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + ivw[ 3] = br_swap32(cc); + ivw[ 7] = br_swap32(cc + 1); + ivw[11] = br_swap32(cc + 2); 
+ ivw[15] = br_swap32(cc + 3); + + while (len > 64) { + aes_ctr4x(data, ivw, sk_exp); + data += 64; + len -= 64; + } + if (len > 0) { + uint8_t tmp[64]; + aes_ctr4x(tmp, ivw, sk_exp); + for (i = 0; i < len; i++) { + data[i] = tmp[i]; + } + } +} + +void PQCLEAN_KYBER51290S_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) { + uint64_t sk_exp[120]; + + br_aes_ct64_ctr_init(sk_exp, key); + br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen); +} + +void PQCLEAN_KYBER51290S_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) { + br_aes_ct64_ctr_init(s->sk_exp, key); + + br_range_dec32le(s->ivw, 3, nonce); + memcpy(s->ivw + 4, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 8, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t)); + s->ivw[ 3] = br_swap32(0); + s->ivw[ 7] = br_swap32(1); + s->ivw[11] = br_swap32(2); + s->ivw[15] = br_swap32(3); +} + +void PQCLEAN_KYBER51290S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) { + while (nblocks > 0) { + aes_ctr4x(out, s->ivw, s->sk_exp); + out += 64; + nblocks--; + } +} diff --git a/crypto_kem/kyber512-90s/clean/aes256ctr.h b/crypto_kem/kyber512-90s/clean/aes256ctr.h new file mode 100644 index 00000000..33cc73c8 --- /dev/null +++ b/crypto_kem/kyber512-90s/clean/aes256ctr.h @@ -0,0 +1,28 @@ +#ifndef PQCLEAN_KYBER51290S_CLEAN_AES256CTR_H +#define PQCLEAN_KYBER51290S_CLEAN_AES256CTR_H + +#include +#include + +#define AES256CTR_BLOCKBYTES 64 + + +typedef struct { + uint64_t sk_exp[120]; + uint32_t ivw[16]; +} aes256ctr_ctx; + +void PQCLEAN_KYBER51290S_CLEAN_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_KYBER51290S_CLEAN_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_KYBER51290S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state); + +#endif diff 
--git a/crypto_kem/kyber512-90s/clean/api.h b/crypto_kem/kyber512-90s/clean/api.h index 757a5bc6..20e169c6 100644 --- a/crypto_kem/kyber512-90s/clean/api.h +++ b/crypto_kem/kyber512-90s/clean/api.h @@ -5,7 +5,7 @@ #define PQCLEAN_KYBER51290S_CLEAN_CRYPTO_SECRETKEYBYTES 1632 #define PQCLEAN_KYBER51290S_CLEAN_CRYPTO_PUBLICKEYBYTES 800 -#define PQCLEAN_KYBER51290S_CLEAN_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_KYBER51290S_CLEAN_CRYPTO_CIPHERTEXTBYTES 768 #define PQCLEAN_KYBER51290S_CLEAN_CRYPTO_BYTES 32 #define PQCLEAN_KYBER51290S_CLEAN_CRYPTO_ALGNAME "Kyber512-90s" diff --git a/crypto_kem/kyber512-90s/clean/cbd.c b/crypto_kem/kyber512-90s/clean/cbd.c index 8dd938fe..08cee4a6 100644 --- a/crypto_kem/kyber512-90s/clean/cbd.c +++ b/crypto_kem/kyber512-90s/clean/cbd.c @@ -5,7 +5,7 @@ /************************************************* * Name: load32_littleendian * -* Description: load bytes into a 32-bit integer +* Description: load 4 bytes into a 32-bit integer * in little-endian order * * Arguments: - const uint8_t *x: pointer to input byte array @@ -22,16 +22,36 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { } /************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_cbd +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. 
+* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + + +/************************************************* +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { +static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; int16_t a, b; @@ -48,3 +68,41 @@ void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_ } } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. 
+* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ +static void cbd3(poly *r, const uint8_t buf[3 * KYBER_N / 4]) { + unsigned int i, j; + uint32_t t, d; + int16_t a, b; + + for (i = 0; i < KYBER_N / 4; i++) { + t = load24_littleendian(buf + 3 * i); + d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + a = (d >> (6 * j + 0)) & 0x7; + b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} + +void PQCLEAN_KYBER51290S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + cbd3(r, buf); +} + +void PQCLEAN_KYBER51290S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber512-90s/clean/cbd.h b/crypto_kem/kyber512-90s/clean/cbd.h index 7601ab56..d33d61f9 100644 --- a/crypto_kem/kyber512-90s/clean/cbd.h +++ b/crypto_kem/kyber512-90s/clean/cbd.h @@ -4,6 +4,8 @@ #include "poly.h" #include -void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER51290S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); + +void PQCLEAN_KYBER51290S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber512-90s/clean/indcpa.c b/crypto_kem/kyber512-90s/clean/indcpa.c index bb5712ef..927307fc 100644 --- a/crypto_kem/kyber512-90s/clean/indcpa.c +++ b/crypto_kem/kyber512-90s/clean/indcpa.c @@ -15,8 +15,8 @@ * serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key -* polynomial vector -* - uint8_t *seed: pointer to output seed to generate -* matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ static void unpack_pk(polyvec *pk, @@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, * * Description: Serialize the secret key * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of -* polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key 
**************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, * and the compressed and serialized polynomial v * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER51290S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER51290S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run 
rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= (val >> 12) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T -* is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) // Not static for benchmarking void PQCLEAN_KYBER51290S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -182,12 +173,17 @@ void PQCLEAN_KYBER51290S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S } xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); while (ctr < KYBER_N) { - xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + xof_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); } xof_ctx_release(&state); } @@ -220,10 +216,10 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB gen_a(a, publicseed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); + 
PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); + PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&skpv); @@ -231,7 +227,7 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER51290S_CLEAN_poly_tomont(&pkpv.vec[i]); } @@ -248,16 +244,15 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], @@ -266,7 +261,7 @@ void 
PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], unsigned int i; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; unpack_pk(&pkpv, seed, pk); @@ -274,32 +269,32 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], gen_at(at, seed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); + PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); + PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); } - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(&epp, coins, nonce++); + PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(&b); PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(&v); - PQCLEAN_KYBER51290S_CLEAN_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER51290S_CLEAN_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER51290S_CLEAN_poly_add(&v, &v, &epp); PQCLEAN_KYBER51290S_CLEAN_poly_add(&v, &v, &k); - PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(&bp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(&b); PQCLEAN_KYBER51290S_CLEAN_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -308,24 +303,24 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t 
c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&b); + PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER51290S_CLEAN_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber512-90s/clean/kem.c b/crypto_kem/kyber512-90s/clean/kem.c index 9e27c337..39510438 100644 --- a/crypto_kem/kyber512-90s/clean/kem.c +++ b/crypto_kem/kyber512-90s/clean/kem.c @@ -14,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) 
**************************************************/ -int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,17 +40,17 @@ int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned cha * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; @@ -79,19 +80,19 @@ int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - 
const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; uint8_t buf[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber512-90s/clean/ntt.c b/crypto_kem/kyber512-90s/clean/ntt.c index dcdf4d1e..52b92a2f 100644 --- a/crypto_kem/kyber512-90s/clean/ntt.c +++ b/crypto_kem/kyber512-90s/clean/ntt.c @@ -3,11 +3,11 @@ #include "reduce.h" #include -/* Code to generate PQCLEAN_KYBER51290S_CLEAN_zetas and PQCLEAN_KYBER51290S_CLEAN_zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER51290S_CLEAN_zetas and zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 -static const uint16_t tree[128] = { +static const uint8_t tree[128] = { 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, 4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, 2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, @@ -19,51 +19,41 @@ static const uint16_t tree[128] = { }; void init_ntt() { - unsigned int i, j, k; + unsigned int i; int16_t tmp[128]; tmp[0] = MONT; - for(i = 1; i < 128; ++i) - tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); + for(i=1;i<128;i++) + tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); - for(i = 0; i < 128; ++i) + for(i=0;i<128;i++) { PQCLEAN_KYBER51290S_CLEAN_zetas[i] = tmp[tree[i]]; - - k = 0; - for(i = 64; i >= 1; i >>= 1) - for(j = i; j < 2*i; ++j) - PQCLEAN_KYBER51290S_CLEAN_zetas_inv[k++] = -tmp[128 - 
tree[j]]; - - PQCLEAN_KYBER51290S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + if(PQCLEAN_KYBER51290S_CLEAN_zetas[i] > KYBER_Q/2) + PQCLEAN_KYBER51290S_CLEAN_zetas[i] -= KYBER_Q; + if(PQCLEAN_KYBER51290S_CLEAN_zetas[i] < -KYBER_Q/2) + PQCLEAN_KYBER51290S_CLEAN_zetas[i] += KYBER_Q; + } } - */ const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, - 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, - 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, - 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, - 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, - 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, - 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, - 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, - 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 -}; - -const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, - 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, - 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, - 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, - 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, - 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, - 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, - 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, - 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 -}; + -1044, -758, -359, -1517, 1493, 
1422, 287, 202, + -171, 622, 1577, 182, 962, -1202, -1474, 1468, + 573, -1325, 264, 383, -829, 1458, -1602, -130, + -681, 1017, 732, 608, -1542, 411, -205, -1571, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -1103, 430, 555, 843, -1251, 871, 1550, 105, + 422, 587, 177, -235, -291, -460, 1574, 1653, + -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, + 817, 1097, 603, 610, 1322, -1285, -1465, 384, + -1215, -136, 1218, -1335, -874, 220, -1187, -1659, + -1185, -1530, -1278, 794, -1510, -854, -870, 478, + -108, -308, 996, 991, 958, -1460, 1522, 1628 + }; /************************************************* * Name: fqmul @@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { /************************************************* * Name: PQCLEAN_KYBER51290S_CLEAN_ntt * -* Description: Inplace number-theoretic transform (NTT) in Rq +* Description: Inplace number-theoretic transform (NTT) in Rq. * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t r[256]) { unsigned int len, start, j, k; @@ -96,7 +85,7 @@ void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t r[256]) { for (len = 128; len >= 2; len >>= 1) { for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER51290S_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { + for (j = start; j < start + len; j++) { t = fqmul(zeta, r[j + len]); r[j + len] = r[j] - t; r[j] = r[j] + t; @@ -112,28 +101,28 @@ void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t r[256]) { * multiplication by Montgomery factor 2^16. 
* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t r[256]) { unsigned int start, len, j, k; int16_t t, zeta; + const int16_t f = 1441; // mont^2/128 - k = 0; + k = 127; for (len = 2; len <= 128; len <<= 1) { for (start = 0; start < 256; start = j + len) { - zeta = PQCLEAN_KYBER51290S_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { + zeta = PQCLEAN_KYBER51290S_CLEAN_zetas[k--]; + for (j = start; j < start + len; j++) { t = r[j]; r[j] = PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(t + r[j + len]); - r[j + len] = t - r[j + len]; + r[j + len] = r[j + len] - t; r[j + len] = fqmul(zeta, r[j + len]); } } } - for (j = 0; j < 256; ++j) { - r[j] = fqmul(r[j], PQCLEAN_KYBER51290S_CLEAN_zetas_inv[127]); + for (j = 0; j < 256; j++) { + r[j] = fqmul(r[j], f); } } @@ -143,19 +132,15 @@ void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t r[256]) { * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta) { +void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); r[1] += 
fqmul(a[1], b[0]); } diff --git a/crypto_kem/kyber512-90s/clean/ntt.h b/crypto_kem/kyber512-90s/clean/ntt.h index bcb98594..263f42bd 100644 --- a/crypto_kem/kyber512-90s/clean/ntt.h +++ b/crypto_kem/kyber512-90s/clean/ntt.h @@ -5,15 +5,10 @@ extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas_inv[128]; - void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t r[256]); void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t r[256]); -void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta); +void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber512-90s/clean/params.h b/crypto_kem/kyber512-90s/clean/params.h index ea083a4a..07c3ed78 100644 --- a/crypto_kem/kyber512-90s/clean/params.h +++ b/crypto_kem/kyber512-90s/clean/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,20 +14,20 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 -#define KYBER_POLYCOMPRESSEDBYTES 96 +#define KYBER_ETA1 3 +#define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) -#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_ETA2 2 + +#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ - + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) /* 32 bytes of additional space to save H(pk) */ -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ - + KYBER_INDCPA_PUBLICKEYBYTES 
\ - + 2*KYBER_SYMBYTES) -#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) +#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) #endif diff --git a/crypto_kem/kyber512-90s/clean/poly.c b/crypto_kem/kyber512-90s/clean/poly.c index 41f54df9..737e102b 100644 --- a/crypto_kem/kyber512-90s/clean/poly.c +++ b/crypto_kem/kyber512-90s/clean/poly.c @@ -13,23 +13,26 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { +void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { size_t i, j; + int16_t u; uint8_t t[8]; - PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; + // map to positive standard representatives + u = a->coeffs[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint16_t)u << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } - r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); - r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); - r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); - r += 3; + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; } } @@ -39,29 +42,17 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES * Description: De-serialization and subsequent decompression of a polynomial; * approximate inverse of PQCLEAN_KYBER51290S_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte 
array * (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { size_t i; - size_t j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 3); - t[2] = (a[0] >> 6) | (a[1] << 2); - t[3] = (a[1] >> 1); - t[4] = (a[1] >> 4); - t[5] = (a[1] >> 7) | (a[2] << 1); - t[6] = (a[2] >> 2); - t[7] = (a[2] >> 5); - a += 3; - - for (j = 0; j < 8; j++) { - r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; - } + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; + a += 1; } } @@ -72,20 +63,21 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_PO * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { +void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { size_t i; uint16_t t0, t1; - PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 2; i++) { - t0 = a->coeffs[2 * i]; + // map to positive standard representatives + t0 = a->coeffs[2 * i]; + t0 += ((int16_t)t0 >> 15) & KYBER_Q; t1 = a->coeffs[2 * i + 1]; - r[3 * i + 0] = (uint8_t)(t0 >> 0); - r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + t1 += ((int16_t)t1 >> 15) & KYBER_Q; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } @@ -95,7 +87,7 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) * Description: De-serialization of a polynomial; * 
inverse of PQCLEAN_KYBER51290S_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ @@ -112,7 +104,7 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POL * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { @@ -133,41 +125,60 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_IND * Description: Convert polynomial to 32-byte message * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { +void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { size_t i, j; uint16_t t; - PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { msg[i] = 0; for (j = 0; j < 8; j++) { - t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + t = a->coeffs[8 * i + j]; + t += ((int16_t)t >> 15) & KYBER_Q; + t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; msg[i] |= t << j; } } } /************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_poly_getnoise +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter 
KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; +void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; prf(buf, sizeof(buf), seed, nonce); - PQCLEAN_KYBER51290S_CLEAN_cbd(r, buf); + PQCLEAN_KYBER51290S_CLEAN_poly_cbd_eta1(r, buf); } +/************************************************* +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; + prf(buf, sizeof(buf), seed, nonce); + PQCLEAN_KYBER51290S_CLEAN_poly_cbd_eta2(r, buf); +} + + /************************************************* * Name: PQCLEAN_KYBER51290S_CLEAN_poly_ntt * @@ -200,7 +211,7 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(poly *r) { * * Description: Multiplication of two polynomials in NTT domain * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial 
**************************************************/ @@ -208,8 +219,7 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, c size_t i; for (i = 0; i < KYBER_N / 4; i++) { PQCLEAN_KYBER51290S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER51290S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], - -PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER51290S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); } } @@ -244,28 +254,12 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_reduce(poly *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_csubq(poly *r) { - size_t i; - for (i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER51290S_CLEAN_csubq(r->coeffs[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER51290S_CLEAN_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials; no modular reduction is performed * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -279,7 +273,7 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { /************************************************* * Name: PQCLEAN_KYBER51290S_CLEAN_poly_sub * -* Description: Subtract two polynomials +* Description: 
Subtract two polynomials; no modular reduction is performed * * Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial diff --git a/crypto_kem/kyber512-90s/clean/poly.h b/crypto_kem/kyber512-90s/clean/poly.h index 4f9a0aef..256bea74 100644 --- a/crypto_kem/kyber512-90s/clean/poly.h +++ b/crypto_kem/kyber512-90s/clean/poly.h @@ -11,16 +11,18 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); -void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); +void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); void PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); -void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); +void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); -void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER51290S_CLEAN_poly_ntt(poly *r); void PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(poly *r); @@ -28,7 +30,6 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, c void PQCLEAN_KYBER51290S_CLEAN_poly_tomont(poly *r); void PQCLEAN_KYBER51290S_CLEAN_poly_reduce(poly *r); -void PQCLEAN_KYBER51290S_CLEAN_poly_csubq(poly *r); void 
PQCLEAN_KYBER51290S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER51290S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber512-90s/clean/polyvec.c b/crypto_kem/kyber512-90s/clean/polyvec.c index b6414f7a..26a65e50 100644 --- a/crypto_kem/kyber512-90s/clean/polyvec.c +++ b/crypto_kem/kyber512-90s/clean/polyvec.c @@ -10,19 +10,18 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { unsigned int i, j, k; - PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(a); - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; + t[k] = a->vec[i].coeffs[4 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; } r[0] = (uint8_t)(t[0] >> 0); @@ -45,8 +44,7 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSE * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; uint16_t t[4]; @@ -72,9 +70,9 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, * * Arguments: - uint8_t *r: pointer to output byte array * (needs space 
for KYBER_POLYVECBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { unsigned int i; for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); @@ -128,18 +126,16 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements of a and b in NTT domain, accumulate into r, * and multiply by 2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { unsigned int i; poly t; @@ -156,10 +152,10 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, * Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output 
polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(polyvec *r) { unsigned int i; @@ -168,29 +164,12 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(polyvec *r) { - unsigned int i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_add * * Description: Add vectors of polynomials * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ diff --git a/crypto_kem/kyber512-90s/clean/polyvec.h b/crypto_kem/kyber512-90s/clean/polyvec.h index dd78deb7..b2d1a11b 100644 --- a/crypto_kem/kyber512-90s/clean/polyvec.h +++ b/crypto_kem/kyber512-90s/clean/polyvec.h @@ -8,22 +8,18 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); 
-void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); void PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); void PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER51290S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber512-90s/clean/reduce.c b/crypto_kem/kyber512-90s/clean/reduce.c index cbd9d790..ac98f020 100644 --- a/crypto_kem/kyber512-90s/clean/reduce.c +++ b/crypto_kem/kyber512-90s/clean/reduce.c @@ -6,8 +6,7 @@ * Name: PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes -* 16-bit integer congruent to a * R^-1 mod q, -* where R=2^16 +* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 * * Arguments: - int32_t a: input integer to be reduced; * has to be in {-q2^15,...,q2^15-1} @@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce(int32_t a) { * Name: PQCLEAN_KYBER51290S_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes -* 16-bit integer congruent to a mod q in {0,...,q} +* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} * * Arguments: - int16_t a: input integer to be reduced * -* Returns: integer in {0,...,q} congruent to a modulo q. +* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(int16_t a) { int16_t t; const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = (int32_t)v * a >> 26; + t = ((int32_t)v * a + (1 << 25)) >> 26; t *= KYBER_Q; return a - t; } - -/************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_csubq -* -* Description: Conditionallly subtract q -* -* Arguments: - int16_t x: input integer -* -* Returns: a - q if a >= q, else a -**************************************************/ -int16_t PQCLEAN_KYBER51290S_CLEAN_csubq(int16_t a) { - a -= KYBER_Q; - a += (a >> 15) & KYBER_Q; - return a; -} diff --git a/crypto_kem/kyber512-90s/clean/reduce.h b/crypto_kem/kyber512-90s/clean/reduce.h index bf1b009f..b2b0790d 100644 --- a/crypto_kem/kyber512-90s/clean/reduce.h +++ b/crypto_kem/kyber512-90s/clean/reduce.h @@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce(int32_t a); int16_t PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(int16_t a); -int16_t PQCLEAN_KYBER51290S_CLEAN_csubq(int16_t a); - #endif diff --git a/crypto_kem/kyber512-90s/clean/symmetric-aes.c b/crypto_kem/kyber512-90s/clean/symmetric-aes.c index a62e233e..7a5db783 100644 --- a/crypto_kem/kyber512-90s/clean/symmetric-aes.c +++ b/crypto_kem/kyber512-90s/clean/symmetric-aes.c @@ -1,100 +1,18 @@ -#include "aes.h" +#include "aes256ctr.h" #include "params.h" #include "symmetric.h" #include #include -#include -static inline void br_enc32be(unsigned char *dst, uint32_t x) { - dst[3] = (unsigned char)x; - dst[2] = (unsigned char)(x >> 8); - dst[1] = (unsigned char)(x >> 16); - dst[0] = (unsigned char)(x >> 24); +void PQCLEAN_KYBER51290S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y) { + uint8_t expnonce[12] = {0}; + expnonce[0] = x; + expnonce[1] = y; + PQCLEAN_KYBER51290S_CLEAN_aes256ctr_init(state, seed, expnonce); } -static void aes256_ctr_xof(unsigned char *out, size_t outlen, const 
unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { - uint8_t ivw[16]; - uint8_t buf[AES_BLOCKBYTES]; - size_t i = 0; - - memcpy(ivw, iv, AESCTR_NONCEBYTES); - br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); - - while (outlen > AES_BLOCKBYTES) { - aes256_ecb(out, ivw, 1, ctx); - br_enc32be(ivw + AESCTR_NONCEBYTES, ++ctr); - out += AES_BLOCKBYTES; - outlen -= AES_BLOCKBYTES; - } - if (outlen > 0) { - aes256_ecb(buf, ivw, 1, ctx); - for (i = 0; i < outlen; i++) { - out[i] = buf[i]; - } - } -} - -/************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_aes256_prf -* -* Description: AES256 stream generation in CTR mode using 32-bit counter, -* nonce is zero-padded to 12 bytes, counter starts at zero -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: length of requested output in bytes -* - const uint8_t *key: pointer to 32-byte key -* - uint8_t nonce: 1-byte nonce (will be zero-padded to 12 bytes) -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t iv[12]; - for (int i = 1; i < 12; i++) { - iv[i] = 0; - } - iv[0] = nonce; - - aes256ctx ctx; - aes256_ctr_keyexp(&ctx, key); - aes256_ctr(output, outlen, iv, &ctx); - aes256_ctx_release(&ctx); -} - -/************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_aes256xof_absorb -* -* Description: AES256 CTR used as a replacement for a XOF; this function -* "absorbs" a 32-byte key and two additional bytes that are zero-padded -* to a 12-byte nonce -* -* Arguments: - aes256xof_ctx *s: pointer to state to "absorb" key and IV into -* - const uint8_t *key: pointer to 32-byte key -* - uint8_t x: first additional byte to "absorb" -* - uint8_t y: second additional byte to "absorb" -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t 
y) { - aes256_ecb_keyexp(&s->sk_exp, key); - for (int i = 2; i < 12; i++) { - s->iv[i] = 0; - } - s->iv[0] = x; - s->iv[1] = y; - s->ctr = 0; -} - -/************************************************* -* Name: PQCLEAN_KYBER51290S_CLEAN_aes256xof_squeezeblocks -* -* Description: AES256 CTR used as a replacement for a XOF; this function -* generates 4 blocks out AES256-CTR output -* -* Arguments: - uint8_t *out: pointer to output -* - size_t nblocks: number of reqested 64-byte output blocks -* - aes256xof_ctx *s: AES "state", i.e. expanded key and IV -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s) { - aes256_ctr_xof(out, nblocks * 64, s->iv, s->ctr, &s->sk_exp); - s->ctr += (uint32_t) (4 * nblocks); -} - -void PQCLEAN_KYBER51290S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { - aes256_ctx_release(&s->sk_exp); +void PQCLEAN_KYBER51290S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce) { + uint8_t expnonce[12] = {0}; + expnonce[0] = nonce; + PQCLEAN_KYBER51290S_CLEAN_aes256ctr_prf(out, outlen, key, expnonce); } diff --git a/crypto_kem/kyber512-90s/clean/symmetric-aes.h b/crypto_kem/kyber512-90s/clean/symmetric-aes.h deleted file mode 100644 index 57550e19..00000000 --- a/crypto_kem/kyber512-90s/clean/symmetric-aes.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef PQCLEAN_KYBER51290S_CLEAN_SYMMETRIC_AES_H -#define PQCLEAN_KYBER51290S_CLEAN_SYMMETRIC_AES_H -#include "aes.h" -#include -#include - - -typedef struct { - aes256ctx sk_exp; - uint8_t iv[12]; - uint32_t ctr; -} aes256xof_ctx; - -void PQCLEAN_KYBER51290S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); -void PQCLEAN_KYBER51290S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y); -void PQCLEAN_KYBER51290S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s); -void 
PQCLEAN_KYBER51290S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s); - -#endif diff --git a/crypto_kem/kyber512-90s/clean/symmetric.h b/crypto_kem/kyber512-90s/clean/symmetric.h index 5e189c16..f84feb4e 100644 --- a/crypto_kem/kyber512-90s/clean/symmetric.h +++ b/crypto_kem/kyber512-90s/clean/symmetric.h @@ -1,23 +1,28 @@ #ifndef PQCLEAN_KYBER51290S_CLEAN_SYMMETRIC_H #define PQCLEAN_KYBER51290S_CLEAN_SYMMETRIC_H +#include "aes256ctr.h" #include "params.h" #include "sha2.h" -#include "symmetric-aes.h" #include #include -typedef aes256xof_ctx xof_state; -#define XOF_BLOCKBYTES 64 +typedef aes256ctr_ctx xof_state; + +void PQCLEAN_KYBER51290S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y); + +void PQCLEAN_KYBER51290S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce); + +#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER51290S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER51290S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define xof_ctx_release(STATE) PQCLEAN_KYBER51290S_CLEAN_aes256xof_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER51290S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER51290S_CLEAN_kyber_aes256xof_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER51290S_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER51290S_CLEAN_kyber_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) diff --git a/crypto_kem/kyber512/META.yml b/crypto_kem/kyber512/META.yml index b40fcd66..e82196d7 100644 --- 
a/crypto_kem/kyber512/META.yml +++ b/crypto_kem/kyber512/META.yml @@ -3,10 +3,10 @@ type: kem claimed-nist-level: 1 claimed-security: IND-CCA2 length-public-key: 800 -length-ciphertext: 736 +length-ciphertext: 768 length-secret-key: 1632 length-shared-secret: 32 -nistkat-sha256: bdd9b46001de4595a4f185aec8d5d04d217705e65e10711c99fa3f0ac3d61c21 +nistkat-sha256: bb0481d3325d828817900b709d23917cefbc10026fc857f098979451f67bb0ca principal-submitters: - Peter Schwabe auxiliary-submitters: @@ -21,9 +21,9 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/kyber512/avx2/align.h b/crypto_kem/kyber512/avx2/align.h index fb553ac5..c655fc1b 100644 --- a/crypto_kem/kyber512/avx2/align.h +++ b/crypto_kem/kyber512/avx2/align.h @@ -2,22 +2,18 @@ #define PQCLEAN_KYBER512_AVX2_ALIGN_H #include +#include -#define ALIGN16_TYPE(t) \ - union { \ - __m128i vec; \ - t orig; \ +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[(N)]; \ + __m256i vec[((N)+31)/32]; \ } -#define ALIGN32_ARRAY(t, s) \ - union { \ - __m256i vec; \ - t arr[(s)]; \ +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[(N)]; \ + __m256i vec[((N)+15)/16]; \ } -#define ALIGN32_ARRAY_2D(t, n, m) \ - union { \ - __m256i vec; \ - t arr[(n)][(m)]; \ - 
} #endif diff --git a/crypto_kem/kyber512/avx2/api.h b/crypto_kem/kyber512/avx2/api.h index 07cc6289..0d9e2b5e 100644 --- a/crypto_kem/kyber512/avx2/api.h +++ b/crypto_kem/kyber512/avx2/api.h @@ -5,7 +5,7 @@ #define PQCLEAN_KYBER512_AVX2_CRYPTO_SECRETKEYBYTES 1632 #define PQCLEAN_KYBER512_AVX2_CRYPTO_PUBLICKEYBYTES 800 -#define PQCLEAN_KYBER512_AVX2_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_KYBER512_AVX2_CRYPTO_CIPHERTEXTBYTES 768 #define PQCLEAN_KYBER512_AVX2_CRYPTO_BYTES 32 #define PQCLEAN_KYBER512_AVX2_CRYPTO_ALGNAME "Kyber512" diff --git a/crypto_kem/kyber512/avx2/basemul.S b/crypto_kem/kyber512/avx2/basemul.S index c8c86d0a..f7454686 100644 --- a/crypto_kem/kyber512/avx2/basemul.S +++ b/crypto_kem/kyber512/avx2/basemul.S @@ -1,216 +1,107 @@ #include "cdecl.h" -#include "params.h" -.macro schoolbook off,sign -#load -vmovdqa \off+32(%rsi),%ymm7 # b -vmovdqa \off+32(%rdx),%ymm8 # d -vmovdqa \off(%rsi),%ymm9 # a -vmovdqa \off(%rdx),%ymm10 # c +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 -#mul -vpmullw %ymm7,%ymm8,%ymm11 # bd.lo -vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi -vpmullw %ymm7,%ymm10,%ymm13 # bc.lo -vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi -vpmullw %ymm9,%ymm8,%ymm14 # ad.lo -vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi -vpmullw %ymm9,%ymm10,%ymm15 # ac.lo -vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo -#reduce -vpmullw %ymm1,%ymm11,%ymm11 -vpmulhw %ymm0,%ymm11,%ymm11 -vpsubw %ymm11,%ymm12,%ymm11 # bd +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 -#mul -vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo -vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi 
+vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi -#unpack -vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 -vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 -vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 -vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 -vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 -vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 -vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 -vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 -#add -.ifeq \sign -vpaddd %ymm14,%ymm15,%ymm14 # x0 -vpaddd %ymm9,%ymm10,%ymm9 # x1 -.else -vpsubd %ymm15,%ymm14,%ymm14 # x0 -vpsubd %ymm10,%ymm9,%ymm9 # x1 -.endif -vpaddd %ymm12,%ymm13,%ymm12 # y0 -vpaddd %ymm7,%ymm8,%ymm7 # y1 -.endm +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi -.macro red a0,a1,b0,b1,x,y,z -#pack -vpxor %ymm\x,%ymm\x,%ymm\x -vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z -vpsrld $16,%ymm\a0,%ymm\a0 -vpsrld $16,%ymm\a1,%ymm\a1 -vpackusdw %ymm\z,%ymm\y,%ymm\z -vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 -vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x -vpsrld $16,%ymm\b0,%ymm\b0 -vpsrld $16,%ymm\b1,%ymm\b1 -vpackusdw %ymm\x,%ymm\y,%ymm\y -vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 +vmovdqa %ymm13,(%rsp) -#reduce -vpmullw %ymm1,%ymm\z,%ymm\z -vpmullw %ymm1,%ymm\y,%ymm\y -vpmulhw %ymm0,%ymm\z,%ymm\z -vpmulhw %ymm0,%ymm\y,%ymm\y -vpsubw %ymm\z,%ymm\a0,%ymm\a0 -vpsubw %ymm\y,%ymm\b0,%ymm\b0 +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 
+vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 +vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -basemul64_acc_avx: -poly0.0: -schoolbook 0,0 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.0: -schoolbook 512,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm5,32(%rdi) - -poly0.1: -schoolbook 64,1 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.1: -schoolbook 576,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm5,96(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER512_AVX2_basemul_acc_avx) -.global _cdecl(PQCLEAN_KYBER512_AVX2_basemul_acc_avx) -cdecl(PQCLEAN_KYBER512_AVX2_basemul_acc_avx): -_cdecl(PQCLEAN_KYBER512_AVX2_basemul_acc_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 - -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_acc_avx - -vmovdqu 
(_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -ret - -basemul64_avx: -schoolbook 0,0 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,(%rdi) -vmovdqa %ymm12,32(%rdi) - -schoolbook 64,1 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,64(%rdi) -vmovdqa %ymm12,96(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER512_AVX2_basemul_avx) .global _cdecl(PQCLEAN_KYBER512_AVX2_basemul_avx) cdecl(PQCLEAN_KYBER512_AVX2_basemul_avx): _cdecl(PQCLEAN_KYBER512_AVX2_basemul_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_avx +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 1 -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $192*2,%r9 +schoolbook 2 -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 3 +mov %r8,%rsp ret diff --git a/crypto_kem/kyber512/avx2/cbd.c b/crypto_kem/kyber512/avx2/cbd.c index a9d7ae86..80516de1 100644 --- a/crypto_kem/kyber512/avx2/cbd.c +++ b/crypto_kem/kyber512/avx2/cbd.c @@ -4,66 +4,125 @@ #include /************************************************* -* Name: PQCLEAN_KYBER512_AVX2_cbd +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output 
polynomial -* - const unsigned char *buf: pointer to input byte array +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array **************************************************/ -void PQCLEAN_KYBER512_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { +static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { unsigned int i; - __m256i vec0, vec1, vec2, vec3, tmp; + __m256i f0, f1, f2, f3; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); for (i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); + f0 = _mm256_load_si256(&buf[i]); - vec1 = _mm256_srli_epi32(vec0, 1); - vec0 = _mm256_and_si256(mask55, vec0); - vec1 = _mm256_and_si256(mask55, vec1); - vec0 = _mm256_add_epi32(vec0, vec1); + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); - vec1 = _mm256_srli_epi32(vec0, 2); - vec0 = _mm256_and_si256(mask33, vec0); - vec1 = _mm256_and_si256(mask33, vec1); + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); - vec2 = _mm256_srli_epi32(vec0, 4); - vec3 = _mm256_srli_epi32(vec1, 4); - vec0 = _mm256_and_si256(mask03, vec0); - vec1 = _mm256_and_si256(mask03, vec1); - vec2 = _mm256_and_si256(mask03, vec2); - vec3 = _mm256_and_si256(mask03, vec3); + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); - vec1 = _mm256_sub_epi8(vec0, vec1); - vec3 = _mm256_sub_epi8(vec2, vec3); + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); - vec0 = 
_mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); - vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); - vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); - vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); - tmp = _mm256_unpacklo_epi16(vec0, vec2); - vec2 = _mm256_unpackhi_epi16(vec0, vec2); - vec0 = tmp; - tmp = _mm256_unpacklo_epi16(vec1, vec3); - vec3 = _mm256_unpackhi_epi16(vec1, vec3); - vec1 = tmp; - - tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); - vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); - vec0 = tmp; - tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); - vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); - vec1 = tmp; - - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); + _mm256_store_si256(&r->vec[4 * i + 0], f0); + _mm256_store_si256(&r->vec[4 * i + 1], f2); + _mm256_store_si256(&r->vec[4 * i + 2], f1); + _mm256_store_si256(&r->vec[4 * i + 3], f3); } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3 +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array +**************************************************/ +static void cbd3(poly *restrict r, const uint8_t buf[3 * KYBER_N / 4 + 8]) { + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i mask249 = 
_mm256_set1_epi32(0x249249); + const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB); + const __m256i mask07 = _mm256_set1_epi32(7); + const __m256i mask70 = _mm256_set1_epi32(7 << 16); + const __m256i mask3 = _mm256_set1_epi16(3); + const __m256i shufbidx = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_loadu_si256((__m256i *)&buf[24 * i]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + + f1 = _mm256_srli_epi32(f0, 1); + f2 = _mm256_srli_epi32(f0, 2); + f0 = _mm256_and_si256(mask249, f0); + f1 = _mm256_and_si256(mask249, f1); + f2 = _mm256_and_si256(mask249, f2); + f0 = _mm256_add_epi32(f0, f1); + f0 = _mm256_add_epi32(f0, f2); + + f1 = _mm256_srli_epi32(f0, 3); + f0 = _mm256_add_epi32(f0, mask6DB); + f0 = _mm256_sub_epi32(f0, f1); + + f1 = _mm256_slli_epi32(f0, 10); + f2 = _mm256_srli_epi32(f0, 12); + f3 = _mm256_srli_epi32(f0, 2); + f0 = _mm256_and_si256(f0, mask07); + f1 = _mm256_and_si256(f1, mask70); + f2 = _mm256_and_si256(f2, mask07); + f3 = _mm256_and_si256(f3, mask70); + f0 = _mm256_add_epi16(f0, f1); + f1 = _mm256_add_epi16(f2, f3); + f0 = _mm256_sub_epi16(f0, mask3); + f1 = _mm256_sub_epi16(f1, mask3); + + f2 = _mm256_unpacklo_epi32(f0, f1); + f3 = _mm256_unpackhi_epi32(f0, f1); + + f0 = _mm256_permute2x128_si256(f2, f3, 0x20); + f1 = _mm256_permute2x128_si256(f2, f3, 0x31); + + _mm256_store_si256(&r->vec[2 * i + 0], f0); + _mm256_store_si256(&r->vec[2 * i + 1], f1); + } +} + +/* buf 32 bytes longer for cbd3 */ +void PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { + cbd3(r, (uint8_t *)buf); +} + +void PQCLEAN_KYBER512_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber512/avx2/cbd.h b/crypto_kem/kyber512/avx2/cbd.h index af290f88..7bf8e687 100644 --- 
a/crypto_kem/kyber512/avx2/cbd.h +++ b/crypto_kem/kyber512/avx2/cbd.h @@ -2,8 +2,11 @@ #define PQCLEAN_KYBER512_AVX2_CBD_H #include "params.h" #include "poly.h" +#include #include -void PQCLEAN_KYBER512_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); + +void PQCLEAN_KYBER512_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); #endif diff --git a/crypto_kem/kyber512/avx2/cdecl.h b/crypto_kem/kyber512/avx2/cdecl.h index 34f4ab57..410c8cf4 100644 --- a/crypto_kem/kyber512/avx2/cdecl.h +++ b/crypto_kem/kyber512/avx2/cdecl.h @@ -1,6 +1,8 @@ #ifndef PQCLEAN_KYBER512_AVX2_CDECL_H #define PQCLEAN_KYBER512_AVX2_CDECL_H + + #define _16XQ 0 #define _16XQINV 16 #define _16XV 32 @@ -9,9 +11,10 @@ #define _16XMONTSQLO 80 #define _16XMONTSQHI 96 #define _16XMASK 112 -#define _ZETAS_EXP 128 -#define _ZETAS_INV_EXP 528 - +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 /* The C ABI on MacOS exports all symbols with a leading * underscore. 
This means that any symbols we refer to from @@ -23,4 +26,5 @@ #define _cdecl(s) _##s #define cdecl(s) s + #endif diff --git a/crypto_kem/kyber512/avx2/consts.c b/crypto_kem/kyber512/avx2/consts.c index 85731362..a4355cc8 100644 --- a/crypto_kem/kyber512/avx2/consts.c +++ b/crypto_kem/kyber512/avx2/consts.c @@ -1,155 +1,123 @@ +#include "align.h" #include "consts.h" #include "params.h" -#include + #define Q KYBER_Q -#define MONT ((1U << 16) % Q) -#define QINV 62209 // q^-1 mod 2^16 -#define V (((1U << 26) + Q/2)/Q) -#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) -#define FLO (FHI*QINV % 65536) -#define MONTSQHI (MONT*MONT % Q) -#define MONTSQLO (MONTSQHI*QINV % 65536) +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO (-10079) // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI #define MASK 4095 +#define SHIFT 32 - -const qdata_t PQCLEAN_KYBER512_AVX2_qdata = {.as_arr = { -#define _16XQ 0 +const qdata_t PQCLEAN_KYBER512_AVX2_qdata = {.coeffs = { +//#define _16XQ 0 Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, -#define _16XQINV 16 +//#define _16XQINV 16 QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, -#define _16XV 32 +//#define _16XV 32 V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, -#define _16XFLO 48 +//#define _16XFLO 48 FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, -#define _16XFHI 64 +//#define _16XFHI 64 FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, -#define _16XMONTSQLO 80 +//#define _16XMONTSQLO 80 MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, -#define _16XMONTSQHI 96 +//#define _16XMONTSQHI 96 MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, 
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, -#define _16XMASK 112 +//#define _16XMASK 112 MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, -#define _ZETAS_EXP 128 - 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, - 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, - 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, - 3158, 3158, 3158, 3158, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, - 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, - 573, 573, 2004, 2004, 264, 264, 383, 383, - 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, - 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, - 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, - 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, - 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, - 2226, 555, 2078, 1550, 422, 177, 3038, 1574, - 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, - 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, - 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, - 430, 843, 871, 105, 587, 3094, 2869, 1653, - 778, 3182, 1483, 1119, 644, 349, 329, 3254, - 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, - 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, - 48842, 48842, 48842, 48842, 287, 287, 287, 287, - 287, 287, 287, 287, 202, 202, 202, 202, - 202, 202, 202, 202, 10690, 10690, 10690, 10690, - 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, - 31164, 31164, 31164, 31164, 962, 962, 962, 962, - 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, - 1468, 1468, 1468, 1468, 37464, 37464, 
24313, 24313, - 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, - 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, - 732, 732, 608, 608, 1787, 1787, 411, 411, - 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, - 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, - 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, - 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, - 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, - 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, - 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, - 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, - 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, - 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, - 3193, 1994, 220, 1670, 1799, 794, 2475, 478, - 3021, 991, 1869, 1628, 0, 0, 0, 0, +//#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, -#define _ZETAS_INV_EXP 528 - 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, - 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, - 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, - 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, - 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, - 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, - 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, - 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, - 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, - 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, - 951, 247, 1421, 3222, 2499, 271, 90, 853, - 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, - 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, - 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, - 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, - 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, - 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, - 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, - 1202, 
1202, 1202, 1202, 2367, 2367, 2367, 2367, - 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, - 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, - 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, - 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, - 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, - 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, - 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, - 2210, 1846, 147, 2551, 1676, 460, 235, 2742, - 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, - 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, - 45043, 32227, 11478, 335, 156, 2911, 872, 1590, - 602, 777, 2170, 246, 1755, 291, 3152, 2907, - 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, - 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, - 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, - 666, 320, 8, 2813, 1544, 282, 1838, 1293, - 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, - 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, - 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, - 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, - 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, - 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, - 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, - 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, - 171, 171, 171, 171, 12403, 12403, 12403, 12403, - 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, - 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, - 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, - 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, - 60300, 60300, 1932, 1932, 0, 0, 0, 0 +//#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +//#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, 
-359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, 
-10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +//#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT } }; diff --git a/crypto_kem/kyber512/avx2/consts.h b/crypto_kem/kyber512/avx2/consts.h index cd45b1b3..b67005a9 100644 --- a/crypto_kem/kyber512/avx2/consts.h +++ b/crypto_kem/kyber512/avx2/consts.h @@ -1,19 +1,10 @@ #ifndef PQCLEAN_KYBER512_AVX2_CONSTS_H #define PQCLEAN_KYBER512_AVX2_CONSTS_H +#include "align.h" #include "cdecl.h" -#include "params.h" -#include -#include -#define ALIGNED_UINT16_T(N) \ - union { \ - __m256i as_vec; \ - uint16_t as_arr[(N)]; \ - } - -typedef ALIGNED_UINT16_T(928) qdata_t; - +typedef ALIGNED_INT16(640) qdata_t; extern const qdata_t PQCLEAN_KYBER512_AVX2_qdata; #endif diff --git a/crypto_kem/kyber512/avx2/fips202x4.c b/crypto_kem/kyber512/avx2/fips202x4.c index 2a3b5686..e84bf6f5 100644 --- a/crypto_kem/kyber512/avx2/fips202x4.c +++ b/crypto_kem/kyber512/avx2/fips202x4.c @@ -9,22 +9,14 @@ #define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds extern void KeccakF1600_StatePermute4x(__m256i *s); -static inline void store64(uint8_t x[8], uint64_t u) { - 
unsigned int i; - - for (i = 0; i < 8; i++) { - x[i] = u >> 8 * i; - } -} - -static void keccakx4_absorb(__m256i s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen, - uint8_t p) { +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) { size_t i, pos = 0; __m256i t, idx; @@ -39,20 +31,17 @@ static void keccakx4_absorb(__m256i s[25], s[i] = _mm256_xor_si256(s[i], t); pos += 8; } + inlen -= r; KeccakF1600_StatePermute4x(s); - inlen -= r; } - i = 0; - while (inlen >= 8) { + for (i = 0; i < inlen / 8; ++i) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); s[i] = _mm256_xor_si256(s[i], t); - - i++; pos += 8; - inlen -= 8; } + inlen -= 8 * i; if (inlen) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); @@ -75,37 +64,34 @@ static void keccakx4_squeezeblocks(uint8_t *out0, unsigned int r, __m256i s[25]) { unsigned int i; - uint64_t f0, f1, f2, f3; + __m128d t; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); for (i = 0; i < r / 8; ++i) { - f0 = _mm256_extract_epi64(s[i], 0); - f1 = _mm256_extract_epi64(s[i], 1); - f2 = _mm256_extract_epi64(s[i], 2); - f3 = _mm256_extract_epi64(s[i], 3); - store64(out0, f0); - store64(out1, f1); - store64(out2, f2); - store64(out3, f3); - - out0 += 8; - out1 += 8; - out2 += 8; - out3 += 8; + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + _mm_storel_pd((double *)&out0[8 * i], t); + _mm_storeh_pd((double *)&out1[8 * i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); + _mm_storel_pd((double *)&out2[8 * i], t); + _mm_storeh_pd((double *)&out3[8 * i], t); } + out0 += r; + out1 += r; + out2 += r; + out3 += r; --nblocks; } } -void PQCLEAN_KYBER512_AVX2_shake128x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER512_AVX2_shake128x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, 
const uint8_t *in2, const uint8_t *in3, size_t inlen) { - keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); + keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, @@ -114,17 +100,16 @@ void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out3, size_t nblocks, keccakx4_state *state) { - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, - state->s); + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); } -void PQCLEAN_KYBER512_AVX2_shake256x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER512_AVX2_shake256x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { - keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); + keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(uint8_t *out0, @@ -133,8 +118,7 @@ void PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out3, size_t nblocks, keccakx4_state *state) { - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, - state->s); + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); } void PQCLEAN_KYBER512_AVX2_shake128x4(uint8_t *out0, @@ -152,7 +136,7 @@ void PQCLEAN_KYBER512_AVX2_shake128x4(uint8_t *out0, uint8_t t[4][SHAKE128_RATE]; keccakx4_state state; - PQCLEAN_KYBER512_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER512_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE128_RATE; @@ -187,7 +171,7 @@ void PQCLEAN_KYBER512_AVX2_shake256x4(uint8_t *out0, uint8_t t[4][SHAKE256_RATE]; keccakx4_state state; - 
PQCLEAN_KYBER512_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER512_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE256_RATE; diff --git a/crypto_kem/kyber512/avx2/fips202x4.h b/crypto_kem/kyber512/avx2/fips202x4.h index 237e2903..d822b620 100644 --- a/crypto_kem/kyber512/avx2/fips202x4.h +++ b/crypto_kem/kyber512/avx2/fips202x4.h @@ -9,7 +9,7 @@ typedef struct { __m256i s[25]; } keccakx4_state; -void PQCLEAN_KYBER512_AVX2_shake128x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER512_AVX2_shake128x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, @@ -23,7 +23,7 @@ void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, size_t nblocks, keccakx4_state *state); -void PQCLEAN_KYBER512_AVX2_shake256x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER512_AVX2_shake256x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, diff --git a/crypto_kem/kyber512/avx2/fq.S b/crypto_kem/kyber512/avx2/fq.S index 4bb716db..68bf0382 100644 --- a/crypto_kem/kyber512/avx2/fq.S +++ b/crypto_kem/kyber512/avx2/fq.S @@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2,10 -red16 3,11 -red16 4,12 -red16 5,13 -red16 6,14 -red16 7,15 -red16 8,10 -red16 9,11 +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 #store vmovdqa %ymm2,(%rdi) @@ -46,49 +46,6 @@ add $256,%rdi call reduce128_avx ret -csubq128_avx: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1,9 -csubq 2,10 -csubq 3,11 -csubq 4,12 -csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) 
-vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER512_AVX2_csubq_avx) -.global _cdecl(PQCLEAN_KYBER512_AVX2_csubq_avx) -cdecl(PQCLEAN_KYBER512_AVX2_csubq_avx): -_cdecl(PQCLEAN_KYBER512_AVX2_csubq_avx): -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -call csubq128_avx -add $256,%rdi -call csubq128_avx -ret - tomont128_avx: #load vmovdqa (%rdi),%ymm3 diff --git a/crypto_kem/kyber512/avx2/fq.inc b/crypto_kem/kyber512/avx2/fq.inc index 75df098a..4b7afc31 100644 --- a/crypto_kem/kyber512/avx2/fq.inc +++ b/crypto_kem/kyber512/avx2/fq.inc @@ -1,6 +1,10 @@ -.macro red16 r,x=12 +.macro red16 r,rs=0,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else vpsraw $10,%ymm\x,%ymm\x +.endif vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm @@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r -#vpcmpgtw %ymm0,%ymm\r,%ymm\x -#vpand %ymm0,%ymm\x,%ymm\x -#vpsubw %ymm\x,%ymm\r,%ymm\r .endm .macro caddq r,x=12 diff --git a/crypto_kem/kyber512/avx2/indcpa.c b/crypto_kem/kyber512/avx2/indcpa.c index 7a5d74ae..80990ca3 100644 --- a/crypto_kem/kyber512/avx2/indcpa.c +++ b/crypto_kem/kyber512/avx2/indcpa.c @@ -8,6 +8,7 @@ #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include #include #include @@ -15,11 +16,14 @@ * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* serialized vector of polynomials pk -* and the public seed used to generate the matrix A. +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by PQCLEAN_KYBER512_AVX2_polyvec_reduce(). 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, /************************************************* * Name: pack_sk * -* Description: Serialize the secret key +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by PQCLEAN_KYBER512_AVX2_polyvec_reduce(). * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials -* (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER512_AVX2_polyvec_frombytes(sk, packedsk); } @@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, * * 
Description: Serialize the ciphertext as concatenation of the * compressed and serialized vector of polynomials b -* and the compressed and serialized polynomial v +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by PQCLEAN_KYBER512_AVX2_polyvec_reduce() and PQCLEAN_KYBER512_AVX2_poly_reduce(), respectively. * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER512_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER512_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER512_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER512_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, * 
Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -165,60 +169,53 @@ static unsigned int rej_uniform(int16_t *r, * - const uint8_t *seed: pointer to input seed * - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) void PQCLEAN_KYBER512_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { unsigned int ctr0, ctr1, ctr2, ctr3; - ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; + ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * SHAKE128_RATE) buf[4]; 
__m256i f; keccakx4_state state; - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); if (transposed) { - buf.arr[0][KYBER_SYMBYTES + 0] = 0; - buf.arr[0][KYBER_SYMBYTES + 1] = 0; - buf.arr[1][KYBER_SYMBYTES + 0] = 0; - buf.arr[1][KYBER_SYMBYTES + 1] = 1; - buf.arr[2][KYBER_SYMBYTES + 0] = 1; - buf.arr[2][KYBER_SYMBYTES + 1] = 0; - buf.arr[3][KYBER_SYMBYTES + 0] = 1; - buf.arr[3][KYBER_SYMBYTES + 1] = 1; + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 0; + buf[1].coeffs[33] = 1; + buf[2].coeffs[32] = 1; + buf[2].coeffs[33] = 0; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 1; } else { - buf.arr[0][KYBER_SYMBYTES + 0] = 0; - buf.arr[0][KYBER_SYMBYTES + 1] = 0; - buf.arr[1][KYBER_SYMBYTES + 0] = 1; - buf.arr[1][KYBER_SYMBYTES + 1] = 0; - buf.arr[2][KYBER_SYMBYTES + 0] = 0; - buf.arr[2][KYBER_SYMBYTES + 1] = 1; - buf.arr[3][KYBER_SYMBYTES + 0] = 1; - buf.arr[3][KYBER_SYMBYTES + 1] = 1; + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 1; + buf[1].coeffs[33] = 0; + buf[2].coeffs[32] = 0; + buf[2].coeffs[33] = 1; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 1; } - PQCLEAN_KYBER512_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); - PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], GEN_MATRIX_NBLOCKS, - &state); + PQCLEAN_KYBER512_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); + PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, 
&state); - ctr0 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[0].vec[0].coeffs, buf.arr[0]); - ctr1 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[0].vec[1].coeffs, buf.arr[1]); - ctr2 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[1].vec[0].coeffs, buf.arr[2]); - ctr3 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[1].vec[1].coeffs, buf.arr[3]); + ctr0 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[1].vec[0].coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[1].vec[1].coeffs, buf[3].coeffs); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], - XOF_BLOCKBYTES); - ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], - XOF_BLOCKBYTES); - ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], - XOF_BLOCKBYTES); - ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], - XOF_BLOCKBYTES); + ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE); } PQCLEAN_KYBER512_AVX2_poly_nttunpack(&a[0].vec[0]); @@ -241,25 +238,25 @@ void PQCLEAN_KYBER512_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int tr void PQCLEAN_KYBER512_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t 
sk[KYBER_INDCPA_SECRETKEYBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; - const uint8_t *publicseed = buf.arr; - const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + uint8_t buf[2 * KYBER_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf.arr, KYBER_SYMBYTES); - hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); + hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - PQCLEAN_KYBER512_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, e.vec + 0, e.vec + 1, noiseseed, - 0, 1, 2, 3); + PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, e.vec + 0, e.vec + 1, noiseseed, 0, 1, 2, 3); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&skpv); + PQCLEAN_KYBER512_AVX2_polyvec_reduce(&skpv); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&e); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER512_AVX2_poly_tomont(&pkpv.vec[i]); } @@ -276,53 +273,50 @@ void PQCLEAN_KYBER512_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER512_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + uint8_t seed[KYBER_SYMBYTES]; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; - unpack_pk(&pkpv, seed.arr, pk); + unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER512_AVX2_poly_frommsg(&k, m); - gen_at(at, seed.arr); + gen_at(at, seed); - PQCLEAN_KYBER512_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1, coins, - 0, 1, 2, 3); - PQCLEAN_KYBER512_AVX2_poly_getnoise(&epp, coins, 4); + PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1122_4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1, coins, 0, 1, 2, 3); + PQCLEAN_KYBER512_AVX2_poly_getnoise_eta2(&epp, coins, 4); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + 
PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } + PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - - PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(&b); PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(&v); - PQCLEAN_KYBER512_AVX2_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER512_AVX2_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER512_AVX2_poly_add(&v, &v, &epp); PQCLEAN_KYBER512_AVX2_poly_add(&v, &v, &k); - PQCLEAN_KYBER512_AVX2_polyvec_reduce(&bp); + PQCLEAN_KYBER512_AVX2_polyvec_reduce(&b); PQCLEAN_KYBER512_AVX2_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -331,24 +325,24 @@ void PQCLEAN_KYBER512_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER512_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER512_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER512_AVX2_polyvec_ntt(&b); + PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER512_AVX2_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber512/avx2/invntt.S b/crypto_kem/kyber512/avx2/invntt.S index a191797e..7e9400ef 100644 --- a/crypto_kem/kyber512/avx2/invntt.S +++ b/crypto_kem/kyber512/avx2/invntt.S @@ -2,22 +2,21 @@ .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 
+vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm15,%ymm\rh3 vpmulhw %ymm\zh0,%ymm12,%ymm12 @@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 vpmulhw %ymm\zh1,%ymm14,%ymm14 vpmulhw %ymm\zh1,%ymm15,%ymm15 -#reduce vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.text -invntt_levels0t5_avx: -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 -butterfly 4,5,8,9,6,7,10,11,15,3,1,2 +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 -level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 -butterfly 4,5,6,7,8,9,10,11,3,3,2,2 +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 
+vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 shuffle1 4,5,3,5 shuffle1 6,7,4,7 shuffle1 8,9,6,9 shuffle1 10,11,8,11 -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 -#consts -vmovdqa _16XV*2(%rdx),%ymm1 - -butterfly 3,4,6,8,5,7,9,11,10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 +vmovdqa _16XV*2(%rsi),%ymm1 red16 3 shuffle2 3,4,10,4 @@ -87,26 +110,22 @@ shuffle2 6,8,3,8 shuffle2 5,7,6,7 shuffle2 9,11,5,11 -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 -butterfly 10,3,6,5,4,8,7,11,9,9,2,2 - -red16 10 +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 shuffle4 10,3,9,3 shuffle4 6,5,10,5 shuffle4 4,8,6,8 shuffle4 7,11,4,11 -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 +/* level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 -butterfly 9,10,6,4,3,5,8,11,7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,2,2,7,7 red16 9 @@ -115,113 +134,62 @@ shuffle8 6,4,9,4 shuffle8 3,5,6,5 shuffle8 8,11,3,11 -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 +/* level5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 -butterfly 7,9,6,3,10,4,5,11,8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 -red16 7 +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) 
+vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 -ret - -invntt_level6_avx: -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 butterfly 4,5,6,7,8,9,10,11 -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 +.if \off == 0 +red16 4 +.endif -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa 
%ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm +.text .global cdecl(PQCLEAN_KYBER512_AVX2_invntt_avx) .global _cdecl(PQCLEAN_KYBER512_AVX2_invntt_avx) cdecl(PQCLEAN_KYBER512_AVX2_invntt_avx): _cdecl(PQCLEAN_KYBER512_AVX2_invntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_INV_EXP*2,%rsi -call invntt_levels0t5_avx -add $256,%rdi -add $392,%rsi -call invntt_levels0t5_avx -sub $256,%rdi -add $392,%rsi -call invntt_level6_avx + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 ret diff --git a/crypto_kem/kyber512/avx2/kem.c b/crypto_kem/kyber512/avx2/kem.c index a52e286f..290f1f61 100644 --- a/crypto_kem/kyber512/avx2/kem.c +++ b/crypto_kem/kyber512/avx2/kem.c @@ -1,4 +1,3 @@ -#include "align.h" #include "indcpa.h" #include "kem.h" #include "params.h" @@ -15,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER512_AVX2_crypto_kem_keypair(unsigned char 
pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER512_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -40,36 +40,36 @@ int PQCLEAN_KYBER512_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *s * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_AVX2_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; +int PQCLEAN_KYBER512_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t kr[2 * KYBER_SYMBYTES]; - randombytes(buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); /* Don't release system RNG output */ - hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); + hash_h(buf, buf, KYBER_SYMBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER512_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + 
PQCLEAN_KYBER512_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } @@ -80,47 +80,47 @@ int PQCLEAN_KYBER512_AVX2_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER512_AVX2_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER512_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; + uint8_t kr[2 * KYBER_SYMBYTES]; + ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER512_AVX2_indcpa_dec(buf.arr, ct, sk); + PQCLEAN_KYBER512_AVX2_indcpa_dec(buf, ct, sk); /* Multitarget countermeasure for coins + contributory KEM */ for (i = 0; i < KYBER_SYMBYTES; i++) { - buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER512_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER512_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); - fail = PQCLEAN_KYBER512_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); + fail = PQCLEAN_KYBER512_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* Overwrite pre-k with z on re-encryption failure */ - PQCLEAN_KYBER512_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); + PQCLEAN_KYBER512_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * 
KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber512/avx2/ntt.S b/crypto_kem/kyber512/avx2/ntt.S index 02107bc4..a22a14e0 100644 --- a/crypto_kem/kyber512/avx2/ntt.S +++ b/crypto_kem/kyber512/avx2/ntt.S @@ -1,222 +1,191 @@ #include "cdecl.h" .include "shuffle.inc" -.include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 -#mul +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + vpmullw %ymm\zl1,%ymm\rh2,%ymm14 vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 .endm -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce +.macro reduce vpmulhw %ymm0,%ymm12,%ymm12 vpmulhw %ymm0,%ymm13,%ymm13 + vpmulhw %ymm0,%ymm14,%ymm14 vpmulhw %ymm0,%ymm15,%ymm15 +.endm -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 -#update +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 
16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -ntt_level0_avx: -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -ntt_levels1t6_avx: -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11,3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11,7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11,9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11,10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 
-vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11,6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 - -vmovdqa _16XV*2(%rdx),%ymm1 -red16 10,12 -red16 5,13 -red16 9,14 -red16 4,15 -red16 8,2 -red16 3,6 -red16 7,12 -red16 11,13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER512_AVX2_ntt_avx) .global _cdecl(PQCLEAN_KYBER512_AVX2_ntt_avx) cdecl(PQCLEAN_KYBER512_AVX2_ntt_avx): _cdecl(PQCLEAN_KYBER512_AVX2_ntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_EXP*2,%rsi -call ntt_level0_avx -add $128,%rdi -call ntt_level0_avx -sub $128,%rdi -add $8,%rsi -call ntt_levels1t6_avx -add $256,%rdi -add $392,%rsi -call ntt_levels1t6_avx + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + ret diff --git a/crypto_kem/kyber512/avx2/ntt.h b/crypto_kem/kyber512/avx2/ntt.h index 142e282c..399ea645 100644 --- a/crypto_kem/kyber512/avx2/ntt.h +++ b/crypto_kem/kyber512/avx2/ntt.h @@ -1,24 +1,21 @@ #ifndef PQCLEAN_KYBER512_AVX2_NTT_H #define PQCLEAN_KYBER512_AVX2_NTT_H -#include "consts.h" + +#include #include -void PQCLEAN_KYBER512_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); 
+void PQCLEAN_KYBER512_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_basemul_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_basemul_acc_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_basemul_avx(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); -void PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512/avx2/params.h b/crypto_kem/kyber512/avx2/params.h index 5d0b9aae..8fb33943 100644 --- a/crypto_kem/kyber512/avx2/params.h +++ b/crypto_kem/kyber512/avx2/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,9 +14,12 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 -#define KYBER_POLYCOMPRESSEDBYTES 96 +#define KYBER_ETA1 3 +#define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) +#define KYBER_ETA2 2 + #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) diff --git a/crypto_kem/kyber512/avx2/poly.c 
b/crypto_kem/kyber512/avx2/poly.c index c651e73a..07409cc6 100644 --- a/crypto_kem/kyber512/avx2/poly.c +++ b/crypto_kem/kyber512/avx2/poly.c @@ -12,74 +12,89 @@ /************************************************* * Name: PQCLEAN_KYBER512_AVX2_poly_compress * -* Description: Compression and subsequent serialization of a polynomial +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER512_AVX2_poly_reduce(). * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { - unsigned int i, j; - uint8_t t[8]; - - PQCLEAN_KYBER512_AVX2_poly_csubq(a); - - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; - } - - r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); - r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); - r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); - r += 3; - } -} - -/************************************************* -* Name: PQCLEAN_KYBER512_AVX2_poly_decompress -* -* Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of PQCLEAN_KYBER512_AVX2_poly_compress -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array -* (of length KYBER_POLYCOMPRESSEDBYTES bytes) -**************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *restrict r, - const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t r[128], const poly *restrict a) { unsigned int i; + __m256i 
f0, f1, f2, f3; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER512_AVX2_qdata.vec[_16XV / 16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 9); + const __m256i mask = _mm256_set1_epi16(15); + const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1); + const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - unsigned int j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 3); - t[2] = (a[0] >> 6) | (a[1] << 2); - t[3] = (a[1] >> 1); - t[4] = (a[1] >> 4); - t[5] = (a[1] >> 7) | (a[2] << 1); - t[6] = (a[2] >> 2); - t[7] = (a[2] >> 5); - a += 3; - - for (j = 0; j < 8; j++) { - r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; - } + for (i = 0; i < KYBER_N / 64; i++) { + f0 = _mm256_load_si256(&a->vec[4 * i + 0]); + f1 = _mm256_load_si256(&a->vec[4 * i + 1]); + f2 = _mm256_load_si256(&a->vec[4 * i + 2]); + f3 = _mm256_load_si256(&a->vec[4 * i + 3]); + f0 = _mm256_mulhi_epi16(f0, v); + f1 = _mm256_mulhi_epi16(f1, v); + f2 = _mm256_mulhi_epi16(f2, v); + f3 = _mm256_mulhi_epi16(f3, v); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f1 = _mm256_mulhrs_epi16(f1, shift1); + f2 = _mm256_mulhrs_epi16(f2, shift1); + f3 = _mm256_mulhrs_epi16(f3, shift1); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + f2 = _mm256_and_si256(f2, mask); + f3 = _mm256_and_si256(f3, mask); + f0 = _mm256_packus_epi16(f0, f1); + f2 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift2); + f2 = _mm256_maddubs_epi16(f2, shift2); + f0 = _mm256_packus_epi16(f0, f2); + f0 = _mm256_permutevar8x32_epi32(f0, permdidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); } } +void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *restrict r, const uint8_t a[128]) { + unsigned int i; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER512_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, + 3, 3, 3, 3, 2, 2, 2, 2, 
1, 1, 1, 1, 0, 0, 0, 0); + const __m256i mask = _mm256_set1_epi32(0x00F0000F); + const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); + + for (i = 0; i < KYBER_N / 16; i++) { + t = _mm_loadl_epi64((__m128i *)&a[8 * i]); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_and_si256(f, mask); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER512_AVX2_poly_tobytes * -* Description: Serialization of a polynomial +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER512_AVX2_poly_reduce(). The coefficients are orderd as output by +* PQCLEAN_KYBER512_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed +* order. * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { - PQCLEAN_KYBER512_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* @@ -88,12 +103,12 @@ void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER512_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly 
*r, const uint8_t a[KYBER_POLYBYTES]) { - PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* @@ -101,11 +116,10 @@ void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYT * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *restrict r, - const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { +void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); @@ -134,12 +148,12 @@ void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *restrict r, g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + _mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ + _mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ + _mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ + _mm256_store_si256(&r->vec[8+2*(i)+1],g3) - f = _mm256_load_si256((__m256i *)msg); + f = _mm256_loadu_si256((__m256i *)msg); FROMMSG64(0); FROMMSG64(1); FROMMSG64(2); @@ -149,32 +163,34 @@ void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *restrict r, /************************************************* * 
Name: PQCLEAN_KYBER512_AVX2_poly_tomsg * -* Description: Convert polynomial to 32-byte message +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER512_AVX2_poly_reduce(). * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { unsigned int i; uint32_t small; __m256i f0, f1, g0, g1; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); for (i = 0; i < KYBER_N / 32; i++) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); - f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); - f0 = _mm256_sub_epi16(hqs, f0); - f1 = _mm256_sub_epi16(hqs, f1); + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); g0 = _mm256_srai_epi16(f0, 15); g1 = _mm256_srai_epi16(f1, 15); f0 = _mm256_xor_si256(f0, g0); f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(hhqs, f0); - f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); f0 = _mm256_packs_epi16(f0, f1); small = _mm256_movemask_epi8(f0); - small = ~small; msg[4 * i + 0] = small; msg[4 * i + 1] = small >> 16; msg[4 * i + 2] = small >> 8; @@ -183,24 +199,43 @@ void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly * } /************************************************* -* Name: PQCLEAN_KYBER512_AVX2_poly_getnoise +* Name: PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1 * * Description: Sample a 
polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; - prf(buf.arr, sizeof(buf.arr), seed, nonce); - PQCLEAN_KYBER512_AVX2_cbd(r, buf.arr); +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER512_AVX2_poly_cbd_eta1 + prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r, buf.vec); } -void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; + prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta2(r, buf.vec); +} + +#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE) 
+void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, @@ -209,41 +244,78 @@ void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { - ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; + ALIGNED_UINT8(NOISE_NBLOCKS * SHAKE256_RATE) buf[4]; __m256i f; keccakx4_state state; - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - buf.arr[0][32] = nonce0; - buf.arr[1][32] = nonce1; - buf.arr[2][32] = nonce2; - buf.arr[3][32] = nonce3; + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; - PQCLEAN_KYBER512_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); - PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER512_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); - PQCLEAN_KYBER512_AVX2_cbd(r0, buf.arr[0]); - PQCLEAN_KYBER512_AVX2_cbd(r1, buf.arr[1]); - PQCLEAN_KYBER512_AVX2_cbd(r2, buf.arr[2]); - PQCLEAN_KYBER512_AVX2_cbd(r3, buf.arr[3]); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r0, buf[0].vec); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r1, buf[1].vec); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r2, buf[2].vec); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r3, buf[3].vec); +} + +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1122_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t seed[32], + 
uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3) { + ALIGNED_UINT8(NOISE_NBLOCKS * SHAKE256_RATE) buf[4]; + __m256i f; + keccakx4_state state; + + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; + + PQCLEAN_KYBER512_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); + + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r0, buf[0].vec); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta1(r1, buf[1].vec); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta2(r2, buf[2].vec); + PQCLEAN_KYBER512_AVX2_poly_cbd_eta2(r3, buf[3].vec); } /************************************************* * Name: PQCLEAN_KYBER512_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of -* a polynomial in place; -* inputs assumed to be in normal order, output in bitreversed order +* a polynomial in place. +* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. 
* -* Arguments: - uint16_t *r: pointer to in/output polynomial +* Arguments: - poly *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER512_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* @@ -251,29 +323,35 @@ void PQCLEAN_KYBER512_AVX2_poly_ntt(poly *r) { * * Description: Computes inverse of negacyclic number-theoretic transform (NTT) * of a polynomial in place; -* inputs assumed to be in bitreversed order, output in normal order +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. * -* Arguments: - uint16_t *a: pointer to in/output polynomial +* Arguments: - poly *a: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(poly *r) { - PQCLEAN_KYBER512_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } void PQCLEAN_KYBER512_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER512_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery * -* Description: Multiplication of two polynomials in NTT domain +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER512_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* @@ -285,7 +363,7 @@ void PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_tomont(poly *r) { - PQCLEAN_KYBER512_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* @@ -297,28 +375,16 @@ void PQCLEAN_KYBER512_AVX2_poly_tomont(poly *r) { * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER512_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); -} - -/************************************************* -* Name: PQCLEAN_KYBER512_AVX2_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER512_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); + PQCLEAN_KYBER512_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER512_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER512_AVX2_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials. No modular reduction +* is performed. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -326,20 +392,21 @@ void PQCLEAN_KYBER512_AVX2_poly_add(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } /************************************************* * Name: PQCLEAN_KYBER512_AVX2_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials. No modular reduction +* is performed. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -347,10 +414,10 @@ void PQCLEAN_KYBER512_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } diff --git a/crypto_kem/kyber512/avx2/poly.h b/crypto_kem/kyber512/avx2/poly.h index 046bd83f..bf897dbe 100644 --- a/crypto_kem/kyber512/avx2/poly.h +++ b/crypto_kem/kyber512/avx2/poly.h @@ -1,19 +1,13 @@ #ifndef PQCLEAN_KYBER512_AVX2_POLY_H #define PQCLEAN_KYBER512_AVX2_POLY_H +#include "align.h" #include "params.h" #include #include -/* - * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial - * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] - */ -typedef union { - __m256i dummy; - int16_t coeffs[KYBER_N]; -} poly; +typedef ALIGNED_INT16(KYBER_N) poly; -void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); @@ -22,8 +16,11 @@ void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYT void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER512_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); -void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, @@ -33,6 +30,17 @@ void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, uint8_t nonce2, uint8_t nonce3); +void PQCLEAN_KYBER512_AVX2_poly_getnoise_eta1122_4x(poly *r0, + poly *r1, + poly *r2, + poly *r3, + const uint8_t *seed, + uint8_t nonce0, + uint8_t nonce1, + uint8_t nonce2, + uint8_t nonce3); + + void PQCLEAN_KYBER512_AVX2_poly_ntt(poly *r); void PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(poly *r); void PQCLEAN_KYBER512_AVX2_poly_nttunpack(poly *r); @@ -40,7 +48,6 @@ void PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const void PQCLEAN_KYBER512_AVX2_poly_tomont(poly *r); void PQCLEAN_KYBER512_AVX2_poly_reduce(poly *r); -void PQCLEAN_KYBER512_AVX2_poly_csubq(poly *r); void PQCLEAN_KYBER512_AVX2_poly_add(poly *r, const poly 
*a, const poly *b); void PQCLEAN_KYBER512_AVX2_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber512/avx2/polyvec.c b/crypto_kem/kyber512/avx2/polyvec.c index 4cc60aa6..90b28695 100644 --- a/crypto_kem/kyber512/avx2/polyvec.c +++ b/crypto_kem/kyber512/avx2/polyvec.c @@ -3,8 +3,76 @@ #include "params.h" #include "poly.h" #include "polyvec.h" +#include #include +static void poly_compress10(uint8_t r[320], const poly *restrict a) { + size_t i; + uint32_t low; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER512_AVX2_qdata.vec[_16XV / 16]); + const __m256i v8 = _mm256_slli_epi16(v, 3); + const __m256i off = _mm256_set1_epi16(15); + const __m256i shift1 = _mm256_set1_epi16(1 << 12); + const __m256i mask = _mm256_set1_epi16(1023); + const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, + -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0); + + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0, v8); + f2 = _mm256_add_epi16(f0, off); + f0 = _mm256_slli_epi16(f0, 3); + f0 = _mm256_mulhi_epi16(f0, v); + f2 = _mm256_sub_epi16(f1, f2); + f1 = _mm256_andnot_si256(f1, f2); + f1 = _mm256_srli_epi16(f1, 15); + f0 = _mm256_sub_epi16(f0, f1); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f0 = _mm256_and_si256(f0, mask); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f0 = _mm256_srli_epi64(f0, 12); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blend_epi16(t0, t1, 0xE0); + _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); + _mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); + r[20 * i + 16] = (uint8_t)low; + r[20 * i + 17] = (uint8_t)(low >> 0x08); + r[20 * i + 
18] = (uint8_t)(low >> 0x10); + r[20 * i + 19] = (uint8_t)(low >> 0x18); + } +} + +static void poly_decompress10(poly *restrict r, const uint8_t a[320 + 12]) { + size_t i; + __m256i f; + const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4 * KYBER_Q); + const __m256i shufbidx = _mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, + 6, 5, 5, 4, 4, 3, 3, 2, + 9, 8, 8, 7, 7, 6, 6, 5, + 4, 3, 3, 2, 2, 1, 1, 0); + const __m256i sllvdidx = _mm256_set1_epi64x(4); + const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184); + + for (i = 0; i < KYBER_N / 16; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_sllv_epi32(f, sllvdidx); + f = _mm256_srli_epi16(f, 1); + f = _mm256_and_si256(f, mask); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER512_AVX2_polyvec_compress * @@ -14,27 +82,11 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], - polyvec *restrict a) { - size_t i, j, k; +void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { + size_t i; - PQCLEAN_KYBER512_AVX2_polyvec_csubq(a); - - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; - } - - r[0] = (t[0] >> 0); - r[1] = (t[0] >> 8) | (t[1] << 2); - r[2] = (t[1] >> 6) | (t[2] << 4); - r[3] = (t[2] >> 4) | (t[3] << 6); - r[4] = (t[3] >> 2); - r += 5; - } + poly_compress10(&r[320 * i], &a->vec[i]); } } @@ -44,27 +96,15 @@ void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYT * Description: De-serialize and 
decompress vector of polynomials; * approximate inverse of PQCLEAN_KYBER512_AVX2_polyvec_compress * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *restrict r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { - size_t i, j, k; +void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { + size_t i; - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); - t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); - t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); - t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); - a += 5; - - for (k = 0; k < 4; k++) { - r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; - } - } + poly_decompress10(&r->vec[i], &a[320 * i]); } } @@ -90,7 +130,7 @@ void PQCLEAN_KYBER512_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyve * Description: De-serialize vector of polynomials; * inverse of PQCLEAN_KYBER512_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials * (of length KYBER_POLYVECBYTES) **************************************************/ @@ -131,29 +171,34 @@ void PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements in a and b in NTT domain, accumulate into r, * and multiply by 
2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { - PQCLEAN_KYBER512_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + size_t i; + poly tmp; + + PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER512_AVX2_poly_add(r, r, &tmp); + } } /************************************************* * Name: PQCLEAN_KYBER512_AVX2_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_polyvec_reduce(polyvec *r) { size_t i; @@ -162,23 +207,6 @@ void PQCLEAN_KYBER512_AVX2_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER512_AVX2_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void 
PQCLEAN_KYBER512_AVX2_polyvec_csubq(polyvec *r) { - size_t i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_AVX2_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER512_AVX2_polyvec_add * diff --git a/crypto_kem/kyber512/avx2/polyvec.h b/crypto_kem/kyber512/avx2/polyvec.h index 12928a76..0331c954 100644 --- a/crypto_kem/kyber512/avx2/polyvec.h +++ b/crypto_kem/kyber512/avx2/polyvec.h @@ -8,9 +8,8 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); +void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); void PQCLEAN_KYBER512_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); void PQCLEAN_KYBER512_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); @@ -18,12 +17,9 @@ void PQCLEAN_KYBER512_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_P void PQCLEAN_KYBER512_AVX2_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER512_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER512_AVX2_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER512_AVX2_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER512_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber512/avx2/reduce.h b/crypto_kem/kyber512/avx2/reduce.h index fad9114d..4ff82569 100644 --- a/crypto_kem/kyber512/avx2/reduce.h +++ b/crypto_kem/kyber512/avx2/reduce.h @@ -1,10 +1,9 @@ #ifndef PQCLEAN_KYBER512_AVX2_REDUCE_H 
#define PQCLEAN_KYBER512_AVX2_REDUCE_H -#include "consts.h" -#include +#include "params.h" +#include -int16_t PQCLEAN_KYBER512_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); -int16_t PQCLEAN_KYBER512_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); -int16_t PQCLEAN_KYBER512_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); +void PQCLEAN_KYBER512_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER512_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512/avx2/rejsample.c b/crypto_kem/kyber512/avx2/rejsample.c index 52f2c691..6e42856b 100644 --- a/crypto_kem/kyber512/avx2/rejsample.c +++ b/crypto_kem/kyber512/avx2/rejsample.c @@ -4,311 +4,68 @@ #include "rejsample.h" #include #include +#include + +//#define BMI -static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { - {-1, -1, -1, -1, -1, -1, -1, -1}, - { 0, -1, -1, -1, -1, -1, -1, -1}, - { 2, -1, -1, -1, -1, -1, -1, -1}, - { 0, 2, -1, -1, -1, -1, -1, -1}, - { 4, -1, -1, -1, -1, -1, -1, -1}, - { 0, 4, -1, -1, -1, -1, -1, -1}, - { 2, 4, -1, -1, -1, -1, -1, -1}, - { 0, 2, 4, -1, -1, -1, -1, -1}, - { 6, -1, -1, -1, -1, -1, -1, -1}, - { 0, 6, -1, -1, -1, -1, -1, -1}, - { 2, 6, -1, -1, -1, -1, -1, -1}, - { 0, 2, 6, -1, -1, -1, -1, -1}, - { 4, 6, -1, -1, -1, -1, -1, -1}, - { 0, 4, 6, -1, -1, -1, -1, -1}, - { 2, 4, 6, -1, -1, -1, -1, -1}, - { 0, 2, 4, 6, -1, -1, -1, -1}, - { 8, -1, -1, -1, -1, -1, -1, -1}, - { 0, 8, -1, -1, -1, -1, -1, -1}, - { 2, 8, -1, -1, -1, -1, -1, -1}, - { 0, 2, 8, -1, -1, -1, -1, -1}, - { 4, 8, -1, -1, -1, -1, -1, -1}, - { 0, 4, 8, -1, -1, -1, -1, -1}, - { 2, 4, 8, -1, -1, -1, -1, -1}, - { 0, 2, 4, 8, -1, -1, -1, -1}, - { 6, 8, -1, -1, -1, -1, -1, -1}, - { 0, 6, 8, -1, -1, -1, -1, -1}, - { 2, 6, 8, -1, -1, -1, -1, -1}, - { 0, 2, 6, 8, -1, -1, -1, -1}, - { 4, 6, 8, -1, -1, -1, -1, -1}, - { 0, 4, 6, 8, -1, -1, -1, -1}, - { 2, 
4, 6, 8, -1, -1, -1, -1}, - { 0, 2, 4, 6, 8, -1, -1, -1}, - {10, -1, -1, -1, -1, -1, -1, -1}, - { 0, 10, -1, -1, -1, -1, -1, -1}, - { 2, 10, -1, -1, -1, -1, -1, -1}, - { 0, 2, 10, -1, -1, -1, -1, -1}, - { 4, 10, -1, -1, -1, -1, -1, -1}, - { 0, 4, 10, -1, -1, -1, -1, -1}, - { 2, 4, 10, -1, -1, -1, -1, -1}, - { 0, 2, 4, 10, -1, -1, -1, -1}, - { 6, 10, -1, -1, -1, -1, -1, -1}, - { 0, 6, 10, -1, -1, -1, -1, -1}, - { 2, 6, 10, -1, -1, -1, -1, -1}, - { 0, 2, 6, 10, -1, -1, -1, -1}, - { 4, 6, 10, -1, -1, -1, -1, -1}, - { 0, 4, 6, 10, -1, -1, -1, -1}, - { 2, 4, 6, 10, -1, -1, -1, -1}, - { 0, 2, 4, 6, 10, -1, -1, -1}, - { 8, 10, -1, -1, -1, -1, -1, -1}, - { 0, 8, 10, -1, -1, -1, -1, -1}, - { 2, 8, 10, -1, -1, -1, -1, -1}, - { 0, 2, 8, 10, -1, -1, -1, -1}, - { 4, 8, 10, -1, -1, -1, -1, -1}, - { 0, 4, 8, 10, -1, -1, -1, -1}, - { 2, 4, 8, 10, -1, -1, -1, -1}, - { 0, 2, 4, 8, 10, -1, -1, -1}, - { 6, 8, 10, -1, -1, -1, -1, -1}, - { 0, 6, 8, 10, -1, -1, -1, -1}, - { 2, 6, 8, 10, -1, -1, -1, -1}, - { 0, 2, 6, 8, 10, -1, -1, -1}, - { 4, 6, 8, 10, -1, -1, -1, -1}, - { 0, 4, 6, 8, 10, -1, -1, -1}, - { 2, 4, 6, 8, 10, -1, -1, -1}, - { 0, 2, 4, 6, 8, 10, -1, -1}, - {12, -1, -1, -1, -1, -1, -1, -1}, - { 0, 12, -1, -1, -1, -1, -1, -1}, - { 2, 12, -1, -1, -1, -1, -1, -1}, - { 0, 2, 12, -1, -1, -1, -1, -1}, - { 4, 12, -1, -1, -1, -1, -1, -1}, - { 0, 4, 12, -1, -1, -1, -1, -1}, - { 2, 4, 12, -1, -1, -1, -1, -1}, - { 0, 2, 4, 12, -1, -1, -1, -1}, - { 6, 12, -1, -1, -1, -1, -1, -1}, - { 0, 6, 12, -1, -1, -1, -1, -1}, - { 2, 6, 12, -1, -1, -1, -1, -1}, - { 0, 2, 6, 12, -1, -1, -1, -1}, - { 4, 6, 12, -1, -1, -1, -1, -1}, - { 0, 4, 6, 12, -1, -1, -1, -1}, - { 2, 4, 6, 12, -1, -1, -1, -1}, - { 0, 2, 4, 6, 12, -1, -1, -1}, - { 8, 12, -1, -1, -1, -1, -1, -1}, - { 0, 8, 12, -1, -1, -1, -1, -1}, - { 2, 8, 12, -1, -1, -1, -1, -1}, - { 0, 2, 8, 12, -1, -1, -1, -1}, - { 4, 8, 12, -1, -1, -1, -1, -1}, - { 0, 4, 8, 12, -1, -1, -1, -1}, - { 2, 4, 8, 12, -1, -1, -1, -1}, - { 0, 2, 4, 8, 12, -1, -1, -1}, - { 
6, 8, 12, -1, -1, -1, -1, -1}, - { 0, 6, 8, 12, -1, -1, -1, -1}, - { 2, 6, 8, 12, -1, -1, -1, -1}, - { 0, 2, 6, 8, 12, -1, -1, -1}, - { 4, 6, 8, 12, -1, -1, -1, -1}, - { 0, 4, 6, 8, 12, -1, -1, -1}, - { 2, 4, 6, 8, 12, -1, -1, -1}, - { 0, 2, 4, 6, 8, 12, -1, -1}, - {10, 12, -1, -1, -1, -1, -1, -1}, - { 0, 10, 12, -1, -1, -1, -1, -1}, - { 2, 10, 12, -1, -1, -1, -1, -1}, - { 0, 2, 10, 12, -1, -1, -1, -1}, - { 4, 10, 12, -1, -1, -1, -1, -1}, - { 0, 4, 10, 12, -1, -1, -1, -1}, - { 2, 4, 10, 12, -1, -1, -1, -1}, - { 0, 2, 4, 10, 12, -1, -1, -1}, - { 6, 10, 12, -1, -1, -1, -1, -1}, - { 0, 6, 10, 12, -1, -1, -1, -1}, - { 2, 6, 10, 12, -1, -1, -1, -1}, - { 0, 2, 6, 10, 12, -1, -1, -1}, - { 4, 6, 10, 12, -1, -1, -1, -1}, - { 0, 4, 6, 10, 12, -1, -1, -1}, - { 2, 4, 6, 10, 12, -1, -1, -1}, - { 0, 2, 4, 6, 10, 12, -1, -1}, - { 8, 10, 12, -1, -1, -1, -1, -1}, - { 0, 8, 10, 12, -1, -1, -1, -1}, - { 2, 8, 10, 12, -1, -1, -1, -1}, - { 0, 2, 8, 10, 12, -1, -1, -1}, - { 4, 8, 10, 12, -1, -1, -1, -1}, - { 0, 4, 8, 10, 12, -1, -1, -1}, - { 2, 4, 8, 10, 12, -1, -1, -1}, - { 0, 2, 4, 8, 10, 12, -1, -1}, - { 6, 8, 10, 12, -1, -1, -1, -1}, - { 0, 6, 8, 10, 12, -1, -1, -1}, - { 2, 6, 8, 10, 12, -1, -1, -1}, - { 0, 2, 6, 8, 10, 12, -1, -1}, - { 4, 6, 8, 10, 12, -1, -1, -1}, - { 0, 4, 6, 8, 10, 12, -1, -1}, - { 2, 4, 6, 8, 10, 12, -1, -1}, - { 0, 2, 4, 6, 8, 10, 12, -1}, - {14, -1, -1, -1, -1, -1, -1, -1}, - { 0, 14, -1, -1, -1, -1, -1, -1}, - { 2, 14, -1, -1, -1, -1, -1, -1}, - { 0, 2, 14, -1, -1, -1, -1, -1}, - { 4, 14, -1, -1, -1, -1, -1, -1}, - { 0, 4, 14, -1, -1, -1, -1, -1}, - { 2, 4, 14, -1, -1, -1, -1, -1}, - { 0, 2, 4, 14, -1, -1, -1, -1}, - { 6, 14, -1, -1, -1, -1, -1, -1}, - { 0, 6, 14, -1, -1, -1, -1, -1}, - { 2, 6, 14, -1, -1, -1, -1, -1}, - { 0, 2, 6, 14, -1, -1, -1, -1}, - { 4, 6, 14, -1, -1, -1, -1, -1}, - { 0, 4, 6, 14, -1, -1, -1, -1}, - { 2, 4, 6, 14, -1, -1, -1, -1}, - { 0, 2, 4, 6, 14, -1, -1, -1}, - { 8, 14, -1, -1, -1, -1, -1, -1}, - { 0, 8, 14, -1, -1, -1, -1, -1}, - 
{ 2, 8, 14, -1, -1, -1, -1, -1}, - { 0, 2, 8, 14, -1, -1, -1, -1}, - { 4, 8, 14, -1, -1, -1, -1, -1}, - { 0, 4, 8, 14, -1, -1, -1, -1}, - { 2, 4, 8, 14, -1, -1, -1, -1}, - { 0, 2, 4, 8, 14, -1, -1, -1}, - { 6, 8, 14, -1, -1, -1, -1, -1}, - { 0, 6, 8, 14, -1, -1, -1, -1}, - { 2, 6, 8, 14, -1, -1, -1, -1}, - { 0, 2, 6, 8, 14, -1, -1, -1}, - { 4, 6, 8, 14, -1, -1, -1, -1}, - { 0, 4, 6, 8, 14, -1, -1, -1}, - { 2, 4, 6, 8, 14, -1, -1, -1}, - { 0, 2, 4, 6, 8, 14, -1, -1}, - {10, 14, -1, -1, -1, -1, -1, -1}, - { 0, 10, 14, -1, -1, -1, -1, -1}, - { 2, 10, 14, -1, -1, -1, -1, -1}, - { 0, 2, 10, 14, -1, -1, -1, -1}, - { 4, 10, 14, -1, -1, -1, -1, -1}, - { 0, 4, 10, 14, -1, -1, -1, -1}, - { 2, 4, 10, 14, -1, -1, -1, -1}, - { 0, 2, 4, 10, 14, -1, -1, -1}, - { 6, 10, 14, -1, -1, -1, -1, -1}, - { 0, 6, 10, 14, -1, -1, -1, -1}, - { 2, 6, 10, 14, -1, -1, -1, -1}, - { 0, 2, 6, 10, 14, -1, -1, -1}, - { 4, 6, 10, 14, -1, -1, -1, -1}, - { 0, 4, 6, 10, 14, -1, -1, -1}, - { 2, 4, 6, 10, 14, -1, -1, -1}, - { 0, 2, 4, 6, 10, 14, -1, -1}, - { 8, 10, 14, -1, -1, -1, -1, -1}, - { 0, 8, 10, 14, -1, -1, -1, -1}, - { 2, 8, 10, 14, -1, -1, -1, -1}, - { 0, 2, 8, 10, 14, -1, -1, -1}, - { 4, 8, 10, 14, -1, -1, -1, -1}, - { 0, 4, 8, 10, 14, -1, -1, -1}, - { 2, 4, 8, 10, 14, -1, -1, -1}, - { 0, 2, 4, 8, 10, 14, -1, -1}, - { 6, 8, 10, 14, -1, -1, -1, -1}, - { 0, 6, 8, 10, 14, -1, -1, -1}, - { 2, 6, 8, 10, 14, -1, -1, -1}, - { 0, 2, 6, 8, 10, 14, -1, -1}, - { 4, 6, 8, 10, 14, -1, -1, -1}, - { 0, 4, 6, 8, 10, 14, -1, -1}, - { 2, 4, 6, 8, 10, 14, -1, -1}, - { 0, 2, 4, 6, 8, 10, 14, -1}, - {12, 14, -1, -1, -1, -1, -1, -1}, - { 0, 12, 14, -1, -1, -1, -1, -1}, - { 2, 12, 14, -1, -1, -1, -1, -1}, - { 0, 2, 12, 14, -1, -1, -1, -1}, - { 4, 12, 14, -1, -1, -1, -1, -1}, - { 0, 4, 12, 14, -1, -1, -1, -1}, - { 2, 4, 12, 14, -1, -1, -1, -1}, - { 0, 2, 4, 12, 14, -1, -1, -1}, - { 6, 12, 14, -1, -1, -1, -1, -1}, - { 0, 6, 12, 14, -1, -1, -1, -1}, - { 2, 6, 12, 14, -1, -1, -1, -1}, - { 0, 2, 6, 12, 14, -1, -1, -1}, - 
{ 4, 6, 12, 14, -1, -1, -1, -1}, - { 0, 4, 6, 12, 14, -1, -1, -1}, - { 2, 4, 6, 12, 14, -1, -1, -1}, - { 0, 2, 4, 6, 12, 14, -1, -1}, - { 8, 12, 14, -1, -1, -1, -1, -1}, - { 0, 8, 12, 14, -1, -1, -1, -1}, - { 2, 8, 12, 14, -1, -1, -1, -1}, - { 0, 2, 8, 12, 14, -1, -1, -1}, - { 4, 8, 12, 14, -1, -1, -1, -1}, - { 0, 4, 8, 12, 14, -1, -1, -1}, - { 2, 4, 8, 12, 14, -1, -1, -1}, - { 0, 2, 4, 8, 12, 14, -1, -1}, - { 6, 8, 12, 14, -1, -1, -1, -1}, - { 0, 6, 8, 12, 14, -1, -1, -1}, - { 2, 6, 8, 12, 14, -1, -1, -1}, - { 0, 2, 6, 8, 12, 14, -1, -1}, - { 4, 6, 8, 12, 14, -1, -1, -1}, - { 0, 4, 6, 8, 12, 14, -1, -1}, - { 2, 4, 6, 8, 12, 14, -1, -1}, - { 0, 2, 4, 6, 8, 12, 14, -1}, - {10, 12, 14, -1, -1, -1, -1, -1}, - { 0, 10, 12, 14, -1, -1, -1, -1}, - { 2, 10, 12, 14, -1, -1, -1, -1}, - { 0, 2, 10, 12, 14, -1, -1, -1}, - { 4, 10, 12, 14, -1, -1, -1, -1}, - { 0, 4, 10, 12, 14, -1, -1, -1}, - { 2, 4, 10, 12, 14, -1, -1, -1}, - { 0, 2, 4, 10, 12, 14, -1, -1}, - { 6, 10, 12, 14, -1, -1, -1, -1}, - { 0, 6, 10, 12, 14, -1, -1, -1}, - { 2, 6, 10, 12, 14, -1, -1, -1}, - { 0, 2, 6, 10, 12, 14, -1, -1}, - { 4, 6, 10, 12, 14, -1, -1, -1}, - { 0, 4, 6, 10, 12, 14, -1, -1}, - { 2, 4, 6, 10, 12, 14, -1, -1}, - { 0, 2, 4, 6, 10, 12, 14, -1}, - { 8, 10, 12, 14, -1, -1, -1, -1}, - { 0, 8, 10, 12, 14, -1, -1, -1}, - { 2, 8, 10, 12, 14, -1, -1, -1}, - { 0, 2, 8, 10, 12, 14, -1, -1}, - { 4, 8, 10, 12, 14, -1, -1, -1}, - { 0, 4, 8, 10, 12, 14, -1, -1}, - { 2, 4, 8, 10, 12, 14, -1, -1}, - { 0, 2, 4, 8, 10, 12, 14, -1}, - { 6, 8, 10, 12, 14, -1, -1, -1}, - { 0, 6, 8, 10, 12, 14, -1, -1}, - { 2, 6, 8, 10, 12, 14, -1, -1}, - { 0, 2, 6, 8, 10, 12, 14, -1}, - { 4, 6, 8, 10, 12, 14, -1, -1}, - { 0, 4, 6, 8, 10, 12, 14, -1}, - { 2, 4, 6, 8, 10, 12, 14, -1}, - { 0, 2, 4, 6, 8, 10, 12, 14} - } -}; #define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -#define REJ_UNIFORM_BUFLEN 672 -unsigned int 
PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *restrict r, - const uint8_t *restrict buf) { +unsigned int PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; uint32_t good; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); + uint64_t idx0, idx1, idx2, idx3; + const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER512_AVX2_qdata.vec[_16XQ / 16]); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER512_AVX2_qdata.as_arr[_16XQ]); - const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER512_AVX2_qdata.as_arr[_16XV]); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11, 10, 10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); __m256i f0, f1, g0, g1, g2, g3; __m128i f, t, pilo, pihi; - ctr = 0; - for (pos = 0; pos < 2 * KYBER_N; pos += 64) { - f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); - f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); + ctr = pos = 0; + while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { + f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; - g0 = _mm256_cmpge_epu16(bound, f0); - g1 = _mm256_cmpge_epu16(bound, f1); + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = _mm256_cmpgt_epi16(bound, f1); g0 = _mm256_packs_epi16(g0, g1); good = _mm256_movemask_epi8(g0); - g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 
0xFF])); - g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); - g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); - g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); - //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); - //g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); - - /* Barrett reduction of (still unsigned) values */ - g2 = _mm256_mulhi_epu16(f0, v); - g3 = _mm256_mulhi_epu16(f1, v); - g2 = _mm256_srli_epi16(g2, 10); - g3 = _mm256_srli_epi16(g3, 10); - g2 = _mm256_mullo_epi16(g2, kyberq); - g3 = _mm256_mullo_epi16(g3, kyberq); - f0 = _mm256_sub_epi16(f0, g2); - f1 = _mm256_sub_epi16(f1, g3); + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); g2 = _mm256_add_epi8(g0, ones); g3 = _mm256_add_epi8(g1, ones); @@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *restrict r, ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { - f = _mm_load_si128((__m128i *)&buf[pos]); - t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { + f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = 
_mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); good = _mm_movemask_epi8(t); - good = _pext_u32(good, 0x5555); - pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); + + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - - /* Barrett reduction */ - t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); - t = _mm_srli_epi16(t, 10); - t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); - f = _mm_sub_epi16(f, t); - f = _mm_shuffle_epi8(f, pilo); _mm_storeu_si128((__m128i *)&r[ctr], f); ctr += _mm_popcnt_u32(good); - pos += 16; } - while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (val1 < KYBER_Q && ctr < KYBER_N) { + r[ctr++] = val1; } } diff --git a/crypto_kem/kyber512/avx2/rejsample.h b/crypto_kem/kyber512/avx2/rejsample.h index dba49d6d..1a902140 100644 --- a/crypto_kem/kyber512/avx2/rejsample.h +++ b/crypto_kem/kyber512/avx2/rejsample.h @@ -1,9 +1,12 @@ #ifndef PQCLEAN_KYBER512_AVX2_REJSAMPLE_H #define PQCLEAN_KYBER512_AVX2_REJSAMPLE_H #include "params.h" +#include "symmetric.h" #include -unsigned int PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *r, - const unsigned char *buf); +#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q 
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) + +unsigned int PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); #endif diff --git a/crypto_kem/kyber512/avx2/shuffle.S b/crypto_kem/kyber512/avx2/shuffle.S index 997d4bb5..ed296396 100644 --- a/crypto_kem/kyber512/avx2/shuffle.S +++ b/crypto_kem/kyber512/avx2/shuffle.S @@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 #csubq csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,1 +csubq 6,13 +csubq 7,13 +csubq 8,13 csubq 9,13 -csubq 10,14 -csubq 11,15 -csubq 12,1 +csubq 10,13 +csubq 11,13 +csubq 12,13 #bitpack vpsllw $12,%ymm6,%ymm4 diff --git a/crypto_kem/kyber512/avx2/shuffle.inc b/crypto_kem/kyber512/avx2/shuffle.inc index d4b092bc..73e9ffe0 100644 --- a/crypto_kem/kyber512/avx2/shuffle.inc +++ b/crypto_kem/kyber512/avx2/shuffle.inc @@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_kem/kyber512/avx2/symmetric-shake.c b/crypto_kem/kyber512/avx2/symmetric-shake.c index 99b7921b..181f686b 100644 --- a/crypto_kem/kyber512/avx2/symmetric-shake.c +++ b/crypto_kem/kyber512/avx2/symmetric-shake.c @@ -9,12 +9,10 @@ * * Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
* -* Arguments: - xof_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *seed: pointer to KYBER_SYMBYTES input -* to be absorbed into state -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input **************************************************/ void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(xof_state *state, const uint8_t seed[KYBER_SYMBYTES], @@ -26,8 +24,8 @@ void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(xof_state *state, for (i = 0; i < KYBER_SYMBYTES; i++) { extseed[i] = seed[i]; } - extseed[i++] = x; - extseed[i] = y; + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; shake128_absorb(state, extseed, sizeof(extseed)); } @@ -38,23 +36,19 @@ void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(xof_state *state, * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input * and then generates outlen bytes of SHAKE256 output * -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t *key: pointer to the key -* (of length KYBER_SYMBYTES) -* - uint8_t nonce: single-byte nonce (public PRF input) +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) **************************************************/ -void PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce) { +void PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { unsigned int i; uint8_t 
extkey[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey[i] = key[i]; } - extkey[i] = nonce; + extkey[KYBER_SYMBYTES] = nonce; shake256(out, outlen, extkey, sizeof(extkey)); } diff --git a/crypto_kem/kyber512/avx2/symmetric.h b/crypto_kem/kyber512/avx2/symmetric.h index bc2e5a4c..ccab88a4 100644 --- a/crypto_kem/kyber512/avx2/symmetric.h +++ b/crypto_kem/kyber512/avx2/symmetric.h @@ -15,21 +15,16 @@ void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(xof_state *s, uint8_t x, uint8_t y); -void PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce); +void PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) #define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) diff --git a/crypto_kem/kyber512/avx2/verify.c b/crypto_kem/kyber512/avx2/verify.c index 1ce71b35..0a8d88fc 100644 --- a/crypto_kem/kyber512/avx2/verify.c +++ b/crypto_kem/kyber512/avx2/verify.c @@ -8,31 +8,31 @@ * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const unsigned char *a: pointer to first byte array -* const unsigned char *b: pointer to second byte array -* size_t len: length of the byte arrays +* Arguments: const uint8_t *a: pointer to first byte array +* const uint8_t *b: pointer to second byte array +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ int PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; + size_t i; uint64_t r; - __m256i avec, bvec, cvec; + __m256i f, g, h; - cvec = _mm256_setzero_si256(); - for (pos = 0; pos + 32 <= len; pos += 32) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - avec = _mm256_xor_si256(avec, bvec); - cvec = _mm256_or_si256(cvec, avec); + h = _mm256_setzero_si256(); + for (i = 0; i < len / 32; i++) { + f = _mm256_loadu_si256((__m256i *)&a[32 * i]); + g = _mm256_loadu_si256((__m256i *)&b[32 * i]); + f = _mm256_xor_si256(f, g); + h = _mm256_or_si256(h, f); } - r = 1 - _mm256_testz_si256(cvec, cvec); + r = 1 - _mm256_testz_si256(h, h); - if (pos < len) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - cvec = _mm256_cmpeq_epi8(avec, bvec); - r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); + a += 32 * i; + b += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; } r = (-r) >> 63; @@ -47,29 +47,27 @@ int PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: unsigned char *r: pointer to output byte array +* Arguments: unsigned char *r: pointer to output byte array * const unsigned char *x: pointer to input byte array -* size_t len: Amount of bytes to be copied -* unsigned char b: Condition bit; has to be in {0,1} +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER512_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER512_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; __m256i xvec, rvec, bvec; - b = -b; - bvec = _mm256_set1_epi8(b); - - for (pos = 0; pos + 32 <= len; pos += 32) { - rvec = _mm256_loadu_si256((__m256i *)&r[pos]); - xvec = _mm256_loadu_si256((__m256i *)&x[pos]); - xvec = _mm256_xor_si256(xvec, rvec); - xvec = _mm256_and_si256(xvec, bvec); - rvec = _mm256_xor_si256(rvec, xvec); - _mm256_storeu_si256((__m256i *)&r[pos], rvec); + bvec = _mm256_set1_epi64x(-(uint64_t)b); + for (i = 0; i < len / 32; i++) { + rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); + xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); + rvec = _mm256_blendv_epi8(rvec, xvec, bvec); + _mm256_storeu_si256((__m256i *)&r[32 * i], rvec); } - while (pos < len) { - r[pos] ^= b & (x[pos] ^ r[pos]); - pos += 1; + r += 32 * i; + x += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r[i] ^= -b & (x[i] ^ r[i]); } } diff --git a/crypto_kem/kyber512/clean/api.h b/crypto_kem/kyber512/clean/api.h index e14c7d96..29817fe3 100644 --- a/crypto_kem/kyber512/clean/api.h +++ b/crypto_kem/kyber512/clean/api.h @@ -5,7 +5,7 @@ #define PQCLEAN_KYBER512_CLEAN_CRYPTO_SECRETKEYBYTES 1632 #define PQCLEAN_KYBER512_CLEAN_CRYPTO_PUBLICKEYBYTES 800 -#define PQCLEAN_KYBER512_CLEAN_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_KYBER512_CLEAN_CRYPTO_CIPHERTEXTBYTES 768 #define PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES 32 #define 
PQCLEAN_KYBER512_CLEAN_CRYPTO_ALGNAME "Kyber512" diff --git a/crypto_kem/kyber512/clean/cbd.c b/crypto_kem/kyber512/clean/cbd.c index cd88cf05..ff661320 100644 --- a/crypto_kem/kyber512/clean/cbd.c +++ b/crypto_kem/kyber512/clean/cbd.c @@ -5,7 +5,7 @@ /************************************************* * Name: load32_littleendian * -* Description: load bytes into a 32-bit integer +* Description: load 4 bytes into a 32-bit integer * in little-endian order * * Arguments: - const uint8_t *x: pointer to input byte array @@ -22,16 +22,36 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { } /************************************************* -* Name: PQCLEAN_KYBER512_CLEAN_cbd +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + + +/************************************************* +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { +static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; int16_t a, b; @@ -48,3 +68,41 @@ void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, 
const uint8_t buf[KYBER_ETA * KYBER_N / } } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ +static void cbd3(poly *r, const uint8_t buf[3 * KYBER_N / 4]) { + unsigned int i, j; + uint32_t t, d; + int16_t a, b; + + for (i = 0; i < KYBER_N / 4; i++) { + t = load24_littleendian(buf + 3 * i); + d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + a = (d >> (6 * j + 0)) & 0x7; + b = (d >> (6 * j + 3)) & 0x7; + r->coeffs[4 * i + j] = a - b; + } + } +} + +void PQCLEAN_KYBER512_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + cbd3(r, buf); +} + +void PQCLEAN_KYBER512_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber512/clean/cbd.h b/crypto_kem/kyber512/clean/cbd.h index b3d184ba..fc8788b3 100644 --- a/crypto_kem/kyber512/clean/cbd.h +++ b/crypto_kem/kyber512/clean/cbd.h @@ -4,6 +4,8 @@ #include "poly.h" #include -void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER512_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); + +void PQCLEAN_KYBER512_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber512/clean/indcpa.c b/crypto_kem/kyber512/clean/indcpa.c index 8da03b43..a413614d 100644 --- a/crypto_kem/kyber512/clean/indcpa.c +++ b/crypto_kem/kyber512/clean/indcpa.c @@ -15,8 +15,8 @@ * serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key -* polynomial vector -* - uint8_t *seed: pointer to output seed to generate -* matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ static void unpack_pk(polyvec *pk, @@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, * * Description: Serialize the secret key * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of -* polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key 
**************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, * and the compressed and serialized polynomial v * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER512_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER512_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER512_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER512_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on 
uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= (val >> 12) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T -* is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) // Not static for benchmarking void PQCLEAN_KYBER512_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -182,12 +173,17 @@ void PQCLEAN_KYBER512_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMB } xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); while (ctr < KYBER_N) { - xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + xof_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); } xof_ctx_release(&state); } @@ -220,10 +216,10 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE gen_a(a, publicseed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); + 
PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); + PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&skpv); @@ -231,7 +227,7 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER512_CLEAN_poly_tomont(&pkpv.vec[i]); } @@ -248,16 +244,15 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], @@ -266,7 +261,7 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t 
c[KYBER_INDCPA_BYTES], unsigned int i; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; unpack_pk(&pkpv, seed, pk); @@ -274,32 +269,32 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], gen_at(at, seed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); + PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); + PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); } - PQCLEAN_KYBER512_CLEAN_poly_getnoise(&epp, coins, nonce++); + PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(&b); PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(&v); - PQCLEAN_KYBER512_CLEAN_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER512_CLEAN_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER512_CLEAN_poly_add(&v, &v, &epp); PQCLEAN_KYBER512_CLEAN_poly_add(&v, &v, &k); - PQCLEAN_KYBER512_CLEAN_polyvec_reduce(&bp); + PQCLEAN_KYBER512_CLEAN_polyvec_reduce(&b); PQCLEAN_KYBER512_CLEAN_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -308,24 +303,24 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme 
underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER512_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&b); + PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER512_CLEAN_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber512/clean/kem.c b/crypto_kem/kyber512/clean/kem.c index db097438..5d7ffc2f 100644 --- a/crypto_kem/kyber512/clean/kem.c +++ b/crypto_kem/kyber512/clean/kem.c @@ -14,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int 
PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER512_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,17 +40,17 @@ int PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char * * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { +int PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; @@ -79,19 +80,19 @@ int PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of 
KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; uint8_t buf[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber512/clean/ntt.c b/crypto_kem/kyber512/clean/ntt.c index 44e94b9a..e284a8c5 100644 --- a/crypto_kem/kyber512/clean/ntt.c +++ b/crypto_kem/kyber512/clean/ntt.c @@ -3,11 +3,11 @@ #include "reduce.h" #include -/* Code to generate PQCLEAN_KYBER512_CLEAN_zetas and PQCLEAN_KYBER512_CLEAN_zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER512_CLEAN_zetas and zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 -static const uint16_t tree[128] = { +static const uint8_t tree[128] = { 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, 4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, 2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, @@ -19,51 +19,41 @@ static const uint16_t tree[128] = { }; void init_ntt() { - unsigned int i, j, k; + unsigned int i; int16_t tmp[128]; tmp[0] = MONT; - for(i = 1; i < 128; ++i) - tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); + for(i=1;i<128;i++) + tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); - for(i = 0; i < 128; ++i) + for(i=0;i<128;i++) { PQCLEAN_KYBER512_CLEAN_zetas[i] = tmp[tree[i]]; - - k = 0; - for(i = 64; i >= 1; i >>= 1) - for(j = i; j < 2*i; ++j) - PQCLEAN_KYBER512_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - - PQCLEAN_KYBER512_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + if(PQCLEAN_KYBER512_CLEAN_zetas[i] > KYBER_Q/2) + 
PQCLEAN_KYBER512_CLEAN_zetas[i] -= KYBER_Q; + if(PQCLEAN_KYBER512_CLEAN_zetas[i] < -KYBER_Q/2) + PQCLEAN_KYBER512_CLEAN_zetas[i] += KYBER_Q; + } } - */ const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, - 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, - 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, - 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, - 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, - 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, - 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, - 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, - 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 -}; - -const int16_t PQCLEAN_KYBER512_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, - 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, - 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, - 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, - 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, - 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, - 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, - 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, - 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 -}; + -1044, -758, -359, -1517, 1493, 1422, 287, 202, + -171, 622, 1577, 182, 962, -1202, -1474, 1468, + 573, -1325, 264, 383, -829, 1458, -1602, -130, + -681, 1017, 732, 608, -1542, 411, -205, -1571, + 1223, 652, -552, 1015, -1293, 1491, 
-282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -1103, 430, 555, 843, -1251, 871, 1550, 105, + 422, 587, 177, -235, -291, -460, 1574, 1653, + -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, + 817, 1097, 603, 610, 1322, -1285, -1465, 384, + -1215, -136, 1218, -1335, -874, 220, -1187, -1659, + -1185, -1530, -1278, 794, -1510, -854, -870, 478, + -108, -308, 996, 991, 958, -1460, 1522, 1628 + }; /************************************************* * Name: fqmul @@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { /************************************************* * Name: PQCLEAN_KYBER512_CLEAN_ntt * -* Description: Inplace number-theoretic transform (NTT) in Rq +* Description: Inplace number-theoretic transform (NTT) in Rq. * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER512_CLEAN_ntt(int16_t r[256]) { unsigned int len, start, j, k; @@ -96,7 +85,7 @@ void PQCLEAN_KYBER512_CLEAN_ntt(int16_t r[256]) { for (len = 128; len >= 2; len >>= 1) { for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER512_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { + for (j = start; j < start + len; j++) { t = fqmul(zeta, r[j + len]); r[j + len] = r[j] - t; r[j] = r[j] + t; @@ -112,28 +101,28 @@ void PQCLEAN_KYBER512_CLEAN_ntt(int16_t r[256]) { * multiplication by Montgomery factor 2^16. 
* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER512_CLEAN_invntt(int16_t r[256]) { unsigned int start, len, j, k; int16_t t, zeta; + const int16_t f = 1441; // mont^2/128 - k = 0; + k = 127; for (len = 2; len <= 128; len <<= 1) { for (start = 0; start < 256; start = j + len) { - zeta = PQCLEAN_KYBER512_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { + zeta = PQCLEAN_KYBER512_CLEAN_zetas[k--]; + for (j = start; j < start + len; j++) { t = r[j]; r[j] = PQCLEAN_KYBER512_CLEAN_barrett_reduce(t + r[j + len]); - r[j + len] = t - r[j + len]; + r[j + len] = r[j + len] - t; r[j + len] = fqmul(zeta, r[j + len]); } } } - for (j = 0; j < 256; ++j) { - r[j] = fqmul(r[j], PQCLEAN_KYBER512_CLEAN_zetas_inv[127]); + for (j = 0; j < 256; j++) { + r[j] = fqmul(r[j], f); } } @@ -143,19 +132,15 @@ void PQCLEAN_KYBER512_CLEAN_invntt(int16_t r[256]) { * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta) { +void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); r[1] += fqmul(a[1], b[0]); } diff 
--git a/crypto_kem/kyber512/clean/ntt.h b/crypto_kem/kyber512/clean/ntt.h index 34c43c17..d25bc40c 100644 --- a/crypto_kem/kyber512/clean/ntt.h +++ b/crypto_kem/kyber512/clean/ntt.h @@ -5,15 +5,10 @@ extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas_inv[128]; - void PQCLEAN_KYBER512_CLEAN_ntt(int16_t r[256]); void PQCLEAN_KYBER512_CLEAN_invntt(int16_t r[256]); -void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta); +void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber512/clean/params.h b/crypto_kem/kyber512/clean/params.h index c774d397..f2c7efc7 100644 --- a/crypto_kem/kyber512/clean/params.h +++ b/crypto_kem/kyber512/clean/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,20 +14,20 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 -#define KYBER_POLYCOMPRESSEDBYTES 96 +#define KYBER_ETA1 3 +#define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) -#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_ETA2 2 + +#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ - + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) /* 32 bytes of additional space to save H(pk) */ -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ - + KYBER_INDCPA_PUBLICKEYBYTES \ - + 2*KYBER_SYMBYTES) -#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES 
+#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) +#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) #endif diff --git a/crypto_kem/kyber512/clean/poly.c b/crypto_kem/kyber512/clean/poly.c index 0bc99a97..14cac577 100644 --- a/crypto_kem/kyber512/clean/poly.c +++ b/crypto_kem/kyber512/clean/poly.c @@ -13,23 +13,26 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { +void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { size_t i, j; + int16_t u; uint8_t t[8]; - PQCLEAN_KYBER512_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; + // map to positive standard representatives + u = a->coeffs[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint16_t)u << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } - r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); - r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); - r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); - r += 3; + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; } } @@ -39,29 +42,17 @@ void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], * Description: De-serialization and subsequent decompression of a polynomial; * approximate inverse of PQCLEAN_KYBER512_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYCOMPRESSEDBYTES bytes) 
**************************************************/ void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { size_t i; - size_t j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 3); - t[2] = (a[0] >> 6) | (a[1] << 2); - t[3] = (a[1] >> 1); - t[4] = (a[1] >> 4); - t[5] = (a[1] >> 7) | (a[2] << 1); - t[6] = (a[2] >> 2); - t[7] = (a[2] >> 5); - a += 3; - - for (j = 0; j < 8; j++) { - r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; - } + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; + a += 1; } } @@ -72,20 +63,21 @@ void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYC * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { +void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { size_t i; uint16_t t0, t1; - PQCLEAN_KYBER512_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 2; i++) { - t0 = a->coeffs[2 * i]; + // map to positive standard representatives + t0 = a->coeffs[2 * i]; + t0 += ((int16_t)t0 >> 15) & KYBER_Q; t1 = a->coeffs[2 * i + 1]; - r[3 * i + 0] = (uint8_t)(t0 >> 0); - r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + t1 += ((int16_t)t1 >> 15) & KYBER_Q; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } @@ -95,7 +87,7 @@ void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER512_CLEAN_poly_tobytes * -* Arguments: - poly 
*r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ @@ -112,7 +104,7 @@ void PQCLEAN_KYBER512_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { @@ -133,41 +125,60 @@ void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA * Description: Convert polynomial to 32-byte message * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { +void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { size_t i, j; uint16_t t; - PQCLEAN_KYBER512_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { msg[i] = 0; for (j = 0; j < 8; j++) { - t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + t = a->coeffs[8 * i + j]; + t += ((int16_t)t >> 15) & KYBER_Q; + t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; msg[i] |= t << j; } } } /************************************************* -* Name: PQCLEAN_KYBER512_CLEAN_poly_getnoise +* Name: PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: 
pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; +void PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; prf(buf, sizeof(buf), seed, nonce); - PQCLEAN_KYBER512_CLEAN_cbd(r, buf); + PQCLEAN_KYBER512_CLEAN_poly_cbd_eta1(r, buf); } +/************************************************* +* Name: PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; + prf(buf, sizeof(buf), seed, nonce); + PQCLEAN_KYBER512_CLEAN_poly_cbd_eta2(r, buf); +} + + /************************************************* * Name: PQCLEAN_KYBER512_CLEAN_poly_ntt * @@ -200,7 +211,7 @@ void PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(poly *r) { * * Description: Multiplication of two polynomials in NTT domain * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -208,8 +219,7 @@ void 
PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, cons size_t i; for (i = 0; i < KYBER_N / 4; i++) { PQCLEAN_KYBER512_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER512_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], - -PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER512_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); } } @@ -244,28 +254,12 @@ void PQCLEAN_KYBER512_CLEAN_poly_reduce(poly *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER512_CLEAN_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_csubq(poly *r) { - size_t i; - for (i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER512_CLEAN_csubq(r->coeffs[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER512_CLEAN_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials; no modular reduction is performed * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -279,7 +273,7 @@ void PQCLEAN_KYBER512_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { /************************************************* * Name: PQCLEAN_KYBER512_CLEAN_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials; no modular reduction is performed * * Arguments: - poly *r: pointer to output polynomial * - 
const poly *a: pointer to first input polynomial diff --git a/crypto_kem/kyber512/clean/poly.h b/crypto_kem/kyber512/clean/poly.h index 1446d212..5ca491d1 100644 --- a/crypto_kem/kyber512/clean/poly.h +++ b/crypto_kem/kyber512/clean/poly.h @@ -11,16 +11,18 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); -void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); +void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); void PQCLEAN_KYBER512_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); -void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); +void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); -void PQCLEAN_KYBER512_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER512_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER512_CLEAN_poly_ntt(poly *r); void PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(poly *r); @@ -28,7 +30,6 @@ void PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, cons void PQCLEAN_KYBER512_CLEAN_poly_tomont(poly *r); void PQCLEAN_KYBER512_CLEAN_poly_reduce(poly *r); -void PQCLEAN_KYBER512_CLEAN_poly_csubq(poly *r); void PQCLEAN_KYBER512_CLEAN_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER512_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber512/clean/polyvec.c 
b/crypto_kem/kyber512/clean/polyvec.c index 29eee439..9f169f38 100644 --- a/crypto_kem/kyber512/clean/polyvec.c +++ b/crypto_kem/kyber512/clean/polyvec.c @@ -10,19 +10,18 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { +void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { unsigned int i, j, k; - PQCLEAN_KYBER512_CLEAN_polyvec_csubq(a); - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; + t[k] = a->vec[i].coeffs[4 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; } r[0] = (uint8_t)(t[0] >> 0); @@ -45,8 +44,7 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBY * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; uint16_t t[4]; @@ -72,9 +70,9 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void 
PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { +void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { unsigned int i; for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); @@ -128,18 +126,16 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements of a and b in NTT domain, accumulate into r, * and multiply by 2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { +void PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { unsigned int i; poly t; @@ -156,10 +152,10 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, * Name: PQCLEAN_KYBER512_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_polyvec_reduce(polyvec *r) { unsigned int i; @@ -168,29 +164,12 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_reduce(polyvec *r) { } } 
-/************************************************* -* Name: PQCLEAN_KYBER512_CLEAN_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_csubq(polyvec *r) { - unsigned int i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER512_CLEAN_polyvec_add * * Description: Add vectors of polynomials * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ diff --git a/crypto_kem/kyber512/clean/polyvec.h b/crypto_kem/kyber512/clean/polyvec.h index f26e149c..e0b2e13d 100644 --- a/crypto_kem/kyber512/clean/polyvec.h +++ b/crypto_kem/kyber512/clean/polyvec.h @@ -8,22 +8,18 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); +void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); +void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); void PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t 
a[KYBER_POLYVECBYTES]); void PQCLEAN_KYBER512_CLEAN_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER512_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER512_CLEAN_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER512_CLEAN_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER512_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber512/clean/reduce.c b/crypto_kem/kyber512/clean/reduce.c index c9afbdd0..917e7681 100644 --- a/crypto_kem/kyber512/clean/reduce.c +++ b/crypto_kem/kyber512/clean/reduce.c @@ -6,8 +6,7 @@ * Name: PQCLEAN_KYBER512_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes -* 16-bit integer congruent to a * R^-1 mod q, -* where R=2^16 +* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 * * Arguments: - int32_t a: input integer to be reduced; * has to be in {-q2^15,...,q2^15-1} @@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER512_CLEAN_montgomery_reduce(int32_t a) { * Name: PQCLEAN_KYBER512_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes -* 16-bit integer congruent to a mod q in {0,...,q} +* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} * * Arguments: - int16_t a: input integer to be reduced * -* Returns: integer in {0,...,q} congruent to a modulo q. +* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER512_CLEAN_barrett_reduce(int16_t a) { int16_t t; const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = (int32_t)v * a >> 26; + t = ((int32_t)v * a + (1 << 25)) >> 26; t *= KYBER_Q; return a - t; } - -/************************************************* -* Name: PQCLEAN_KYBER512_CLEAN_csubq -* -* Description: Conditionallly subtract q -* -* Arguments: - int16_t x: input integer -* -* Returns: a - q if a >= q, else a -**************************************************/ -int16_t PQCLEAN_KYBER512_CLEAN_csubq(int16_t a) { - a -= KYBER_Q; - a += (a >> 15) & KYBER_Q; - return a; -} diff --git a/crypto_kem/kyber512/clean/reduce.h b/crypto_kem/kyber512/clean/reduce.h index 27f68b34..c0668071 100644 --- a/crypto_kem/kyber512/clean/reduce.h +++ b/crypto_kem/kyber512/clean/reduce.h @@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER512_CLEAN_montgomery_reduce(int32_t a); int16_t PQCLEAN_KYBER512_CLEAN_barrett_reduce(int16_t a); -int16_t PQCLEAN_KYBER512_CLEAN_csubq(int16_t a); - #endif diff --git a/crypto_kem/kyber512/clean/symmetric-shake.c b/crypto_kem/kyber512/clean/symmetric-shake.c index 363b24b5..3cba4b3a 100644 --- a/crypto_kem/kyber512/clean/symmetric-shake.c +++ b/crypto_kem/kyber512/clean/symmetric-shake.c @@ -9,12 +9,10 @@ * * Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
* -* Arguments: - xof_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *seed: pointer to KYBER_SYMBYTES input -* to be absorbed into state -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input **************************************************/ void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(xof_state *state, const uint8_t seed[KYBER_SYMBYTES], @@ -26,8 +24,8 @@ void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(xof_state *state, for (i = 0; i < KYBER_SYMBYTES; i++) { extseed[i] = seed[i]; } - extseed[i++] = x; - extseed[i] = y; + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; shake128_absorb(state, extseed, sizeof(extseed)); } @@ -38,23 +36,19 @@ void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(xof_state *state, * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input * and then generates outlen bytes of SHAKE256 output * -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t *key: pointer to the key -* (of length KYBER_SYMBYTES) -* - uint8_t nonce: single-byte nonce (public PRF input) +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce) { +void PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { unsigned int i; uint8_t 
extkey[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey[i] = key[i]; } - extkey[i] = nonce; + extkey[KYBER_SYMBYTES] = nonce; shake256(out, outlen, extkey, sizeof(extkey)); } diff --git a/crypto_kem/kyber512/clean/symmetric.h b/crypto_kem/kyber512/clean/symmetric.h index d011c556..7b9e78ad 100644 --- a/crypto_kem/kyber512/clean/symmetric.h +++ b/crypto_kem/kyber512/clean/symmetric.h @@ -14,21 +14,16 @@ void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(xof_state *s, uint8_t x, uint8_t y); -void PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce); +void PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) #define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) diff --git a/crypto_kem/kyber768-90s/META.yml b/crypto_kem/kyber768-90s/META.yml index 2c1b4b5d..bc6d2775 100644 --- a/crypto_kem/kyber768-90s/META.yml +++ b/crypto_kem/kyber768-90s/META.yml @@ -6,7 +6,7 @@ length-public-key: 1184 length-ciphertext: 1088 length-secret-key: 2400 length-shared-secret: 32 -nistkat-sha256: 57fa080a0b2295044b128f1e4f7d978a7863ec6c99ebd6239fba747525a3d451 +nistkat-sha256: 
68bf2e3914c0b4e053cefc67dd9f10f567946da5720f0b453b347610c3cc2c0a principal-submitters: - Peter Schwabe auxiliary-submitters: @@ -21,9 +21,9 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/kyber768-90s/avx2/aes256ctr.c b/crypto_kem/kyber768-90s/avx2/aes256ctr.c index a8c70e91..957f1440 100644 --- a/crypto_kem/kyber768-90s/avx2/aes256ctr.c +++ b/crypto_kem/kyber768-90s/avx2/aes256ctr.c @@ -2,52 +2,48 @@ #include #include #include -/* - Based heavily on public-domain code by Romain Dolbeau - Different handling of nonce+counter than original version - using separated 64-bit nonce and internal 64-bit counter, starting from zero - Public Domain -*/ +/* Based heavily on public-domain code by Romain Dolbeau + * Different handling of nonce+counter than original version using + * separated 64-bit nonce and internal 64-bit counter, starting from zero + * Public Domain */ -static inline void aesni_encrypt4(uint8_t out[64], - __m128i *n, - const __m128i rkeys[16]) { - __m128i f, f0, f1, f2, f3, t; +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { + __m128i f, f0, f1, f2, f3; + const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); /* Load 
current counter value */ f = _mm_load_si128(n); /* Increase counter in 4 consecutive blocks */ - t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); - f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); - f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); - f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); + f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); /* Write counter for next iteration, increased by 4 */ _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); /* Actual AES encryption, 4x interleaved */ - t = _mm_load_si128(&rkeys[0]); - f0 = _mm_xor_si128(f0, t); - f1 = _mm_xor_si128(f1, t); - f2 = _mm_xor_si128(f2, t); - f3 = _mm_xor_si128(f3, t); + f = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, f); + f1 = _mm_xor_si128(f1, f); + f2 = _mm_xor_si128(f2, f); + f3 = _mm_xor_si128(f3, f); for (int i = 1; i < 14; i++) { - t = _mm_load_si128(&rkeys[i]); - f0 = _mm_aesenc_si128(f0, t); - f1 = _mm_aesenc_si128(f1, t); - f2 = _mm_aesenc_si128(f2, t); - f3 = _mm_aesenc_si128(f3, t); + f = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, f); + f1 = _mm_aesenc_si128(f1, f); + f2 = _mm_aesenc_si128(f2, f); + f3 = _mm_aesenc_si128(f3, f); } - t = _mm_load_si128(&rkeys[14]); - f0 = _mm_aesenclast_si128(f0, t); - f1 = _mm_aesenclast_si128(f1, t); - f2 = _mm_aesenclast_si128(f2, t); - f3 = _mm_aesenclast_si128(f3, t); + f = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, f); + f1 = _mm_aesenclast_si128(f1, f); + f2 = _mm_aesenclast_si128(f2, f); + f3 = _mm_aesenclast_si128(f3, f); /* Write results */ _mm_storeu_si128((__m128i *)(out + 0), f0); @@ -134,6 +130,7 @@ void 
PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(uint8_t *out, while (outlen >= 64) { aesni_encrypt4(out, &state.n, state.rkeys); outlen -= 64; + out += 64; } if (outlen) { diff --git a/crypto_kem/kyber768-90s/avx2/align.h b/crypto_kem/kyber768-90s/avx2/align.h index 50e576f7..3a74f591 100644 --- a/crypto_kem/kyber768-90s/avx2/align.h +++ b/crypto_kem/kyber768-90s/avx2/align.h @@ -2,22 +2,18 @@ #define PQCLEAN_KYBER76890S_AVX2_ALIGN_H #include +#include -#define ALIGN16_TYPE(t) \ - union { \ - __m128i vec; \ - t orig; \ +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[(N)]; \ + __m256i vec[((N)+31)/32]; \ } -#define ALIGN32_ARRAY(t, s) \ - union { \ - __m256i vec; \ - t arr[(s)]; \ +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[(N)]; \ + __m256i vec[((N)+15)/16]; \ } -#define ALIGN32_ARRAY_2D(t, n, m) \ - union { \ - __m256i vec; \ - t arr[(n)][(m)]; \ - } #endif diff --git a/crypto_kem/kyber768-90s/avx2/basemul.S b/crypto_kem/kyber768-90s/avx2/basemul.S index 99296569..7cbff8a8 100644 --- a/crypto_kem/kyber768-90s/avx2/basemul.S +++ b/crypto_kem/kyber768-90s/avx2/basemul.S @@ -1,232 +1,107 @@ #include "cdecl.h" -#include "params.h" -.macro schoolbook off,sign -#load -vmovdqa \off+32(%rsi),%ymm7 # b -vmovdqa \off+32(%rdx),%ymm8 # d -vmovdqa \off(%rsi),%ymm9 # a -vmovdqa \off(%rdx),%ymm10 # c +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 -#mul -vpmullw %ymm7,%ymm8,%ymm11 # bd.lo -vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi -vpmullw %ymm7,%ymm10,%ymm13 # bc.lo -vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi -vpmullw %ymm9,%ymm8,%ymm14 # ad.lo -vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi -vpmullw %ymm9,%ymm10,%ymm15 # ac.lo -vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo -#reduce -vpmullw 
%ymm1,%ymm11,%ymm11 -vpmulhw %ymm0,%ymm11,%ymm11 -vpsubw %ymm11,%ymm12,%ymm11 # bd +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 -#mul -vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo -vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi -#unpack -vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 -vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 -vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 -vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 -vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 -vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 -vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 -vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 -#add -.ifeq \sign -vpaddd %ymm14,%ymm15,%ymm14 # x0 -vpaddd %ymm9,%ymm10,%ymm9 # x1 -.else -vpsubd %ymm15,%ymm14,%ymm14 # x0 -vpsubd %ymm10,%ymm9,%ymm9 # x1 -.endif -vpaddd %ymm12,%ymm13,%ymm12 # y0 -vpaddd %ymm7,%ymm8,%ymm7 # y1 -.endm +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi -.macro red a0,a1,b0,b1,x,y,z -#pack -vpxor %ymm\x,%ymm\x,%ymm\x -vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z -vpsrld $16,%ymm\a0,%ymm\a0 -vpsrld $16,%ymm\a1,%ymm\a1 -vpackusdw %ymm\z,%ymm\y,%ymm\z -vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 -vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x -vpsrld $16,%ymm\b0,%ymm\b0 -vpsrld $16,%ymm\b1,%ymm\b1 -vpackusdw %ymm\x,%ymm\y,%ymm\y -vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 +vmovdqa %ymm13,(%rsp) -#reduce -vpmullw %ymm1,%ymm\z,%ymm\z -vpmullw %ymm1,%ymm\y,%ymm\y -vpmulhw %ymm0,%ymm\z,%ymm\z -vpmulhw %ymm0,%ymm\y,%ymm\y -vpsubw %ymm\z,%ymm\a0,%ymm\a0 -vpsubw %ymm\y,%ymm\b0,%ymm\b0 +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # 
b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 +vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 +vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -basemul64_acc_avx: -poly0.0: -schoolbook 0,0 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.0: -schoolbook 512,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.0: -schoolbook 1024,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm5,32(%rdi) - -poly0.1: -schoolbook 64,1 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.1: -schoolbook 576,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd 
%ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.1: -schoolbook 1088,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm5,96(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx) -.global _cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx) -cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx): -_cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 - -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -ret - -basemul64_avx: -schoolbook 0,0 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,(%rdi) -vmovdqa %ymm12,32(%rdi) - -schoolbook 64,1 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,64(%rdi) -vmovdqa %ymm12,96(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_avx) .global _cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_avx) cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_avx): _cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_avx +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 1 -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $192*2,%r9 +schoolbook 2 -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx 
-call basemul64_avx +add $32*2,%r9 +schoolbook 3 +mov %r8,%rsp ret diff --git a/crypto_kem/kyber768-90s/avx2/cbd.c b/crypto_kem/kyber768-90s/avx2/cbd.c index a69a2eda..09b47b24 100644 --- a/crypto_kem/kyber768-90s/avx2/cbd.c +++ b/crypto_kem/kyber768-90s/avx2/cbd.c @@ -4,66 +4,64 @@ #include /************************************************* -* Name: PQCLEAN_KYBER76890S_AVX2_cbd +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *buf: pointer to input byte array +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { +static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { unsigned int i; - __m256i vec0, vec1, vec2, vec3, tmp; + __m256i f0, f1, f2, f3; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); for (i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); + f0 = _mm256_load_si256(&buf[i]); - vec1 = _mm256_srli_epi32(vec0, 1); - vec0 = _mm256_and_si256(mask55, vec0); - vec1 = _mm256_and_si256(mask55, vec1); - vec0 = _mm256_add_epi32(vec0, vec1); + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); - vec1 = _mm256_srli_epi32(vec0, 2); - vec0 = _mm256_and_si256(mask33, vec0); - vec1 = _mm256_and_si256(mask33, vec1); + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = 
_mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); - vec2 = _mm256_srli_epi32(vec0, 4); - vec3 = _mm256_srli_epi32(vec1, 4); - vec0 = _mm256_and_si256(mask03, vec0); - vec1 = _mm256_and_si256(mask03, vec1); - vec2 = _mm256_and_si256(mask03, vec2); - vec3 = _mm256_and_si256(mask03, vec3); + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); - vec1 = _mm256_sub_epi8(vec0, vec1); - vec3 = _mm256_sub_epi8(vec2, vec3); + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); - vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); - vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); - vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); - vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); - tmp = _mm256_unpacklo_epi16(vec0, vec2); - vec2 = _mm256_unpackhi_epi16(vec0, vec2); - vec0 = tmp; - tmp = _mm256_unpacklo_epi16(vec1, vec3); - vec3 = _mm256_unpackhi_epi16(vec1, vec3); - vec1 = tmp; - - tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); - vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); - vec0 = tmp; - tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); - vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); - vec1 = tmp; - - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); + _mm256_store_si256(&r->vec[4 * i + 0], f0); + _mm256_store_si256(&r->vec[4 * i + 1], f2); + _mm256_store_si256(&r->vec[4 * i + 2], f1); + 
_mm256_store_si256(&r->vec[4 * i + 3], f3); } } + + +/* buf 32 bytes longer for cbd3 */ +void PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber768-90s/avx2/cbd.h b/crypto_kem/kyber768-90s/avx2/cbd.h index 6a300bf7..806b6177 100644 --- a/crypto_kem/kyber768-90s/avx2/cbd.h +++ b/crypto_kem/kyber768-90s/avx2/cbd.h @@ -2,8 +2,11 @@ #define PQCLEAN_KYBER76890S_AVX2_CBD_H #include "params.h" #include "poly.h" +#include #include -void PQCLEAN_KYBER76890S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); + +void PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); #endif diff --git a/crypto_kem/kyber768-90s/avx2/cdecl.h b/crypto_kem/kyber768-90s/avx2/cdecl.h index 40500f53..bebb78c3 100644 --- a/crypto_kem/kyber768-90s/avx2/cdecl.h +++ b/crypto_kem/kyber768-90s/avx2/cdecl.h @@ -1,6 +1,8 @@ #ifndef PQCLEAN_KYBER76890S_AVX2_CDECL_H #define PQCLEAN_KYBER76890S_AVX2_CDECL_H + + #define _16XQ 0 #define _16XQINV 16 #define _16XV 32 @@ -9,9 +11,10 @@ #define _16XMONTSQLO 80 #define _16XMONTSQHI 96 #define _16XMASK 112 -#define _ZETAS_EXP 128 -#define _ZETAS_INV_EXP 528 - +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 /* The C ABI on MacOS exports all symbols with a leading * underscore. 
This means that any symbols we refer to from @@ -23,4 +26,5 @@ #define _cdecl(s) _##s #define cdecl(s) s + #endif diff --git a/crypto_kem/kyber768-90s/avx2/consts.c b/crypto_kem/kyber768-90s/avx2/consts.c index 7a8e798b..167cff84 100644 --- a/crypto_kem/kyber768-90s/avx2/consts.c +++ b/crypto_kem/kyber768-90s/avx2/consts.c @@ -1,155 +1,123 @@ +#include "align.h" #include "consts.h" #include "params.h" -#include + #define Q KYBER_Q -#define MONT ((1U << 16) % Q) -#define QINV 62209 // q^-1 mod 2^16 -#define V (((1U << 26) + Q/2)/Q) -#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) -#define FLO (FHI*QINV % 65536) -#define MONTSQHI (MONT*MONT % Q) -#define MONTSQLO (MONTSQHI*QINV % 65536) +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO (-10079) // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI #define MASK 4095 +#define SHIFT 32 - -const qdata_t PQCLEAN_KYBER76890S_AVX2_qdata = {.as_arr = { -#define _16XQ 0 +const qdata_t PQCLEAN_KYBER76890S_AVX2_qdata = {.coeffs = { +//#define _16XQ 0 Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, -#define _16XQINV 16 +//#define _16XQINV 16 QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, -#define _16XV 32 +//#define _16XV 32 V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, -#define _16XFLO 48 +//#define _16XFLO 48 FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, -#define _16XFHI 64 +//#define _16XFHI 64 FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, -#define _16XMONTSQLO 80 +//#define _16XMONTSQLO 80 MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, -#define _16XMONTSQHI 96 +//#define _16XMONTSQHI 96 MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, 
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, -#define _16XMASK 112 +//#define _16XMASK 112 MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, -#define _ZETAS_EXP 128 - 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, - 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, - 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, - 3158, 3158, 3158, 3158, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, - 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, - 573, 573, 2004, 2004, 264, 264, 383, 383, - 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, - 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, - 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, - 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, - 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, - 2226, 555, 2078, 1550, 422, 177, 3038, 1574, - 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, - 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, - 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, - 430, 843, 871, 105, 587, 3094, 2869, 1653, - 778, 3182, 1483, 1119, 644, 349, 329, 3254, - 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, - 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, - 48842, 48842, 48842, 48842, 287, 287, 287, 287, - 287, 287, 287, 287, 202, 202, 202, 202, - 202, 202, 202, 202, 10690, 10690, 10690, 10690, - 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, - 31164, 31164, 31164, 31164, 962, 962, 962, 962, - 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, - 1468, 1468, 
1468, 1468, 37464, 37464, 24313, 24313, - 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, - 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, - 732, 732, 608, 608, 1787, 1787, 411, 411, - 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, - 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, - 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, - 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, - 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, - 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, - 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, - 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, - 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, - 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, - 3193, 1994, 220, 1670, 1799, 794, 2475, 478, - 3021, 991, 1869, 1628, 0, 0, 0, 0, +//#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, -#define _ZETAS_INV_EXP 528 - 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, - 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, - 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, - 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, - 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, - 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, - 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, - 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, - 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, - 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, - 951, 247, 1421, 3222, 2499, 271, 90, 853, - 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, - 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, - 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, - 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, - 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, - 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, - 1861, 1861, 1861, 1861, 1474, 
1474, 1474, 1474, - 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, - 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, - 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, - 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, - 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, - 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, - 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, - 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, - 2210, 1846, 147, 2551, 1676, 460, 235, 2742, - 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, - 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, - 45043, 32227, 11478, 335, 156, 2911, 872, 1590, - 602, 777, 2170, 246, 1755, 291, 3152, 2907, - 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, - 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, - 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, - 666, 320, 8, 2813, 1544, 282, 1838, 1293, - 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, - 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, - 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, - 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, - 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, - 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, - 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, - 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, - 171, 171, 171, 171, 12403, 12403, 12403, 12403, - 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, - 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, - 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, - 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, - 60300, 60300, 1932, 1932, 0, 0, 0, 0 +//#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +//#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, 
-359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, 
-28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +//#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT } }; diff --git a/crypto_kem/kyber768-90s/avx2/consts.h b/crypto_kem/kyber768-90s/avx2/consts.h index 3bcce40b..9f415363 100644 --- a/crypto_kem/kyber768-90s/avx2/consts.h +++ b/crypto_kem/kyber768-90s/avx2/consts.h @@ -1,19 +1,10 @@ #ifndef PQCLEAN_KYBER76890S_AVX2_CONSTS_H #define PQCLEAN_KYBER76890S_AVX2_CONSTS_H +#include "align.h" #include "cdecl.h" -#include "params.h" -#include -#include -#define ALIGNED_UINT16_T(N) \ - union { \ - __m256i as_vec; \ - uint16_t as_arr[(N)]; \ - } - -typedef ALIGNED_UINT16_T(928) qdata_t; - +typedef ALIGNED_INT16(640) qdata_t; extern const qdata_t PQCLEAN_KYBER76890S_AVX2_qdata; #endif diff --git a/crypto_kem/kyber768-90s/avx2/fq.S b/crypto_kem/kyber768-90s/avx2/fq.S index ccada6de..55789344 100644 --- a/crypto_kem/kyber768-90s/avx2/fq.S +++ b/crypto_kem/kyber768-90s/avx2/fq.S @@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2,10 -red16 3,11 -red16 4,12 -red16 5,13 -red16 6,14 -red16 
7,15 -red16 8,10 -red16 9,11 +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 #store vmovdqa %ymm2,(%rdi) @@ -46,49 +46,6 @@ add $256,%rdi call reduce128_avx ret -csubq128_avx: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1,9 -csubq 2,10 -csubq 3,11 -csubq 4,12 -csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER76890S_AVX2_csubq_avx) -.global _cdecl(PQCLEAN_KYBER76890S_AVX2_csubq_avx) -cdecl(PQCLEAN_KYBER76890S_AVX2_csubq_avx): -_cdecl(PQCLEAN_KYBER76890S_AVX2_csubq_avx): -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -call csubq128_avx -add $256,%rdi -call csubq128_avx -ret - tomont128_avx: #load vmovdqa (%rdi),%ymm3 diff --git a/crypto_kem/kyber768-90s/avx2/fq.inc b/crypto_kem/kyber768-90s/avx2/fq.inc index 75df098a..4b7afc31 100644 --- a/crypto_kem/kyber768-90s/avx2/fq.inc +++ b/crypto_kem/kyber768-90s/avx2/fq.inc @@ -1,6 +1,10 @@ -.macro red16 r,x=12 +.macro red16 r,rs=0,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else vpsraw $10,%ymm\x,%ymm\x +.endif vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm @@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r -#vpcmpgtw %ymm0,%ymm\r,%ymm\x -#vpand %ymm0,%ymm\x,%ymm\x -#vpsubw %ymm\x,%ymm\r,%ymm\r .endm .macro caddq r,x=12 diff --git a/crypto_kem/kyber768-90s/avx2/indcpa.c b/crypto_kem/kyber768-90s/avx2/indcpa.c index d189d841..317c6635 100644 --- a/crypto_kem/kyber768-90s/avx2/indcpa.c +++ b/crypto_kem/kyber768-90s/avx2/indcpa.c @@ -8,6 +8,7 @@ #include "randombytes.h" #include "rejsample.h" 
#include "symmetric.h" +#include #include #include @@ -15,11 +16,14 @@ * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* serialized vector of polynomials pk -* and the public seed used to generate the matrix A. +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(). * -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, /************************************************* * Name: pack_sk * -* Description: Serialize the secret key +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(). 
* -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials -* (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(sk, packedsk); } @@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, * * Description: Serialize the ciphertext as concatenation of the * compressed and serialized vector of polynomials b -* and the compressed and serialized polynomial v +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by PQCLEAN_KYBER76890S_AVX2_polyvec_reduce() and PQCLEAN_KYBER76890S_AVX2_poly_reduce(), respectively. 
* * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER76890S_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER76890S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER76890S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit 
integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -165,12 +169,11 @@ static unsigned int rej_uniform(int16_t *r, * - const uint8_t *seed: pointer to input seed * - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) -void PQCLEAN_KYBER76890S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; - ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; +void PQCLEAN_KYBER76890S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint64_t nonce = 0; + ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES) buf; aes256ctr_ctx state; PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, seed, 0); @@ -178,19 +181,24 @@ void PQCLEAN_KYBER76890S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SY for (i = 0; i < KYBER_K; i++) { for (j 
= 0; j < KYBER_K; j++) { if (transposed) { - nonce.orig = (j << 8) | i; + nonce = (j << 8) | i; } else { - nonce.orig = (i << 8) | j; + nonce = (i << 8) | j; } - state.n = _mm_loadl_epi64(&nonce.vec); - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); - ctr = PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); + buflen = REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES; + ctr = PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.coeffs); while (ctr < KYBER_N) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf.coeffs[k] = buf.coeffs[buflen - off + k]; + } + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.coeffs + off, 1, &state); + buflen = off + AES256CTR_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.coeffs, buflen); } PQCLEAN_KYBER76890S_AVX2_poly_nttunpack(&a[i].vec[j]); @@ -212,39 +220,41 @@ void PQCLEAN_KYBER76890S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SY void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; - const uint8_t *publicseed = buf.arr; - const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + uint8_t buf[2 * KYBER_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf.arr, KYBER_SYMBYTES); - hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); + hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; +#define NOISE_NBLOCKS 
((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ + uint64_t nonce = 0; + ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) coins; // +32 bytes as required by PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1 aes256ctr_ctx state; - ALIGN32_ARRAY(uint8_t, 128) coins; - PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, noiseseed, nonce++); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER76890S_AVX2_cbd(&skpv.vec[i], coins.arr); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(&skpv.vec[i], coins.vec); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER76890S_AVX2_cbd(&e.vec[i], coins.arr); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(&e.vec[i], coins.vec); } PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&skpv); + PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(&skpv); PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&e); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER76890S_AVX2_poly_tomont(&pkpv.vec[i]); } @@ -261,70 +271,70 @@ void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBY * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER76890S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + uint8_t seed[KYBER_SYMBYTES]; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; - unpack_pk(&pkpv, seed.arr, pk); + unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER76890S_AVX2_poly_frommsg(&k, m); - gen_at(at, seed.arr); + gen_at(at, seed); - ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; +#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ +#define CIPHERTEXTNOISE_NBLOCKS ((KYBER_ETA2*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */ + uint64_t nonce = 0; + ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) buf; /* +32 bytes as required by PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1 */ aes256ctr_ctx state; - ALIGN32_ARRAY(uint8_t, 128) buf; - PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); + 
PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, coins, nonce++); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER76890S_AVX2_cbd(&sp.vec[i], buf.arr); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, NOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(&sp.vec[i], buf.vec); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER76890S_AVX2_cbd(&ep.vec[i], buf.arr); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2(&ep.vec[i], buf.vec); } - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); - state.n = _mm_loadl_epi64(&nonce.vec); - nonce.orig++; - PQCLEAN_KYBER76890S_AVX2_cbd(&epp, buf.arr); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state); + state.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce += 1; + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2(&epp, buf.vec); PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } + PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - - PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(&b); PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(&v); - PQCLEAN_KYBER76890S_AVX2_polyvec_add(&bp, &bp, &ep); + 
PQCLEAN_KYBER76890S_AVX2_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER76890S_AVX2_poly_add(&v, &v, &epp); PQCLEAN_KYBER76890S_AVX2_poly_add(&v, &v, &k); - PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(&bp); + PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(&b); PQCLEAN_KYBER76890S_AVX2_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -333,24 +343,24 @@ void PQCLEAN_KYBER76890S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER76890S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&b); + PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER76890S_AVX2_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber768-90s/avx2/invntt.S b/crypto_kem/kyber768-90s/avx2/invntt.S index 42e82545..53e2d8dd 100644 --- a/crypto_kem/kyber768-90s/avx2/invntt.S +++ b/crypto_kem/kyber768-90s/avx2/invntt.S @@ -2,22 +2,21 @@ .include 
"shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm15,%ymm\rh3 vpmulhw %ymm\zh0,%ymm12,%ymm12 @@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 vpmulhw %ymm\zh1,%ymm14,%ymm14 vpmulhw %ymm\zh1,%ymm15,%ymm15 -#reduce vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.text -invntt_levels0t5_avx: -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 -butterfly 4,5,8,9,6,7,10,11,15,3,1,2 +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 
-level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 -butterfly 4,5,6,7,8,9,10,11,3,3,2,2 +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 +vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 shuffle1 4,5,3,5 shuffle1 6,7,4,7 shuffle1 8,9,6,9 shuffle1 10,11,8,11 -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 -#consts -vmovdqa _16XV*2(%rdx),%ymm1 - -butterfly 3,4,6,8,5,7,9,11,10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 +vmovdqa _16XV*2(%rsi),%ymm1 red16 3 shuffle2 3,4,10,4 @@ -87,26 +110,22 @@ shuffle2 6,8,3,8 shuffle2 5,7,6,7 shuffle2 9,11,5,11 -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 -butterfly 10,3,6,5,4,8,7,11,9,9,2,2 - -red16 10 +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 shuffle4 10,3,9,3 shuffle4 6,5,10,5 shuffle4 4,8,6,8 shuffle4 7,11,4,11 -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 +/* 
level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 -butterfly 9,10,6,4,3,5,8,11,7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,2,2,7,7 red16 9 @@ -115,113 +134,62 @@ shuffle8 6,4,9,4 shuffle8 3,5,6,5 shuffle8 8,11,3,11 -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 +/* level5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 -butterfly 7,9,6,3,10,4,5,11,8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 -red16 7 +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 -ret - -invntt_level6_avx: -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 butterfly 4,5,6,7,8,9,10,11 -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 +.if \off == 0 +red16 4 +.endif -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa 
%ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa %ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm +.text .global cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx) .global _cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx) cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx): _cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_INV_EXP*2,%rsi -call invntt_levels0t5_avx -add $256,%rdi -add $392,%rsi -call invntt_levels0t5_avx -sub $256,%rdi -add $392,%rsi -call invntt_level6_avx + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 ret diff --git a/crypto_kem/kyber768-90s/avx2/kem.c b/crypto_kem/kyber768-90s/avx2/kem.c index 04a3b412..460ccbbb 100644 --- a/crypto_kem/kyber768-90s/avx2/kem.c +++ b/crypto_kem/kyber768-90s/avx2/kem.c @@ -1,4 +1,3 @@ -#include "align.h" #include "indcpa.h" #include "kem.h" #include "params.h" @@ -15,13 +14,14 @@ 
* for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -40,36 +40,36 @@ int PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 
2 * KYBER_SYMBYTES) kr; + uint8_t kr[2 * KYBER_SYMBYTES]; - randombytes(buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); /* Don't release system RNG output */ - hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); + hash_h(buf, buf, KYBER_SYMBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER76890S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER76890S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } @@ -80,47 +80,47 @@ int PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER76890S_AVX2_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; + uint8_t kr[2 * KYBER_SYMBYTES]; + ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER76890S_AVX2_indcpa_dec(buf.arr, ct, sk); + PQCLEAN_KYBER76890S_AVX2_indcpa_dec(buf, ct, sk); /* Multitarget countermeasure for coins + contributory KEM */ for (i = 0; i < KYBER_SYMBYTES; i++) { - buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER76890S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER76890S_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); - fail = PQCLEAN_KYBER76890S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); + fail = PQCLEAN_KYBER76890S_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* Overwrite pre-k with z on re-encryption failure */ - PQCLEAN_KYBER76890S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); + PQCLEAN_KYBER76890S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* hash concatenation of pre-k and H(c) to k */ - 
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber768-90s/avx2/ntt.S b/crypto_kem/kyber768-90s/avx2/ntt.S index 1f62b2a6..41a54aa9 100644 --- a/crypto_kem/kyber768-90s/avx2/ntt.S +++ b/crypto_kem/kyber768-90s/avx2/ntt.S @@ -1,222 +1,191 @@ #include "cdecl.h" .include "shuffle.inc" -.include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 -#mul +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + vpmullw %ymm\zl1,%ymm\rh2,%ymm14 vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 .endm -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce +.macro reduce vpmulhw %ymm0,%ymm12,%ymm12 vpmulhw %ymm0,%ymm13,%ymm13 + vpmulhw %ymm0,%ymm14,%ymm14 vpmulhw %ymm0,%ymm15,%ymm15 +.endm -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 -#update +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 
16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -ntt_level0_avx: -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -ntt_levels1t6_avx: -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11,3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11,7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11,9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11,10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 
-vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11,6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 - -vmovdqa _16XV*2(%rdx),%ymm1 -red16 10,12 -red16 5,13 -red16 9,14 -red16 4,15 -red16 8,2 -red16 3,6 -red16 7,12 -red16 11,13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER76890S_AVX2_ntt_avx) .global _cdecl(PQCLEAN_KYBER76890S_AVX2_ntt_avx) cdecl(PQCLEAN_KYBER76890S_AVX2_ntt_avx): _cdecl(PQCLEAN_KYBER76890S_AVX2_ntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_EXP*2,%rsi -call ntt_level0_avx -add $128,%rdi -call ntt_level0_avx -sub $128,%rdi -add $8,%rsi -call ntt_levels1t6_avx -add $256,%rdi -add $392,%rsi -call ntt_levels1t6_avx + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + ret diff --git a/crypto_kem/kyber768-90s/avx2/ntt.h b/crypto_kem/kyber768-90s/avx2/ntt.h index 2148419b..291d5b0a 100644 --- a/crypto_kem/kyber768-90s/avx2/ntt.h +++ b/crypto_kem/kyber768-90s/avx2/ntt.h @@ -1,24 +1,21 @@ #ifndef PQCLEAN_KYBER76890S_AVX2_NTT_H #define PQCLEAN_KYBER76890S_AVX2_NTT_H -#include "consts.h" + +#include #include -void PQCLEAN_KYBER76890S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); -void PQCLEAN_KYBER76890S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); -void PQCLEAN_KYBER76890S_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); -void 
PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); -void PQCLEAN_KYBER76890S_AVX2_basemul_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); -void PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_basemul_avx(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); -void PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); -void PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768-90s/avx2/params.h b/crypto_kem/kyber768-90s/avx2/params.h index 72a2e0fe..be92ce0c 100644 --- a/crypto_kem/kyber768-90s/avx2/params.h +++ b/crypto_kem/kyber768-90s/avx2/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,9 +14,12 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) +#define KYBER_ETA2 2 + #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES 
+ KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) diff --git a/crypto_kem/kyber768-90s/avx2/poly.c b/crypto_kem/kyber768-90s/avx2/poly.c index db0b0ff2..edb305ca 100644 --- a/crypto_kem/kyber768-90s/avx2/poly.c +++ b/crypto_kem/kyber768-90s/avx2/poly.c @@ -12,63 +12,89 @@ /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_poly_compress * -* Description: Compression and subsequent serialization of a polynomial +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER76890S_AVX2_poly_reduce(). * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { - unsigned int i, j; - uint8_t t[8]; - - PQCLEAN_KYBER76890S_AVX2_poly_csubq(a); - - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; - } - - r[0] = t[0] | (t[1] << 4); - r[1] = t[2] | (t[3] << 4); - r[2] = t[4] | (t[5] << 4); - r[3] = t[6] | (t[7] << 4); - r += 4; - } -} - -/************************************************* -* Name: PQCLEAN_KYBER76890S_AVX2_poly_decompress -* -* Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of PQCLEAN_KYBER76890S_AVX2_poly_compress -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array -* (of length KYBER_POLYCOMPRESSEDBYTES bytes) -**************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *restrict r, - const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { 
+void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t r[128], const poly *restrict a) { unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_qdata.vec[_16XV / 16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 9); + const __m256i mask = _mm256_set1_epi16(15); + const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1); + const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - for (i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; - r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; - a += 1; + for (i = 0; i < KYBER_N / 64; i++) { + f0 = _mm256_load_si256(&a->vec[4 * i + 0]); + f1 = _mm256_load_si256(&a->vec[4 * i + 1]); + f2 = _mm256_load_si256(&a->vec[4 * i + 2]); + f3 = _mm256_load_si256(&a->vec[4 * i + 3]); + f0 = _mm256_mulhi_epi16(f0, v); + f1 = _mm256_mulhi_epi16(f1, v); + f2 = _mm256_mulhi_epi16(f2, v); + f3 = _mm256_mulhi_epi16(f3, v); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f1 = _mm256_mulhrs_epi16(f1, shift1); + f2 = _mm256_mulhrs_epi16(f2, shift1); + f3 = _mm256_mulhrs_epi16(f3, shift1); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + f2 = _mm256_and_si256(f2, mask); + f3 = _mm256_and_si256(f3, mask); + f0 = _mm256_packus_epi16(f0, f1); + f2 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift2); + f2 = _mm256_maddubs_epi16(f2, shift2); + f0 = _mm256_packus_epi16(f0, f2); + f0 = _mm256_permutevar8x32_epi32(f0, permdidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); } } +void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *restrict r, const uint8_t a[128]) { + unsigned int i; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, + 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); + const __m256i mask = 
_mm256_set1_epi32(0x00F0000F); + const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); + + for (i = 0; i < KYBER_N / 16; i++) { + t = _mm_loadl_epi64((__m128i *)&a[8 * i]); + f = _mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_and_si256(f, mask); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_poly_tobytes * -* Description: Serialization of a polynomial +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER76890S_AVX2_poly_reduce(). The coefficients are orderd as output by +* PQCLEAN_KYBER76890S_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed +* order. * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { - PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* @@ -77,12 +103,12 @@ void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER76890S_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t 
a[KYBER_POLYBYTES]) { - PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* @@ -90,11 +116,10 @@ void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLY * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *restrict r, - const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { +void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); @@ -123,12 +148,12 @@ void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *restrict r, g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + _mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ + _mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ + _mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ + _mm256_store_si256(&r->vec[8+2*(i)+1],g3) - f = _mm256_load_si256((__m256i *)msg); + f = _mm256_loadu_si256((__m256i *)msg); FROMMSG64(0); FROMMSG64(1); FROMMSG64(2); @@ -138,32 +163,34 @@ void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *restrict r, /************************************************* 
* Name: PQCLEAN_KYBER76890S_AVX2_poly_tomsg * -* Description: Convert polynomial to 32-byte message +* Description: Convert polynomial to 32-byte message. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER76890S_AVX2_poly_reduce(). * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { unsigned int i; uint32_t small; __m256i f0, f1, g0, g1; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); for (i = 0; i < KYBER_N / 32; i++) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); - f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); - f0 = _mm256_sub_epi16(hqs, f0); - f1 = _mm256_sub_epi16(hqs, f1); + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); g0 = _mm256_srai_epi16(f0, 15); g1 = _mm256_srai_epi16(f1, 15); f0 = _mm256_xor_si256(f0, g0); f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(hhqs, f0); - f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); f0 = _mm256_packs_epi16(f0, f1); small = _mm256_movemask_epi8(f0); - small = ~small; msg[4 * i + 0] = small; msg[4 * i + 1] = small >> 16; msg[4 * i + 2] = small >> 8; @@ -172,21 +199,39 @@ void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], pol } /************************************************* -* Name: PQCLEAN_KYBER76890S_AVX2_poly_getnoise +* Name: PQCLEAN_KYBER76890S_AVX2_poly_getnoise_eta1 * * Description: 
Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; - prf(buf.arr, sizeof(buf.arr), seed, nonce); - PQCLEAN_KYBER76890S_AVX2_cbd(r, buf.arr); +void PQCLEAN_KYBER76890S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1 + prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta1(r, buf.vec); +} + +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER76890S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; + prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER76890S_AVX2_poly_cbd_eta2(r, buf.vec); } @@ -194,13 +239,17 @@ void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t 
seed[KYBER_SY * Name: PQCLEAN_KYBER76890S_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of -* a polynomial in place; -* inputs assumed to be in normal order, output in bitreversed order +* a polynomial in place. +* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. * -* Arguments: - uint16_t *r: pointer to in/output polynomial +* Arguments: - poly *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER76890S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* @@ -208,29 +257,35 @@ void PQCLEAN_KYBER76890S_AVX2_poly_ntt(poly *r) { * * Description: Computes inverse of negacyclic number-theoretic transform (NTT) * of a polynomial in place; -* inputs assumed to be in bitreversed order, output in normal order +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. 
* -* Arguments: - uint16_t *a: pointer to in/output polynomial +* Arguments: - poly *a: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(poly *r) { - PQCLEAN_KYBER76890S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } void PQCLEAN_KYBER76890S_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery * -* Description: Multiplication of two polynomials in NTT domain +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* @@ -242,7 +297,7 @@ void PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, co * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_tomont(poly *r) { - PQCLEAN_KYBER76890S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* @@ -254,28 +309,16 @@ void PQCLEAN_KYBER76890S_AVX2_poly_tomont(poly *r) { * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER76890S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); -} - -/************************************************* -* Name: PQCLEAN_KYBER76890S_AVX2_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER76890S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); + PQCLEAN_KYBER76890S_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER76890S_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials. No modular reduction +* is performed. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -283,20 +326,21 @@ void PQCLEAN_KYBER76890S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials. No modular reduction +* is performed. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -304,10 +348,10 @@ void PQCLEAN_KYBER76890S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } diff --git a/crypto_kem/kyber768-90s/avx2/poly.h b/crypto_kem/kyber768-90s/avx2/poly.h index 03d799be..f28d7d10 100644 --- a/crypto_kem/kyber768-90s/avx2/poly.h +++ b/crypto_kem/kyber768-90s/avx2/poly.h @@ -1,19 +1,13 @@ #ifndef PQCLEAN_KYBER76890S_AVX2_POLY_H #define PQCLEAN_KYBER76890S_AVX2_POLY_H +#include "align.h" #include "params.h" #include #include -/* - * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial - * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] - */ -typedef union { - __m256i dummy; - int16_t coeffs[KYBER_N]; -} poly; +typedef ALIGNED_INT16(KYBER_N) poly; -void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); @@ -22,7 +16,11 @@ void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLY void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER76890S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER76890S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + + void PQCLEAN_KYBER76890S_AVX2_poly_ntt(poly *r); void PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(poly *r); @@ -31,7 +29,6 @@ void PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, co void PQCLEAN_KYBER76890S_AVX2_poly_tomont(poly *r); void PQCLEAN_KYBER76890S_AVX2_poly_reduce(poly *r); -void PQCLEAN_KYBER76890S_AVX2_poly_csubq(poly *r); void PQCLEAN_KYBER76890S_AVX2_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER76890S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber768-90s/avx2/polyvec.c b/crypto_kem/kyber768-90s/avx2/polyvec.c index 86af2e57..3357b6e9 100644 --- a/crypto_kem/kyber768-90s/avx2/polyvec.c +++ b/crypto_kem/kyber768-90s/avx2/polyvec.c @@ -3,8 +3,76 @@ #include "params.h" #include "poly.h" #include "polyvec.h" +#include #include +static void poly_compress10(uint8_t r[320], 
const poly *restrict a) { + size_t i; + uint32_t low; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_qdata.vec[_16XV / 16]); + const __m256i v8 = _mm256_slli_epi16(v, 3); + const __m256i off = _mm256_set1_epi16(15); + const __m256i shift1 = _mm256_set1_epi16(1 << 12); + const __m256i mask = _mm256_set1_epi16(1023); + const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, + -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0); + + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0, v8); + f2 = _mm256_add_epi16(f0, off); + f0 = _mm256_slli_epi16(f0, 3); + f0 = _mm256_mulhi_epi16(f0, v); + f2 = _mm256_sub_epi16(f1, f2); + f1 = _mm256_andnot_si256(f1, f2); + f1 = _mm256_srli_epi16(f1, 15); + f0 = _mm256_sub_epi16(f0, f1); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f0 = _mm256_and_si256(f0, mask); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f0 = _mm256_srli_epi64(f0, 12); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blend_epi16(t0, t1, 0xE0); + _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); + _mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); + r[20 * i + 16] = (uint8_t)low; + r[20 * i + 17] = (uint8_t)(low >> 0x08); + r[20 * i + 18] = (uint8_t)(low >> 0x10); + r[20 * i + 19] = (uint8_t)(low >> 0x18); + } +} + +static void poly_decompress10(poly *restrict r, const uint8_t a[320 + 12]) { + size_t i; + __m256i f; + const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4 * KYBER_Q); + const __m256i shufbidx = _mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, + 6, 5, 5, 4, 4, 3, 3, 2, + 9, 8, 8, 7, 7, 6, 6, 5, + 4, 3, 3, 2, 2, 1, 1, 0); + const __m256i sllvdidx = 
_mm256_set1_epi64x(4); + const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184); + + for (i = 0; i < KYBER_N / 16; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_sllv_epi32(f, sllvdidx); + f = _mm256_srli_epi16(f, 1); + f = _mm256_and_si256(f, mask); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_polyvec_compress * @@ -14,27 +82,11 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], - polyvec *restrict a) { - size_t i, j, k; +void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { + size_t i; - PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(a); - - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; - } - - r[0] = (t[0] >> 0); - r[1] = (t[0] >> 8) | (t[1] << 2); - r[2] = (t[1] >> 6) | (t[2] << 4); - r[3] = (t[2] >> 4) | (t[3] << 6); - r[4] = (t[3] >> 2); - r += 5; - } + poly_compress10(&r[320 * i], &a->vec[i]); } } @@ -44,27 +96,15 @@ void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSED * Description: De-serialize and decompress vector of polynomials; * approximate inverse of PQCLEAN_KYBER76890S_AVX2_polyvec_compress * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void 
PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *restrict r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { - size_t i, j, k; +void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { + size_t i; - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); - t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); - t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); - t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); - a += 5; - - for (k = 0; k < 4; k++) { - r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; - } - } + poly_decompress10(&r->vec[i], &a[320 * i]); } } @@ -90,7 +130,7 @@ void PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], pol * Description: De-serialize vector of polynomials; * inverse of PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials * (of length KYBER_POLYVECBYTES) **************************************************/ @@ -131,29 +171,34 @@ void PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements in a and b in NTT domain, accumulate into r, * and multiply by 2^-16. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { - PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + size_t i; + poly tmp; + + PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER76890S_AVX2_poly_add(r, r, &tmp); + } } /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(polyvec *r) { size_t i; @@ -162,23 +207,6 @@ void PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void 
PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(polyvec *r) { - size_t i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER76890S_AVX2_polyvec_add * diff --git a/crypto_kem/kyber768-90s/avx2/polyvec.h b/crypto_kem/kyber768-90s/avx2/polyvec.h index 0984febd..3ef4364f 100644 --- a/crypto_kem/kyber768-90s/avx2/polyvec.h +++ b/crypto_kem/kyber768-90s/avx2/polyvec.h @@ -8,9 +8,8 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); +void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); void PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); void PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); @@ -18,12 +17,9 @@ void PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBE void PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER76890S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER76890S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber768-90s/avx2/reduce.h b/crypto_kem/kyber768-90s/avx2/reduce.h index 334b851f..758e4f82 100644 --- a/crypto_kem/kyber768-90s/avx2/reduce.h +++ 
b/crypto_kem/kyber768-90s/avx2/reduce.h @@ -1,10 +1,9 @@ #ifndef PQCLEAN_KYBER76890S_AVX2_REDUCE_H #define PQCLEAN_KYBER76890S_AVX2_REDUCE_H -#include "consts.h" -#include +#include "params.h" +#include -int16_t PQCLEAN_KYBER76890S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); -int16_t PQCLEAN_KYBER76890S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); -int16_t PQCLEAN_KYBER76890S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); +void PQCLEAN_KYBER76890S_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER76890S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768-90s/avx2/rejsample.c b/crypto_kem/kyber768-90s/avx2/rejsample.c index bb78eaae..3fc98ef6 100644 --- a/crypto_kem/kyber768-90s/avx2/rejsample.c +++ b/crypto_kem/kyber768-90s/avx2/rejsample.c @@ -4,311 +4,68 @@ #include "rejsample.h" #include #include +#include + +//#define BMI -static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { - {-1, -1, -1, -1, -1, -1, -1, -1}, - { 0, -1, -1, -1, -1, -1, -1, -1}, - { 2, -1, -1, -1, -1, -1, -1, -1}, - { 0, 2, -1, -1, -1, -1, -1, -1}, - { 4, -1, -1, -1, -1, -1, -1, -1}, - { 0, 4, -1, -1, -1, -1, -1, -1}, - { 2, 4, -1, -1, -1, -1, -1, -1}, - { 0, 2, 4, -1, -1, -1, -1, -1}, - { 6, -1, -1, -1, -1, -1, -1, -1}, - { 0, 6, -1, -1, -1, -1, -1, -1}, - { 2, 6, -1, -1, -1, -1, -1, -1}, - { 0, 2, 6, -1, -1, -1, -1, -1}, - { 4, 6, -1, -1, -1, -1, -1, -1}, - { 0, 4, 6, -1, -1, -1, -1, -1}, - { 2, 4, 6, -1, -1, -1, -1, -1}, - { 0, 2, 4, 6, -1, -1, -1, -1}, - { 8, -1, -1, -1, -1, -1, -1, -1}, - { 0, 8, -1, -1, -1, -1, -1, -1}, - { 2, 8, -1, -1, -1, -1, -1, -1}, - { 0, 2, 8, -1, -1, -1, -1, -1}, - { 4, 8, -1, -1, -1, -1, -1, -1}, - { 0, 4, 8, -1, -1, -1, -1, -1}, - { 2, 4, 8, -1, -1, -1, -1, -1}, - { 0, 2, 4, 8, -1, -1, -1, -1}, - { 6, 8, -1, -1, -1, -1, -1, -1}, - { 0, 6, 8, -1, -1, -1, 
-1, -1}, - { 2, 6, 8, -1, -1, -1, -1, -1}, - { 0, 2, 6, 8, -1, -1, -1, -1}, - { 4, 6, 8, -1, -1, -1, -1, -1}, - { 0, 4, 6, 8, -1, -1, -1, -1}, - { 2, 4, 6, 8, -1, -1, -1, -1}, - { 0, 2, 4, 6, 8, -1, -1, -1}, - {10, -1, -1, -1, -1, -1, -1, -1}, - { 0, 10, -1, -1, -1, -1, -1, -1}, - { 2, 10, -1, -1, -1, -1, -1, -1}, - { 0, 2, 10, -1, -1, -1, -1, -1}, - { 4, 10, -1, -1, -1, -1, -1, -1}, - { 0, 4, 10, -1, -1, -1, -1, -1}, - { 2, 4, 10, -1, -1, -1, -1, -1}, - { 0, 2, 4, 10, -1, -1, -1, -1}, - { 6, 10, -1, -1, -1, -1, -1, -1}, - { 0, 6, 10, -1, -1, -1, -1, -1}, - { 2, 6, 10, -1, -1, -1, -1, -1}, - { 0, 2, 6, 10, -1, -1, -1, -1}, - { 4, 6, 10, -1, -1, -1, -1, -1}, - { 0, 4, 6, 10, -1, -1, -1, -1}, - { 2, 4, 6, 10, -1, -1, -1, -1}, - { 0, 2, 4, 6, 10, -1, -1, -1}, - { 8, 10, -1, -1, -1, -1, -1, -1}, - { 0, 8, 10, -1, -1, -1, -1, -1}, - { 2, 8, 10, -1, -1, -1, -1, -1}, - { 0, 2, 8, 10, -1, -1, -1, -1}, - { 4, 8, 10, -1, -1, -1, -1, -1}, - { 0, 4, 8, 10, -1, -1, -1, -1}, - { 2, 4, 8, 10, -1, -1, -1, -1}, - { 0, 2, 4, 8, 10, -1, -1, -1}, - { 6, 8, 10, -1, -1, -1, -1, -1}, - { 0, 6, 8, 10, -1, -1, -1, -1}, - { 2, 6, 8, 10, -1, -1, -1, -1}, - { 0, 2, 6, 8, 10, -1, -1, -1}, - { 4, 6, 8, 10, -1, -1, -1, -1}, - { 0, 4, 6, 8, 10, -1, -1, -1}, - { 2, 4, 6, 8, 10, -1, -1, -1}, - { 0, 2, 4, 6, 8, 10, -1, -1}, - {12, -1, -1, -1, -1, -1, -1, -1}, - { 0, 12, -1, -1, -1, -1, -1, -1}, - { 2, 12, -1, -1, -1, -1, -1, -1}, - { 0, 2, 12, -1, -1, -1, -1, -1}, - { 4, 12, -1, -1, -1, -1, -1, -1}, - { 0, 4, 12, -1, -1, -1, -1, -1}, - { 2, 4, 12, -1, -1, -1, -1, -1}, - { 0, 2, 4, 12, -1, -1, -1, -1}, - { 6, 12, -1, -1, -1, -1, -1, -1}, - { 0, 6, 12, -1, -1, -1, -1, -1}, - { 2, 6, 12, -1, -1, -1, -1, -1}, - { 0, 2, 6, 12, -1, -1, -1, -1}, - { 4, 6, 12, -1, -1, -1, -1, -1}, - { 0, 4, 6, 12, -1, -1, -1, -1}, - { 2, 4, 6, 12, -1, -1, -1, -1}, - { 0, 2, 4, 6, 12, -1, -1, -1}, - { 8, 12, -1, -1, -1, -1, -1, -1}, - { 0, 8, 12, -1, -1, -1, -1, -1}, - { 2, 8, 12, -1, -1, -1, -1, -1}, - { 0, 2, 8, 12, -1, 
-1, -1, -1}, - { 4, 8, 12, -1, -1, -1, -1, -1}, - { 0, 4, 8, 12, -1, -1, -1, -1}, - { 2, 4, 8, 12, -1, -1, -1, -1}, - { 0, 2, 4, 8, 12, -1, -1, -1}, - { 6, 8, 12, -1, -1, -1, -1, -1}, - { 0, 6, 8, 12, -1, -1, -1, -1}, - { 2, 6, 8, 12, -1, -1, -1, -1}, - { 0, 2, 6, 8, 12, -1, -1, -1}, - { 4, 6, 8, 12, -1, -1, -1, -1}, - { 0, 4, 6, 8, 12, -1, -1, -1}, - { 2, 4, 6, 8, 12, -1, -1, -1}, - { 0, 2, 4, 6, 8, 12, -1, -1}, - {10, 12, -1, -1, -1, -1, -1, -1}, - { 0, 10, 12, -1, -1, -1, -1, -1}, - { 2, 10, 12, -1, -1, -1, -1, -1}, - { 0, 2, 10, 12, -1, -1, -1, -1}, - { 4, 10, 12, -1, -1, -1, -1, -1}, - { 0, 4, 10, 12, -1, -1, -1, -1}, - { 2, 4, 10, 12, -1, -1, -1, -1}, - { 0, 2, 4, 10, 12, -1, -1, -1}, - { 6, 10, 12, -1, -1, -1, -1, -1}, - { 0, 6, 10, 12, -1, -1, -1, -1}, - { 2, 6, 10, 12, -1, -1, -1, -1}, - { 0, 2, 6, 10, 12, -1, -1, -1}, - { 4, 6, 10, 12, -1, -1, -1, -1}, - { 0, 4, 6, 10, 12, -1, -1, -1}, - { 2, 4, 6, 10, 12, -1, -1, -1}, - { 0, 2, 4, 6, 10, 12, -1, -1}, - { 8, 10, 12, -1, -1, -1, -1, -1}, - { 0, 8, 10, 12, -1, -1, -1, -1}, - { 2, 8, 10, 12, -1, -1, -1, -1}, - { 0, 2, 8, 10, 12, -1, -1, -1}, - { 4, 8, 10, 12, -1, -1, -1, -1}, - { 0, 4, 8, 10, 12, -1, -1, -1}, - { 2, 4, 8, 10, 12, -1, -1, -1}, - { 0, 2, 4, 8, 10, 12, -1, -1}, - { 6, 8, 10, 12, -1, -1, -1, -1}, - { 0, 6, 8, 10, 12, -1, -1, -1}, - { 2, 6, 8, 10, 12, -1, -1, -1}, - { 0, 2, 6, 8, 10, 12, -1, -1}, - { 4, 6, 8, 10, 12, -1, -1, -1}, - { 0, 4, 6, 8, 10, 12, -1, -1}, - { 2, 4, 6, 8, 10, 12, -1, -1}, - { 0, 2, 4, 6, 8, 10, 12, -1}, - {14, -1, -1, -1, -1, -1, -1, -1}, - { 0, 14, -1, -1, -1, -1, -1, -1}, - { 2, 14, -1, -1, -1, -1, -1, -1}, - { 0, 2, 14, -1, -1, -1, -1, -1}, - { 4, 14, -1, -1, -1, -1, -1, -1}, - { 0, 4, 14, -1, -1, -1, -1, -1}, - { 2, 4, 14, -1, -1, -1, -1, -1}, - { 0, 2, 4, 14, -1, -1, -1, -1}, - { 6, 14, -1, -1, -1, -1, -1, -1}, - { 0, 6, 14, -1, -1, -1, -1, -1}, - { 2, 6, 14, -1, -1, -1, -1, -1}, - { 0, 2, 6, 14, -1, -1, -1, -1}, - { 4, 6, 14, -1, -1, -1, -1, -1}, - { 0, 4, 6, 14, -1, 
-1, -1, -1}, - { 2, 4, 6, 14, -1, -1, -1, -1}, - { 0, 2, 4, 6, 14, -1, -1, -1}, - { 8, 14, -1, -1, -1, -1, -1, -1}, - { 0, 8, 14, -1, -1, -1, -1, -1}, - { 2, 8, 14, -1, -1, -1, -1, -1}, - { 0, 2, 8, 14, -1, -1, -1, -1}, - { 4, 8, 14, -1, -1, -1, -1, -1}, - { 0, 4, 8, 14, -1, -1, -1, -1}, - { 2, 4, 8, 14, -1, -1, -1, -1}, - { 0, 2, 4, 8, 14, -1, -1, -1}, - { 6, 8, 14, -1, -1, -1, -1, -1}, - { 0, 6, 8, 14, -1, -1, -1, -1}, - { 2, 6, 8, 14, -1, -1, -1, -1}, - { 0, 2, 6, 8, 14, -1, -1, -1}, - { 4, 6, 8, 14, -1, -1, -1, -1}, - { 0, 4, 6, 8, 14, -1, -1, -1}, - { 2, 4, 6, 8, 14, -1, -1, -1}, - { 0, 2, 4, 6, 8, 14, -1, -1}, - {10, 14, -1, -1, -1, -1, -1, -1}, - { 0, 10, 14, -1, -1, -1, -1, -1}, - { 2, 10, 14, -1, -1, -1, -1, -1}, - { 0, 2, 10, 14, -1, -1, -1, -1}, - { 4, 10, 14, -1, -1, -1, -1, -1}, - { 0, 4, 10, 14, -1, -1, -1, -1}, - { 2, 4, 10, 14, -1, -1, -1, -1}, - { 0, 2, 4, 10, 14, -1, -1, -1}, - { 6, 10, 14, -1, -1, -1, -1, -1}, - { 0, 6, 10, 14, -1, -1, -1, -1}, - { 2, 6, 10, 14, -1, -1, -1, -1}, - { 0, 2, 6, 10, 14, -1, -1, -1}, - { 4, 6, 10, 14, -1, -1, -1, -1}, - { 0, 4, 6, 10, 14, -1, -1, -1}, - { 2, 4, 6, 10, 14, -1, -1, -1}, - { 0, 2, 4, 6, 10, 14, -1, -1}, - { 8, 10, 14, -1, -1, -1, -1, -1}, - { 0, 8, 10, 14, -1, -1, -1, -1}, - { 2, 8, 10, 14, -1, -1, -1, -1}, - { 0, 2, 8, 10, 14, -1, -1, -1}, - { 4, 8, 10, 14, -1, -1, -1, -1}, - { 0, 4, 8, 10, 14, -1, -1, -1}, - { 2, 4, 8, 10, 14, -1, -1, -1}, - { 0, 2, 4, 8, 10, 14, -1, -1}, - { 6, 8, 10, 14, -1, -1, -1, -1}, - { 0, 6, 8, 10, 14, -1, -1, -1}, - { 2, 6, 8, 10, 14, -1, -1, -1}, - { 0, 2, 6, 8, 10, 14, -1, -1}, - { 4, 6, 8, 10, 14, -1, -1, -1}, - { 0, 4, 6, 8, 10, 14, -1, -1}, - { 2, 4, 6, 8, 10, 14, -1, -1}, - { 0, 2, 4, 6, 8, 10, 14, -1}, - {12, 14, -1, -1, -1, -1, -1, -1}, - { 0, 12, 14, -1, -1, -1, -1, -1}, - { 2, 12, 14, -1, -1, -1, -1, -1}, - { 0, 2, 12, 14, -1, -1, -1, -1}, - { 4, 12, 14, -1, -1, -1, -1, -1}, - { 0, 4, 12, 14, -1, -1, -1, -1}, - { 2, 4, 12, 14, -1, -1, -1, -1}, - { 0, 2, 4, 12, 14, 
-1, -1, -1}, - { 6, 12, 14, -1, -1, -1, -1, -1}, - { 0, 6, 12, 14, -1, -1, -1, -1}, - { 2, 6, 12, 14, -1, -1, -1, -1}, - { 0, 2, 6, 12, 14, -1, -1, -1}, - { 4, 6, 12, 14, -1, -1, -1, -1}, - { 0, 4, 6, 12, 14, -1, -1, -1}, - { 2, 4, 6, 12, 14, -1, -1, -1}, - { 0, 2, 4, 6, 12, 14, -1, -1}, - { 8, 12, 14, -1, -1, -1, -1, -1}, - { 0, 8, 12, 14, -1, -1, -1, -1}, - { 2, 8, 12, 14, -1, -1, -1, -1}, - { 0, 2, 8, 12, 14, -1, -1, -1}, - { 4, 8, 12, 14, -1, -1, -1, -1}, - { 0, 4, 8, 12, 14, -1, -1, -1}, - { 2, 4, 8, 12, 14, -1, -1, -1}, - { 0, 2, 4, 8, 12, 14, -1, -1}, - { 6, 8, 12, 14, -1, -1, -1, -1}, - { 0, 6, 8, 12, 14, -1, -1, -1}, - { 2, 6, 8, 12, 14, -1, -1, -1}, - { 0, 2, 6, 8, 12, 14, -1, -1}, - { 4, 6, 8, 12, 14, -1, -1, -1}, - { 0, 4, 6, 8, 12, 14, -1, -1}, - { 2, 4, 6, 8, 12, 14, -1, -1}, - { 0, 2, 4, 6, 8, 12, 14, -1}, - {10, 12, 14, -1, -1, -1, -1, -1}, - { 0, 10, 12, 14, -1, -1, -1, -1}, - { 2, 10, 12, 14, -1, -1, -1, -1}, - { 0, 2, 10, 12, 14, -1, -1, -1}, - { 4, 10, 12, 14, -1, -1, -1, -1}, - { 0, 4, 10, 12, 14, -1, -1, -1}, - { 2, 4, 10, 12, 14, -1, -1, -1}, - { 0, 2, 4, 10, 12, 14, -1, -1}, - { 6, 10, 12, 14, -1, -1, -1, -1}, - { 0, 6, 10, 12, 14, -1, -1, -1}, - { 2, 6, 10, 12, 14, -1, -1, -1}, - { 0, 2, 6, 10, 12, 14, -1, -1}, - { 4, 6, 10, 12, 14, -1, -1, -1}, - { 0, 4, 6, 10, 12, 14, -1, -1}, - { 2, 4, 6, 10, 12, 14, -1, -1}, - { 0, 2, 4, 6, 10, 12, 14, -1}, - { 8, 10, 12, 14, -1, -1, -1, -1}, - { 0, 8, 10, 12, 14, -1, -1, -1}, - { 2, 8, 10, 12, 14, -1, -1, -1}, - { 0, 2, 8, 10, 12, 14, -1, -1}, - { 4, 8, 10, 12, 14, -1, -1, -1}, - { 0, 4, 8, 10, 12, 14, -1, -1}, - { 2, 4, 8, 10, 12, 14, -1, -1}, - { 0, 2, 4, 8, 10, 12, 14, -1}, - { 6, 8, 10, 12, 14, -1, -1, -1}, - { 0, 6, 8, 10, 12, 14, -1, -1}, - { 2, 6, 8, 10, 12, 14, -1, -1}, - { 0, 2, 6, 8, 10, 12, 14, -1}, - { 4, 6, 8, 10, 12, 14, -1, -1}, - { 0, 4, 6, 8, 10, 12, 14, -1}, - { 2, 4, 6, 8, 10, 12, 14, -1}, - { 0, 2, 4, 6, 8, 10, 12, 14} - } -}; #define _mm256_cmpge_epu16(a, b) 
_mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -#define REJ_UNIFORM_BUFLEN 576 -unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *restrict r, - const uint8_t *restrict buf) { +unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; uint32_t good; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); + uint64_t idx0, idx1, idx2, idx3; + const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_qdata.vec[_16XQ / 16]); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER76890S_AVX2_qdata.as_arr[_16XQ]); - const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER76890S_AVX2_qdata.as_arr[_16XV]); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11, 10, 10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); __m256i f0, f1, g0, g1, g2, g3; __m128i f, t, pilo, pihi; - ctr = 0; - for (pos = 0; pos < 2 * KYBER_N; pos += 64) { - f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); - f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); + ctr = pos = 0; + while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { + f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; - g0 = _mm256_cmpge_epu16(bound, f0); - g1 = _mm256_cmpge_epu16(bound, f1); + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = 
_mm256_cmpgt_epi16(bound, f1); g0 = _mm256_packs_epi16(g0, g1); good = _mm256_movemask_epi8(g0); - g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); - g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); - g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); - g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); - //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); - //g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); - - /* Barrett reduction of (still unsigned) values */ - g2 = _mm256_mulhi_epu16(f0, v); - g3 = _mm256_mulhi_epu16(f1, v); - g2 = _mm256_srli_epi16(g2, 10); - g3 = _mm256_srli_epi16(g3, 10); - g2 = _mm256_mullo_epi16(g2, kyberq); - g3 = _mm256_mullo_epi16(g3, kyberq); - f0 = _mm256_sub_epi16(f0, g2); - f1 = _mm256_sub_epi16(f1, g3); + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); g2 = _mm256_add_epi8(g0, ones); g3 = _mm256_add_epi8(g1, ones); @@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *restrict r, ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { - f = _mm_load_si128((__m128i *)&buf[pos]); - t 
= _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { + f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); good = _mm_movemask_epi8(t); - good = _pext_u32(good, 0x5555); - pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); + + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - - /* Barrett reduction */ - t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); - t = _mm_srli_epi16(t, 10); - t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); - f = _mm_sub_epi16(f, t); - f = _mm_shuffle_epi8(f, pilo); _mm_storeu_si128((__m128i *)&r[ctr], f); ctr += _mm_popcnt_u32(good); - pos += 16; } - while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (val1 < KYBER_Q && ctr < KYBER_N) { + r[ctr++] = val1; } } diff --git a/crypto_kem/kyber768-90s/avx2/rejsample.h b/crypto_kem/kyber768-90s/avx2/rejsample.h index 49f47d57..1b8e7888 100644 --- a/crypto_kem/kyber768-90s/avx2/rejsample.h +++ b/crypto_kem/kyber768-90s/avx2/rejsample.h @@ -1,9 +1,12 @@ #ifndef PQCLEAN_KYBER76890S_AVX2_REJSAMPLE_H #define PQCLEAN_KYBER76890S_AVX2_REJSAMPLE_H #include 
"params.h" +#include "symmetric.h" #include -unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *r, - const unsigned char *buf); +#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) + +unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); #endif diff --git a/crypto_kem/kyber768-90s/avx2/shuffle.S b/crypto_kem/kyber768-90s/avx2/shuffle.S index 31e7c7c2..bc1d58b3 100644 --- a/crypto_kem/kyber768-90s/avx2/shuffle.S +++ b/crypto_kem/kyber768-90s/avx2/shuffle.S @@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 #csubq csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,1 +csubq 6,13 +csubq 7,13 +csubq 8,13 csubq 9,13 -csubq 10,14 -csubq 11,15 -csubq 12,1 +csubq 10,13 +csubq 11,13 +csubq 12,13 #bitpack vpsllw $12,%ymm6,%ymm4 diff --git a/crypto_kem/kyber768-90s/avx2/shuffle.inc b/crypto_kem/kyber768-90s/avx2/shuffle.inc index d4b092bc..73e9ffe0 100644 --- a/crypto_kem/kyber768-90s/avx2/shuffle.inc +++ b/crypto_kem/kyber768-90s/avx2/shuffle.inc @@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_kem/kyber768-90s/avx2/symmetric.h 
b/crypto_kem/kyber768-90s/avx2/symmetric.h index cd0f7687..3473cfc0 100644 --- a/crypto_kem/kyber768-90s/avx2/symmetric.h +++ b/crypto_kem/kyber768-90s/avx2/symmetric.h @@ -14,12 +14,10 @@ typedef aes256ctr_ctx xof_state; #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, SEED, X, Y) \ - PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) diff --git a/crypto_kem/kyber768-90s/avx2/verify.c b/crypto_kem/kyber768-90s/avx2/verify.c index 2117bea7..9ec47757 100644 --- a/crypto_kem/kyber768-90s/avx2/verify.c +++ b/crypto_kem/kyber768-90s/avx2/verify.c @@ -8,31 +8,31 @@ * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const unsigned char *a: pointer to first byte array -* const unsigned char *b: pointer to second byte array -* size_t len: length of the byte arrays +* Arguments: const uint8_t *a: pointer to first byte array +* const uint8_t *b: pointer to second byte array +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ int PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; + size_t i; uint64_t r; - __m256i avec, bvec, cvec; + __m256i f, g, h; - cvec = _mm256_setzero_si256(); - for (pos = 0; pos + 32 <= len; pos += 32) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - avec = _mm256_xor_si256(avec, bvec); - cvec = _mm256_or_si256(cvec, avec); + h = _mm256_setzero_si256(); + for (i = 0; i < len / 32; i++) { + f = _mm256_loadu_si256((__m256i *)&a[32 * i]); + g = _mm256_loadu_si256((__m256i *)&b[32 * i]); + f = _mm256_xor_si256(f, g); + h = _mm256_or_si256(h, f); } - r = 1 - _mm256_testz_si256(cvec, cvec); + r = 1 - _mm256_testz_si256(h, h); - if (pos < len) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - cvec = _mm256_cmpeq_epi8(avec, bvec); - r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); + a += 32 * i; + b += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; } r = (-r) >> 63; @@ -47,29 +47,27 @@ int PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t l * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: unsigned char *r: pointer to output byte array +* Arguments: unsigned char *r: pointer to output byte array * const unsigned char *x: pointer to input byte array -* size_t len: Amount of bytes to be copied -* unsigned char b: Condition bit; has to be in {0,1} +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER76890S_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; __m256i xvec, rvec, bvec; - b = -b; - bvec = _mm256_set1_epi8(b); - - for (pos = 0; pos + 32 <= len; pos += 32) { - rvec = _mm256_loadu_si256((__m256i *)&r[pos]); - xvec = _mm256_loadu_si256((__m256i *)&x[pos]); - xvec = _mm256_xor_si256(xvec, rvec); - xvec = _mm256_and_si256(xvec, bvec); - rvec = _mm256_xor_si256(rvec, xvec); - _mm256_storeu_si256((__m256i *)&r[pos], rvec); + bvec = _mm256_set1_epi64x(-(uint64_t)b); + for (i = 0; i < len / 32; i++) { + rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); + xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); + rvec = _mm256_blendv_epi8(rvec, xvec, bvec); + _mm256_storeu_si256((__m256i *)&r[32 * i], rvec); } - while (pos < len) { - r[pos] ^= b & (x[pos] ^ r[pos]); - pos += 1; + r += 32 * i; + x += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r[i] ^= -b & (x[i] ^ r[i]); } } diff --git a/crypto_kem/kyber768-90s/clean/Makefile b/crypto_kem/kyber768-90s/clean/Makefile index 8b223086..750ed1a3 100644 --- a/crypto_kem/kyber768-90s/clean/Makefile +++ b/crypto_kem/kyber768-90s/clean/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber768-90s_clean.a -HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric-aes.h symmetric.h verify.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o 
symmetric-aes.o verify.o +HEADERS=aes256ctr.h api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric.h verify.h +OBJECTS=aes256ctr.o cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake index 381f983f..a144cb86 100644 --- a/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber768-90s_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj +OBJECTS=aes256ctr.obj cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber768-90s/clean/aes256ctr.c b/crypto_kem/kyber768-90s/clean/aes256ctr.c new file mode 100644 index 00000000..29159715 --- /dev/null +++ b/crypto_kem/kyber768-90s/clean/aes256ctr.c @@ -0,0 +1,564 @@ +#include "aes256ctr.h" +#include +#include +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +static inline uint32_t br_dec32le(const uint8_t *src) { + return (uint32_t)src[0] + | ((uint32_t)src[1] << 8) + | ((uint32_t)src[2] << 16) + | ((uint32_t)src[3] << 24); +} + +static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) { + while (num-- > 0) { + *v ++ = br_dec32le(src); + src += 4; + } +} + +static inline uint32_t br_swap32(uint32_t x) { + x = ((x & (uint32_t)0x00FF00FF) << 8) + | ((x >> 8) & (uint32_t)0x00FF00FF); + return (x << 16) | (x >> 16); +} + +static inline void br_enc32le(uint8_t *dst, uint32_t x) { + dst[0] = (uint8_t)x; + dst[1] = (uint8_t)(x >> 8); + dst[2] = (uint8_t)(x >> 16); + dst[3] = (uint8_t)(x >> 24); +} + +static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) { + while (num-- > 0) { + br_enc32le(dst, *v ++); + dst += 4; + } +} + +static void br_aes_ct64_bitslice_Sbox(uint64_t *q) { + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. 
+ */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +static void br_aes_ct64_ortho(uint64_t *q) { +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \ + (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) { + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= 
(uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) { + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const uint8_t Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t sub_word(uint32_t x) { + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) { + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + int key_len = 32; + + nk = (int)(key_len >> 2); + nkf = (int)((14 + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & 
(uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } +} + +static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) { + unsigned u, v, n; + + n = (14 + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} + +static inline void add_round_key(uint64_t *q, const uint64_t *sk) { + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void shift_rows(uint64_t *q) { + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t rotr32(uint64_t x) { + return (x << 32) | (x >> 32); +} + +static inline void mix_columns(uint64_t *q) { + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 
16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +static void inc4_be(uint32_t *x) { + *x = br_swap32(*x) + 4; + *x = br_swap32(*x); +} + +static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], uint64_t sk_exp[64]) { + uint32_t w[16]; + uint64_t q[8]; + int i; + + memcpy(w, ivw, sizeof(w)); + for (i = 0; i < 4; i++) { + br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + + add_round_key(q, sk_exp); + for (i = 1; i < 14; i++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, sk_exp + (i << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, sk_exp + 112); + + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]); + } + br_range_enc32le(out, w, 16); + + /* Increase counter for next 4 blocks */ + inc4_be(ivw + 3); + inc4_be(ivw + 7); + inc4_be(ivw + 11); + inc4_be(ivw + 15); +} + +static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) { + uint64_t skey[30]; + + br_aes_ct64_keysched(skey, key); + br_aes_ct64_skey_expand(sk_exp, skey); +} + +static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) { + uint32_t ivw[16]; + size_t i; + + br_range_dec32le(ivw, 3, iv); + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + ivw[ 3] = br_swap32(cc); + ivw[ 7] = br_swap32(cc + 1); + ivw[11] = br_swap32(cc + 2); 
+ ivw[15] = br_swap32(cc + 3); + + while (len > 64) { + aes_ctr4x(data, ivw, sk_exp); + data += 64; + len -= 64; + } + if (len > 0) { + uint8_t tmp[64]; + aes_ctr4x(tmp, ivw, sk_exp); + for (i = 0; i < len; i++) { + data[i] = tmp[i]; + } + } +} + +void PQCLEAN_KYBER76890S_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) { + uint64_t sk_exp[120]; + + br_aes_ct64_ctr_init(sk_exp, key); + br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen); +} + +void PQCLEAN_KYBER76890S_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) { + br_aes_ct64_ctr_init(s->sk_exp, key); + + br_range_dec32le(s->ivw, 3, nonce); + memcpy(s->ivw + 4, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 8, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t)); + s->ivw[ 3] = br_swap32(0); + s->ivw[ 7] = br_swap32(1); + s->ivw[11] = br_swap32(2); + s->ivw[15] = br_swap32(3); +} + +void PQCLEAN_KYBER76890S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) { + while (nblocks > 0) { + aes_ctr4x(out, s->ivw, s->sk_exp); + out += 64; + nblocks--; + } +} diff --git a/crypto_kem/kyber768-90s/clean/aes256ctr.h b/crypto_kem/kyber768-90s/clean/aes256ctr.h new file mode 100644 index 00000000..3c94676b --- /dev/null +++ b/crypto_kem/kyber768-90s/clean/aes256ctr.h @@ -0,0 +1,28 @@ +#ifndef PQCLEAN_KYBER76890S_CLEAN_AES256CTR_H +#define PQCLEAN_KYBER76890S_CLEAN_AES256CTR_H + +#include +#include + +#define AES256CTR_BLOCKBYTES 64 + + +typedef struct { + uint64_t sk_exp[120]; + uint32_t ivw[16]; +} aes256ctr_ctx; + +void PQCLEAN_KYBER76890S_CLEAN_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_KYBER76890S_CLEAN_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_KYBER76890S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state); + +#endif diff 
--git a/crypto_kem/kyber768-90s/clean/cbd.c b/crypto_kem/kyber768-90s/clean/cbd.c index 7d527ffb..3c1290e9 100644 --- a/crypto_kem/kyber768-90s/clean/cbd.c +++ b/crypto_kem/kyber768-90s/clean/cbd.c @@ -5,7 +5,7 @@ /************************************************* * Name: load32_littleendian * -* Description: load bytes into a 32-bit integer +* Description: load 4 bytes into a 32-bit integer * in little-endian order * * Arguments: - const uint8_t *x: pointer to input byte array @@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) { } /************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_cbd +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ + + +/************************************************* +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { +static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; int16_t a, b; @@ -48,3 +61,23 @@ void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_ } } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial 
with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. +* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ + +void PQCLEAN_KYBER76890S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER76890S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber768-90s/clean/cbd.h b/crypto_kem/kyber768-90s/clean/cbd.h index 41007b06..f2de6763 100644 --- a/crypto_kem/kyber768-90s/clean/cbd.h +++ b/crypto_kem/kyber768-90s/clean/cbd.h @@ -4,6 +4,8 @@ #include "poly.h" #include -void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER76890S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); + +void PQCLEAN_KYBER76890S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber768-90s/clean/indcpa.c b/crypto_kem/kyber768-90s/clean/indcpa.c index 57abc00f..b68871d9 100644 --- a/crypto_kem/kyber768-90s/clean/indcpa.c +++ b/crypto_kem/kyber768-90s/clean/indcpa.c @@ -15,8 +15,8 @@ * serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key -* polynomial vector -* - uint8_t *seed: pointer to output seed to generate -* matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ static void unpack_pk(polyvec *pk, @@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, * * Description: Serialize the secret key * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of -* polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key 
**************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, * and the compressed and serialized polynomial v * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER76890S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER76890S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run 
rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= (val >> 12) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T -* is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) // Not static for benchmarking void PQCLEAN_KYBER76890S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -182,12 +173,17 @@ void PQCLEAN_KYBER76890S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S } xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); while (ctr < KYBER_N) { - xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + xof_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); } xof_ctx_release(&state); } @@ -220,10 +216,10 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB gen_a(a, publicseed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); + 
PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); + PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&skpv); @@ -231,7 +227,7 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER76890S_CLEAN_poly_tomont(&pkpv.vec[i]); } @@ -248,16 +244,15 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], @@ -266,7 +261,7 @@ void 
PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], unsigned int i; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; unpack_pk(&pkpv, seed, pk); @@ -274,32 +269,32 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], gen_at(at, seed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); + PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); + PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); } - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(&epp, coins, nonce++); + PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(&b); PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(&v); - PQCLEAN_KYBER76890S_CLEAN_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER76890S_CLEAN_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER76890S_CLEAN_poly_add(&v, &v, &epp); PQCLEAN_KYBER76890S_CLEAN_poly_add(&v, &v, &k); - PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(&bp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(&b); PQCLEAN_KYBER76890S_CLEAN_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -308,24 +303,24 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t 
c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&b); + PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER76890S_CLEAN_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber768-90s/clean/kem.c b/crypto_kem/kyber768-90s/clean/kem.c index 528d5080..c662fbbd 100644 --- a/crypto_kem/kyber768-90s/clean/kem.c +++ b/crypto_kem/kyber768-90s/clean/kem.c @@ -14,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) 
**************************************************/ -int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,17 +40,17 @@ int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned cha * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; @@ -79,19 +80,19 @@ int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - 
const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; uint8_t buf[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber768-90s/clean/ntt.c b/crypto_kem/kyber768-90s/clean/ntt.c index a51fd5b5..e9e4890a 100644 --- a/crypto_kem/kyber768-90s/clean/ntt.c +++ b/crypto_kem/kyber768-90s/clean/ntt.c @@ -3,11 +3,11 @@ #include "reduce.h" #include -/* Code to generate PQCLEAN_KYBER76890S_CLEAN_zetas and PQCLEAN_KYBER76890S_CLEAN_zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER76890S_CLEAN_zetas and zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 -static const uint16_t tree[128] = { +static const uint8_t tree[128] = { 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, 4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, 2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, @@ -19,51 +19,41 @@ static const uint16_t tree[128] = { }; void init_ntt() { - unsigned int i, j, k; + unsigned int i; int16_t tmp[128]; tmp[0] = MONT; - for(i = 1; i < 128; ++i) - tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); + for(i=1;i<128;i++) + tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); - for(i = 0; i < 128; ++i) + for(i=0;i<128;i++) { PQCLEAN_KYBER76890S_CLEAN_zetas[i] = tmp[tree[i]]; - - k = 0; - for(i = 64; i >= 1; i >>= 1) - for(j = i; j < 2*i; ++j) - PQCLEAN_KYBER76890S_CLEAN_zetas_inv[k++] = -tmp[128 - 
tree[j]]; - - PQCLEAN_KYBER76890S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + if(PQCLEAN_KYBER76890S_CLEAN_zetas[i] > KYBER_Q/2) + PQCLEAN_KYBER76890S_CLEAN_zetas[i] -= KYBER_Q; + if(PQCLEAN_KYBER76890S_CLEAN_zetas[i] < -KYBER_Q/2) + PQCLEAN_KYBER76890S_CLEAN_zetas[i] += KYBER_Q; + } } - */ const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, - 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, - 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, - 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, - 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, - 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, - 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, - 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, - 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 -}; - -const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, - 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, - 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, - 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, - 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, - 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, - 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, - 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, - 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 -}; + -1044, -758, -359, -1517, 1493, 
1422, 287, 202, + -171, 622, 1577, 182, 962, -1202, -1474, 1468, + 573, -1325, 264, 383, -829, 1458, -1602, -130, + -681, 1017, 732, 608, -1542, 411, -205, -1571, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -1103, 430, 555, 843, -1251, 871, 1550, 105, + 422, 587, 177, -235, -291, -460, 1574, 1653, + -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, + 817, 1097, 603, 610, 1322, -1285, -1465, 384, + -1215, -136, 1218, -1335, -874, 220, -1187, -1659, + -1185, -1530, -1278, 794, -1510, -854, -870, 478, + -108, -308, 996, 991, 958, -1460, 1522, 1628 + }; /************************************************* * Name: fqmul @@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { /************************************************* * Name: PQCLEAN_KYBER76890S_CLEAN_ntt * -* Description: Inplace number-theoretic transform (NTT) in Rq +* Description: Inplace number-theoretic transform (NTT) in Rq. * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t r[256]) { unsigned int len, start, j, k; @@ -96,7 +85,7 @@ void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t r[256]) { for (len = 128; len >= 2; len >>= 1) { for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER76890S_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { + for (j = start; j < start + len; j++) { t = fqmul(zeta, r[j + len]); r[j + len] = r[j] - t; r[j] = r[j] + t; @@ -112,28 +101,28 @@ void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t r[256]) { * multiplication by Montgomery factor 2^16. 
* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t r[256]) { unsigned int start, len, j, k; int16_t t, zeta; + const int16_t f = 1441; // mont^2/128 - k = 0; + k = 127; for (len = 2; len <= 128; len <<= 1) { for (start = 0; start < 256; start = j + len) { - zeta = PQCLEAN_KYBER76890S_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { + zeta = PQCLEAN_KYBER76890S_CLEAN_zetas[k--]; + for (j = start; j < start + len; j++) { t = r[j]; r[j] = PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(t + r[j + len]); - r[j + len] = t - r[j + len]; + r[j + len] = r[j + len] - t; r[j + len] = fqmul(zeta, r[j + len]); } } } - for (j = 0; j < 256; ++j) { - r[j] = fqmul(r[j], PQCLEAN_KYBER76890S_CLEAN_zetas_inv[127]); + for (j = 0; j < 256; j++) { + r[j] = fqmul(r[j], f); } } @@ -143,19 +132,15 @@ void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t r[256]) { * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta) { +void PQCLEAN_KYBER76890S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); r[1] += 
fqmul(a[1], b[0]); } diff --git a/crypto_kem/kyber768-90s/clean/ntt.h b/crypto_kem/kyber768-90s/clean/ntt.h index 8a744408..68ae2734 100644 --- a/crypto_kem/kyber768-90s/clean/ntt.h +++ b/crypto_kem/kyber768-90s/clean/ntt.h @@ -5,15 +5,10 @@ extern const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas_inv[128]; - void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t r[256]); void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t r[256]); -void PQCLEAN_KYBER76890S_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta); +void PQCLEAN_KYBER76890S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber768-90s/clean/params.h b/crypto_kem/kyber768-90s/clean/params.h index 678358d8..a30ea122 100644 --- a/crypto_kem/kyber768-90s/clean/params.h +++ b/crypto_kem/kyber768-90s/clean/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,20 +14,20 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) -#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_ETA2 2 + +#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ - + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) /* 32 bytes of additional space to save H(pk) */ -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ - + KYBER_INDCPA_PUBLICKEYBYTES \ - + 2*KYBER_SYMBYTES) -#define 
KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) +#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) #endif diff --git a/crypto_kem/kyber768-90s/clean/poly.c b/crypto_kem/kyber768-90s/clean/poly.c index 324de5d5..746fff57 100644 --- a/crypto_kem/kyber768-90s/clean/poly.c +++ b/crypto_kem/kyber768-90s/clean/poly.c @@ -13,17 +13,19 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { +void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { size_t i, j; + int16_t u; uint8_t t[8]; - PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; + // map to positive standard representatives + u = a->coeffs[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint16_t)u << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } r[0] = t[0] | (t[1] << 4); @@ -40,7 +42,7 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES * Description: De-serialization and subsequent decompression of a polynomial; * approximate inverse of PQCLEAN_KYBER76890S_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ @@ -61,20 +63,21 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_PO * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - 
poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { +void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { size_t i; uint16_t t0, t1; - PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 2; i++) { - t0 = a->coeffs[2 * i]; + // map to positive standard representatives + t0 = a->coeffs[2 * i]; + t0 += ((int16_t)t0 >> 15) & KYBER_Q; t1 = a->coeffs[2 * i + 1]; - r[3 * i + 0] = (uint8_t)(t0 >> 0); - r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + t1 += ((int16_t)t1 >> 15) & KYBER_Q; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } @@ -84,7 +87,7 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER76890S_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ @@ -101,7 +104,7 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POL * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { @@ -122,41 +125,60 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_IND * Description: Convert polynomial to 32-byte message * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input 
polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { +void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { size_t i, j; uint16_t t; - PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { msg[i] = 0; for (j = 0; j < 8; j++) { - t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + t = a->coeffs[8 * i + j]; + t += ((int16_t)t >> 15) & KYBER_Q; + t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; msg[i] |= t << j; } } } /************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_poly_getnoise +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; +void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; prf(buf, sizeof(buf), seed, nonce); - PQCLEAN_KYBER76890S_CLEAN_cbd(r, buf); + PQCLEAN_KYBER76890S_CLEAN_poly_cbd_eta1(r, buf); } +/************************************************* +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to 
centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; + prf(buf, sizeof(buf), seed, nonce); + PQCLEAN_KYBER76890S_CLEAN_poly_cbd_eta2(r, buf); +} + + /************************************************* * Name: PQCLEAN_KYBER76890S_CLEAN_poly_ntt * @@ -189,7 +211,7 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(poly *r) { * * Description: Multiplication of two polynomials in NTT domain * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -197,8 +219,7 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, c size_t i; for (i = 0; i < KYBER_N / 4; i++) { PQCLEAN_KYBER76890S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER76890S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], - -PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER76890S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); } } @@ -233,28 +254,12 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_reduce(poly *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_csubq(poly *r) { - size_t i; - for (i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER76890S_CLEAN_csubq(r->coeffs[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER76890S_CLEAN_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials; no modular reduction is performed * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -268,7 +273,7 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { /************************************************* * Name: PQCLEAN_KYBER76890S_CLEAN_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials; no modular reduction is performed * * Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial diff --git a/crypto_kem/kyber768-90s/clean/poly.h b/crypto_kem/kyber768-90s/clean/poly.h index 39e0e042..a7e4a36b 100644 --- a/crypto_kem/kyber768-90s/clean/poly.h +++ b/crypto_kem/kyber768-90s/clean/poly.h @@ -11,16 +11,18 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER76890S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); -void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); +void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); 
void PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); -void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); +void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); -void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER76890S_CLEAN_poly_ntt(poly *r); void PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(poly *r); @@ -28,7 +30,6 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, c void PQCLEAN_KYBER76890S_CLEAN_poly_tomont(poly *r); void PQCLEAN_KYBER76890S_CLEAN_poly_reduce(poly *r); -void PQCLEAN_KYBER76890S_CLEAN_poly_csubq(poly *r); void PQCLEAN_KYBER76890S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER76890S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber768-90s/clean/polyvec.c b/crypto_kem/kyber768-90s/clean/polyvec.c index 5f647b81..08fc8b57 100644 --- a/crypto_kem/kyber768-90s/clean/polyvec.c +++ b/crypto_kem/kyber768-90s/clean/polyvec.c @@ -10,19 +10,18 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { unsigned int i, j, k; - 
PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(a); - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; + t[k] = a->vec[i].coeffs[4 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; } r[0] = (uint8_t)(t[0] >> 0); @@ -45,8 +44,7 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSE * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; uint16_t t[4]; @@ -72,9 +70,9 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { unsigned int i; for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); @@ -128,18 +126,16 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements of a and b 
in NTT domain, accumulate into r, * and multiply by 2^-16. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { unsigned int i; poly t; @@ -156,10 +152,10 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, * Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(polyvec *r) { unsigned int i; @@ -168,29 +164,12 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(polyvec *r) { - unsigned int i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_add * * 
Description: Add vectors of polynomials * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ diff --git a/crypto_kem/kyber768-90s/clean/polyvec.h b/crypto_kem/kyber768-90s/clean/polyvec.h index e59174d3..95b08c3c 100644 --- a/crypto_kem/kyber768-90s/clean/polyvec.h +++ b/crypto_kem/kyber768-90s/clean/polyvec.h @@ -8,22 +8,18 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); void PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); void PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER76890S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber768-90s/clean/reduce.c 
b/crypto_kem/kyber768-90s/clean/reduce.c index 47854ad7..18ef32b8 100644 --- a/crypto_kem/kyber768-90s/clean/reduce.c +++ b/crypto_kem/kyber768-90s/clean/reduce.c @@ -6,8 +6,7 @@ * Name: PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes -* 16-bit integer congruent to a * R^-1 mod q, -* where R=2^16 +* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 * * Arguments: - int32_t a: input integer to be reduced; * has to be in {-q2^15,...,q2^15-1} @@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce(int32_t a) { * Name: PQCLEAN_KYBER76890S_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes -* 16-bit integer congruent to a mod q in {0,...,q} +* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} * * Arguments: - int16_t a: input integer to be reduced * -* Returns: integer in {0,...,q} congruent to a modulo q. +* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(int16_t a) { int16_t t; const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = (int32_t)v * a >> 26; + t = ((int32_t)v * a + (1 << 25)) >> 26; t *= KYBER_Q; return a - t; } - -/************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_csubq -* -* Description: Conditionallly subtract q -* -* Arguments: - int16_t x: input integer -* -* Returns: a - q if a >= q, else a -**************************************************/ -int16_t PQCLEAN_KYBER76890S_CLEAN_csubq(int16_t a) { - a -= KYBER_Q; - a += (a >> 15) & KYBER_Q; - return a; -} diff --git a/crypto_kem/kyber768-90s/clean/reduce.h b/crypto_kem/kyber768-90s/clean/reduce.h index cd282cc7..97007300 100644 --- a/crypto_kem/kyber768-90s/clean/reduce.h +++ b/crypto_kem/kyber768-90s/clean/reduce.h @@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce(int32_t a); int16_t PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(int16_t a); -int16_t PQCLEAN_KYBER76890S_CLEAN_csubq(int16_t a); - #endif diff --git a/crypto_kem/kyber768-90s/clean/symmetric-aes.c b/crypto_kem/kyber768-90s/clean/symmetric-aes.c index 281d33d0..9ae586d5 100644 --- a/crypto_kem/kyber768-90s/clean/symmetric-aes.c +++ b/crypto_kem/kyber768-90s/clean/symmetric-aes.c @@ -1,100 +1,18 @@ -#include "aes.h" +#include "aes256ctr.h" #include "params.h" #include "symmetric.h" #include #include -#include -static inline void br_enc32be(unsigned char *dst, uint32_t x) { - dst[3] = (unsigned char)x; - dst[2] = (unsigned char)(x >> 8); - dst[1] = (unsigned char)(x >> 16); - dst[0] = (unsigned char)(x >> 24); +void PQCLEAN_KYBER76890S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y) { + uint8_t expnonce[12] = {0}; + expnonce[0] = x; + expnonce[1] = y; + PQCLEAN_KYBER76890S_CLEAN_aes256ctr_init(state, seed, expnonce); } -static void aes256_ctr_xof(unsigned char *out, size_t outlen, const 
unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { - uint8_t ivw[16]; - uint8_t buf[AES_BLOCKBYTES]; - size_t i = 0; - - memcpy(ivw, iv, AESCTR_NONCEBYTES); - br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); - - while (outlen > AES_BLOCKBYTES) { - aes256_ecb(out, ivw, 1, ctx); - br_enc32be(ivw + AESCTR_NONCEBYTES, ++ctr); - out += AES_BLOCKBYTES; - outlen -= AES_BLOCKBYTES; - } - if (outlen > 0) { - aes256_ecb(buf, ivw, 1, ctx); - for (i = 0; i < outlen; i++) { - out[i] = buf[i]; - } - } -} - -/************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_aes256_prf -* -* Description: AES256 stream generation in CTR mode using 32-bit counter, -* nonce is zero-padded to 12 bytes, counter starts at zero -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: length of requested output in bytes -* - const uint8_t *key: pointer to 32-byte key -* - uint8_t nonce: 1-byte nonce (will be zero-padded to 12 bytes) -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t iv[12]; - for (int i = 1; i < 12; i++) { - iv[i] = 0; - } - iv[0] = nonce; - - aes256ctx ctx; - aes256_ctr_keyexp(&ctx, key); - aes256_ctr(output, outlen, iv, &ctx); - aes256_ctx_release(&ctx); -} - -/************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_aes256xof_absorb -* -* Description: AES256 CTR used as a replacement for a XOF; this function -* "absorbs" a 32-byte key and two additional bytes that are zero-padded -* to a 12-byte nonce -* -* Arguments: - aes256xof_ctx *s: pointer to state to "absorb" key and IV into -* - const uint8_t *key: pointer to 32-byte key -* - uint8_t x: first additional byte to "absorb" -* - uint8_t y: second additional byte to "absorb" -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t 
y) { - aes256_ecb_keyexp(&s->sk_exp, key); - for (int i = 2; i < 12; i++) { - s->iv[i] = 0; - } - s->iv[0] = x; - s->iv[1] = y; - s->ctr = 0; -} - -/************************************************* -* Name: PQCLEAN_KYBER76890S_CLEAN_aes256xof_squeezeblocks -* -* Description: AES256 CTR used as a replacement for a XOF; this function -* generates 4 blocks out AES256-CTR output -* -* Arguments: - uint8_t *out: pointer to output -* - size_t nblocks: number of reqested 64-byte output blocks -* - aes256xof_ctx *s: AES "state", i.e. expanded key and IV -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s) { - aes256_ctr_xof(out, nblocks * 64, s->iv, s->ctr, &s->sk_exp); - s->ctr += (uint32_t) (4 * nblocks); -} - -void PQCLEAN_KYBER76890S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { - aes256_ctx_release(&s->sk_exp); +void PQCLEAN_KYBER76890S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce) { + uint8_t expnonce[12] = {0}; + expnonce[0] = nonce; + PQCLEAN_KYBER76890S_CLEAN_aes256ctr_prf(out, outlen, key, expnonce); } diff --git a/crypto_kem/kyber768-90s/clean/symmetric-aes.h b/crypto_kem/kyber768-90s/clean/symmetric-aes.h deleted file mode 100644 index c2aa588b..00000000 --- a/crypto_kem/kyber768-90s/clean/symmetric-aes.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef PQCLEAN_KYBER76890S_CLEAN_SYMMETRIC_AES_H -#define PQCLEAN_KYBER76890S_CLEAN_SYMMETRIC_AES_H -#include "aes.h" -#include -#include - - -typedef struct { - aes256ctx sk_exp; - uint8_t iv[12]; - uint32_t ctr; -} aes256xof_ctx; - -void PQCLEAN_KYBER76890S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); -void PQCLEAN_KYBER76890S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y); -void PQCLEAN_KYBER76890S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s); -void 
PQCLEAN_KYBER76890S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s); - -#endif diff --git a/crypto_kem/kyber768-90s/clean/symmetric.h b/crypto_kem/kyber768-90s/clean/symmetric.h index 8965c639..188e0512 100644 --- a/crypto_kem/kyber768-90s/clean/symmetric.h +++ b/crypto_kem/kyber768-90s/clean/symmetric.h @@ -1,23 +1,28 @@ #ifndef PQCLEAN_KYBER76890S_CLEAN_SYMMETRIC_H #define PQCLEAN_KYBER76890S_CLEAN_SYMMETRIC_H +#include "aes256ctr.h" #include "params.h" #include "sha2.h" -#include "symmetric-aes.h" #include #include -typedef aes256xof_ctx xof_state; -#define XOF_BLOCKBYTES 64 +typedef aes256ctr_ctx xof_state; + +void PQCLEAN_KYBER76890S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y); + +void PQCLEAN_KYBER76890S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce); + +#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER76890S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER76890S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define xof_ctx_release(STATE) PQCLEAN_KYBER76890S_CLEAN_aes256xof_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER76890S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER76890S_CLEAN_kyber_aes256xof_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER76890S_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER76890S_CLEAN_kyber_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) diff --git a/crypto_kem/kyber768/META.yml b/crypto_kem/kyber768/META.yml index a4b60867..b1844668 100644 --- 
a/crypto_kem/kyber768/META.yml +++ b/crypto_kem/kyber768/META.yml @@ -6,7 +6,7 @@ length-public-key: 1184 length-ciphertext: 1088 length-secret-key: 2400 length-shared-secret: 32 -nistkat-sha256: d6dbb9399d1ba4ee2d986de3e54a461256b91d6c2f9b90ad2410cf41e09b64d1 +nistkat-sha256: 89e82a5bf2d4ddb2c6444e10409e6d9ca65dafbca67d1a0db2c9b54920a29172 principal-submitters: - Peter Schwabe auxiliary-submitters: @@ -21,9 +21,9 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber + version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/kyber768/avx2/align.h b/crypto_kem/kyber768/avx2/align.h index 7194d2b3..d5f54c16 100644 --- a/crypto_kem/kyber768/avx2/align.h +++ b/crypto_kem/kyber768/avx2/align.h @@ -2,22 +2,18 @@ #define PQCLEAN_KYBER768_AVX2_ALIGN_H #include +#include -#define ALIGN16_TYPE(t) \ - union { \ - __m128i vec; \ - t orig; \ +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[(N)]; \ + __m256i vec[((N)+31)/32]; \ } -#define ALIGN32_ARRAY(t, s) \ - union { \ - __m256i vec; \ - t arr[(s)]; \ +#define ALIGNED_INT16(N) \ + union { \ + int16_t coeffs[(N)]; \ + __m256i vec[((N)+15)/16]; \ } -#define ALIGN32_ARRAY_2D(t, n, m) \ - union { \ - __m256i vec; \ - t arr[(n)][(m)]; \ - } #endif diff --git a/crypto_kem/kyber768/avx2/basemul.S 
b/crypto_kem/kyber768/avx2/basemul.S index 5b630782..81ce520c 100644 --- a/crypto_kem/kyber768/avx2/basemul.S +++ b/crypto_kem/kyber768/avx2/basemul.S @@ -1,232 +1,107 @@ #include "cdecl.h" -#include "params.h" -.macro schoolbook off,sign -#load -vmovdqa \off+32(%rsi),%ymm7 # b -vmovdqa \off+32(%rdx),%ymm8 # d -vmovdqa \off(%rsi),%ymm9 # a -vmovdqa \off(%rdx),%ymm10 # c +.macro schoolbook off +vmovdqa _16XQINV*2(%rcx),%ymm0 +vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 +vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 +vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 +vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 -#mul -vpmullw %ymm7,%ymm8,%ymm11 # bd.lo -vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi -vpmullw %ymm7,%ymm10,%ymm13 # bc.lo -vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi -vpmullw %ymm9,%ymm8,%ymm14 # ad.lo -vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi -vpmullw %ymm9,%ymm10,%ymm15 # ac.lo -vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi +vpmullw %ymm0,%ymm1,%ymm9 # a0.lo +vpmullw %ymm0,%ymm2,%ymm10 # b0.lo +vpmullw %ymm0,%ymm3,%ymm11 # a1.lo +vpmullw %ymm0,%ymm4,%ymm12 # b1.lo -#reduce -vpmullw %ymm1,%ymm11,%ymm11 -vpmulhw %ymm0,%ymm11,%ymm11 -vpsubw %ymm11,%ymm12,%ymm11 # bd +vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 +vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 -#mul -vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo -vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi +vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi +vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi +vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi +vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi -#unpack -vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0 -vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1 -vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0 -vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1 -vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0 -vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1 -vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0 -vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1 +vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 +vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 -#add -.ifeq \sign -vpaddd %ymm14,%ymm15,%ymm14 # x0 -vpaddd %ymm9,%ymm10,%ymm9 # x1 -.else -vpsubd %ymm15,%ymm14,%ymm14 # x0 -vpsubd %ymm10,%ymm9,%ymm9 # x1 
-.endif -vpaddd %ymm12,%ymm13,%ymm12 # y0 -vpaddd %ymm7,%ymm8,%ymm7 # y1 -.endm +vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi +vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi +vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi +vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi -.macro red a0,a1,b0,b1,x,y,z -#pack -vpxor %ymm\x,%ymm\x,%ymm\x -vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z -vpsrld $16,%ymm\a0,%ymm\a0 -vpsrld $16,%ymm\a1,%ymm\a1 -vpackusdw %ymm\z,%ymm\y,%ymm\z -vpackusdw %ymm\a1,%ymm\a0,%ymm\a0 -vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y -vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x -vpsrld $16,%ymm\b0,%ymm\b0 -vpsrld $16,%ymm\b1,%ymm\b1 -vpackusdw %ymm\x,%ymm\y,%ymm\y -vpackusdw %ymm\b1,%ymm\b0,%ymm\b0 +vmovdqa %ymm13,(%rsp) -#reduce -vpmullw %ymm1,%ymm\z,%ymm\z -vpmullw %ymm1,%ymm\y,%ymm\y -vpmulhw %ymm0,%ymm\z,%ymm\z -vpmulhw %ymm0,%ymm\y,%ymm\y -vpsubw %ymm\z,%ymm\a0,%ymm\a0 -vpsubw %ymm\y,%ymm\b0,%ymm\b0 +vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo +vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo +vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo +vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo + +vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo +vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo +vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo +vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo + +vmovdqa _16XQ*2(%rcx),%ymm8 +vpmulhw %ymm8,%ymm13,%ymm13 +vpmulhw %ymm8,%ymm9,%ymm9 +vpmulhw %ymm8,%ymm5,%ymm5 +vpmulhw %ymm8,%ymm10,%ymm10 +vpmulhw %ymm8,%ymm6,%ymm6 +vpmulhw %ymm8,%ymm11,%ymm11 +vpmulhw %ymm8,%ymm7,%ymm7 +vpmulhw %ymm8,%ymm12,%ymm12 + +vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 +vpsubw %ymm9,%ymm1,%ymm9 # a0d0 +vpsubw %ymm5,%ymm14,%ymm5 # b0c0 +vpsubw %ymm10,%ymm2,%ymm10 # b0d0 + +vpsubw %ymm6,%ymm15,%ymm6 # a1c1 +vpsubw %ymm11,%ymm3,%ymm11 # a1d1 +vpsubw %ymm7,%ymm0,%ymm7 # b1c1 +vpsubw %ymm12,%ymm4,%ymm12 # b1d1 + +vmovdqa (%r9),%ymm0 +vmovdqa 32(%r9),%ymm1 +vpmullw %ymm0,%ymm10,%ymm2 +vpmullw %ymm0,%ymm12,%ymm3 +vpmulhw %ymm1,%ymm10,%ymm10 +vpmulhw %ymm1,%ymm12,%ymm12 +vpmulhw %ymm8,%ymm2,%ymm2 +vpmulhw %ymm8,%ymm3,%ymm3 +vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 
+vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 + +vpaddw %ymm5,%ymm9,%ymm9 +vpaddw %ymm7,%ymm11,%ymm11 +vpsubw %ymm13,%ymm10,%ymm13 +vpsubw %ymm12,%ymm6,%ymm6 + +vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(64*\off+16)*2(%rdi) +vmovdqa %ymm6,(64*\off+32)*2(%rdi) +vmovdqa %ymm11,(64*\off+48)*2(%rdi) .endm .text -basemul64_acc_avx: -poly0.0: -schoolbook 0,0 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.0: -schoolbook 512,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.0: -schoolbook 1024,0 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm5,32(%rdi) - -poly0.1: -schoolbook 64,1 - -#mov -vmovdqa %ymm14,%ymm3 -vmovdqa %ymm9,%ymm4 -vmovdqa %ymm12,%ymm5 -vmovdqa %ymm7,%ymm6 - -poly1.1: -schoolbook 576,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - -poly2.1: -schoolbook 1088,1 - -#add -vpaddd %ymm14,%ymm3,%ymm3 -vpaddd %ymm9,%ymm4,%ymm4 -vpaddd %ymm12,%ymm5,%ymm5 -vpaddd %ymm7,%ymm6,%ymm6 - - -#reduce -red 3,4,5,6,7,8,9 - -#store -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm5,96(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx) -.global _cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx) -cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx): -_cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 - -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_acc_avx - -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call 
basemul64_acc_avx - -ret - -basemul64_avx: -schoolbook 0,0 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,(%rdi) -vmovdqa %ymm12,32(%rdi) - -schoolbook 64,1 - -#reduce -red 14,9,12,7,8,10,11 - -#store -vmovdqa %ymm14,64(%rdi) -vmovdqa %ymm12,96(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx) .global _cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx) cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx): _cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx): -#consts -vmovdqa _16XQ*2(%rcx),%ymm0 -vmovdqa _16XQINV*2(%rcx),%ymm1 +mov %rsp,%r8 +and $-32,%rsp +sub $32,%rsp -vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 -call basemul64_avx +lea (_ZETAS_EXP+176)*2(%rcx),%r9 +schoolbook 0 -vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 1 -vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $192*2,%r9 +schoolbook 2 -vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 -add $128,%rdi -add $128,%rsi -add $128,%rdx -call basemul64_avx +add $32*2,%r9 +schoolbook 3 +mov %r8,%rsp ret diff --git a/crypto_kem/kyber768/avx2/cbd.c b/crypto_kem/kyber768/avx2/cbd.c index ea2d1926..38f26fc6 100644 --- a/crypto_kem/kyber768/avx2/cbd.c +++ b/crypto_kem/kyber768/avx2/cbd.c @@ -4,66 +4,64 @@ #include /************************************************* -* Name: PQCLEAN_KYBER768_AVX2_cbd +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *buf: pointer to input byte array +* Arguments: - poly *r: pointer to output polynomial +* - const __m256i *buf: pointer to aligned input byte array **************************************************/ -void PQCLEAN_KYBER768_AVX2_cbd(poly *restrict r, const uint8_t 
*restrict buf) { +static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) { unsigned int i; - __m256i vec0, vec1, vec2, vec3, tmp; + __m256i f0, f1, f2, f3; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); + const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F); for (i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); + f0 = _mm256_load_si256(&buf[i]); - vec1 = _mm256_srli_epi32(vec0, 1); - vec0 = _mm256_and_si256(mask55, vec0); - vec1 = _mm256_and_si256(mask55, vec1); - vec0 = _mm256_add_epi32(vec0, vec1); + f1 = _mm256_srli_epi16(f0, 1); + f0 = _mm256_and_si256(mask55, f0); + f1 = _mm256_and_si256(mask55, f1); + f0 = _mm256_add_epi8(f0, f1); - vec1 = _mm256_srli_epi32(vec0, 2); - vec0 = _mm256_and_si256(mask33, vec0); - vec1 = _mm256_and_si256(mask33, vec1); + f1 = _mm256_srli_epi16(f0, 2); + f0 = _mm256_and_si256(mask33, f0); + f1 = _mm256_and_si256(mask33, f1); + f0 = _mm256_add_epi8(f0, mask33); + f0 = _mm256_sub_epi8(f0, f1); - vec2 = _mm256_srli_epi32(vec0, 4); - vec3 = _mm256_srli_epi32(vec1, 4); - vec0 = _mm256_and_si256(mask03, vec0); - vec1 = _mm256_and_si256(mask03, vec1); - vec2 = _mm256_and_si256(mask03, vec2); - vec3 = _mm256_and_si256(mask03, vec3); + f1 = _mm256_srli_epi16(f0, 4); + f0 = _mm256_and_si256(mask0F, f0); + f1 = _mm256_and_si256(mask0F, f1); + f0 = _mm256_sub_epi8(f0, mask03); + f1 = _mm256_sub_epi8(f1, mask03); - vec1 = _mm256_sub_epi8(vec0, vec1); - vec3 = _mm256_sub_epi8(vec2, vec3); + f2 = _mm256_unpacklo_epi8(f0, f1); + f3 = _mm256_unpackhi_epi8(f0, f1); - vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1)); - vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1)); - vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3)); - vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1)); + f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); + f1 = 
_mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1)); + f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); + f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1)); - tmp = _mm256_unpacklo_epi16(vec0, vec2); - vec2 = _mm256_unpackhi_epi16(vec0, vec2); - vec0 = tmp; - tmp = _mm256_unpacklo_epi16(vec1, vec3); - vec3 = _mm256_unpackhi_epi16(vec1, vec3); - vec1 = tmp; - - tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20); - vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31); - vec0 = tmp; - tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20); - vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31); - vec1 = tmp; - - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1); - _mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3); + _mm256_store_si256(&r->vec[4 * i + 0], f0); + _mm256_store_si256(&r->vec[4 * i + 1], f2); + _mm256_store_si256(&r->vec[4 * i + 2], f1); + _mm256_store_si256(&r->vec[4 * i + 3], f3); } } + + +/* buf 32 bytes longer for cbd3 */ +void PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER768_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber768/avx2/cbd.h b/crypto_kem/kyber768/avx2/cbd.h index 2d254413..7bc3a156 100644 --- a/crypto_kem/kyber768/avx2/cbd.h +++ b/crypto_kem/kyber768/avx2/cbd.h @@ -2,8 +2,11 @@ #define PQCLEAN_KYBER768_AVX2_CBD_H #include "params.h" #include "poly.h" +#include #include -void PQCLEAN_KYBER768_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]); + +void PQCLEAN_KYBER768_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]); #endif diff --git a/crypto_kem/kyber768/avx2/cdecl.h 
b/crypto_kem/kyber768/avx2/cdecl.h index 3a6004a8..918aad3d 100644 --- a/crypto_kem/kyber768/avx2/cdecl.h +++ b/crypto_kem/kyber768/avx2/cdecl.h @@ -1,6 +1,8 @@ #ifndef PQCLEAN_KYBER768_AVX2_CDECL_H #define PQCLEAN_KYBER768_AVX2_CDECL_H + + #define _16XQ 0 #define _16XQINV 16 #define _16XV 32 @@ -9,9 +11,10 @@ #define _16XMONTSQLO 80 #define _16XMONTSQHI 96 #define _16XMASK 112 -#define _ZETAS_EXP 128 -#define _ZETAS_INV_EXP 528 - +#define _REVIDXB 128 +#define _REVIDXD 144 +#define _ZETAS_EXP 160 +#define _16XSHIFT 624 /* The C ABI on MacOS exports all symbols with a leading * underscore. This means that any symbols we refer to from @@ -23,4 +26,5 @@ #define _cdecl(s) _##s #define cdecl(s) s + #endif diff --git a/crypto_kem/kyber768/avx2/consts.c b/crypto_kem/kyber768/avx2/consts.c index 39201de5..63381ccc 100644 --- a/crypto_kem/kyber768/avx2/consts.c +++ b/crypto_kem/kyber768/avx2/consts.c @@ -1,155 +1,123 @@ +#include "align.h" #include "consts.h" #include "params.h" -#include + #define Q KYBER_Q -#define MONT ((1U << 16) % Q) -#define QINV 62209 // q^-1 mod 2^16 -#define V (((1U << 26) + Q/2)/Q) -#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) -#define FLO (FHI*QINV % 65536) -#define MONTSQHI (MONT*MONT % Q) -#define MONTSQLO (MONTSQHI*QINV % 65536) +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 +#define V 20159 // floor(2^26/q + 0.5) +#define FHI 1441 // mont^2/128 +#define FLO (-10079) // qinv*FHI +#define MONTSQHI 1353 // mont^2 +#define MONTSQLO 20553 // qinv*MONTSQHI #define MASK 4095 +#define SHIFT 32 - -const qdata_t PQCLEAN_KYBER768_AVX2_qdata = {.as_arr = { -#define _16XQ 0 +const qdata_t PQCLEAN_KYBER768_AVX2_qdata = {.coeffs = { +//#define _16XQ 0 Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, -#define _16XQINV 16 +//#define _16XQINV 16 QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, -#define _16XV 32 +//#define _16XV 32 V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, 
-#define _16XFLO 48 +//#define _16XFLO 48 FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, -#define _16XFHI 64 +//#define _16XFHI 64 FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, -#define _16XMONTSQLO 80 +//#define _16XMONTSQLO 80 MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, -#define _16XMONTSQHI 96 +//#define _16XMONTSQHI 96 MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, -#define _16XMASK 112 +//#define _16XMASK 112 MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, -#define _ZETAS_EXP 128 - 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, - 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, - 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, - 3158, 3158, 3158, 3158, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, - 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, - 573, 573, 2004, 2004, 264, 264, 383, 383, - 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, - 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, - 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, - 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, - 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, - 2226, 555, 2078, 1550, 422, 177, 3038, 1574, - 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, - 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, - 
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, - 430, 843, 871, 105, 587, 3094, 2869, 1653, - 778, 3182, 1483, 1119, 644, 349, 329, 3254, - 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, - 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, - 48842, 48842, 48842, 48842, 287, 287, 287, 287, - 287, 287, 287, 287, 202, 202, 202, 202, - 202, 202, 202, 202, 10690, 10690, 10690, 10690, - 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, - 31164, 31164, 31164, 31164, 962, 962, 962, 962, - 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, - 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, - 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, - 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, - 732, 732, 608, 608, 1787, 1787, 411, 411, - 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, - 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, - 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, - 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, - 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, - 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, - 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, - 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, - 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, - 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, - 3193, 1994, 220, 1670, 1799, 794, 2475, 478, - 3021, 991, 1869, 1628, 0, 0, 0, 0, +//#define _REVIDXB 128 + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, + 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, -#define _ZETAS_INV_EXP 528 - 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, - 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, - 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, - 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, - 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, - 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, - 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, - 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, - 
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, - 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, - 951, 247, 1421, 3222, 2499, 271, 90, 853, - 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, - 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, - 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, - 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, - 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, - 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, - 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, - 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, - 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, - 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, - 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, - 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, - 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, - 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, - 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, - 2210, 1846, 147, 2551, 1676, 460, 235, 2742, - 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, - 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, - 45043, 32227, 11478, 335, 156, 2911, 872, 1590, - 602, 777, 2170, 246, 1755, 291, 3152, 2907, - 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, - 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, - 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, - 666, 320, 8, 2813, 1544, 282, 1838, 1293, - 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, - 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, - 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, - 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, - 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, - 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, - 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, - 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, - 171, 171, 171, 171, 12403, 12403, 12403, 12403, - 12403, 12403, 12403, 12403, 52012, 52012, 52012, 
52012, - 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, - 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, - 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, - 60300, 60300, 1932, 1932, 0, 0, 0, 0 +//#define _REVIDXD 144 + 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, + +//#define _ZETAS_EXP 160 + 31498, 31498, 31498, 31498, -758, -758, -758, -758, + 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, + -359, -359, -359, -359, -359, -359, -359, -359, + -359, -359, -359, -359, -359, -359, -359, -359, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + -12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + -20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, + -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, + -171, -171, -171, -171, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, + 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, + 573, 573, -1325, -1325, 264, 264, 383, 383, + -829, -829, 1458, 1458, -1602, -1602, -130, -130, + -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, + -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, + 1223, 652, -552, 1015, -1293, 1491, -282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, + 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, + -1103, 555, -1251, 1550, 422, 177, -291, 1574, + -246, 1159, -777, -602, -1590, -872, 418, -156, + 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, + -32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, + 430, 843, 871, 105, 587, -235, -460, 1653, + 778, -147, 1483, 1119, 644, 349, 329, -75, + 787, 787, 787, 787, 787, 787, 787, 787, + 787, 787, 787, 787, 787, 787, 
787, 787, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, + -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, + -11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, + 962, 962, 962, 962, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, + -28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, + 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, + -681, -681, 1017, 1017, 732, 732, 608, 608, + -1542, -1542, 411, 411, -205, -205, -1571, -1571, + 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, + 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, + 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, + 817, 603, 1322, -1465, -1215, 1218, -874, -1187, + -1185, -1278, -1510, -870, -108, 996, 958, 1522, + 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, + -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, + 1097, 610, -1285, 384, -136, -1335, 220, -1659, + -1530, 794, -854, 478, -308, 991, -1460, 1628, + +//#define _16XSHIFT 624 + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, + SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT } }; diff --git a/crypto_kem/kyber768/avx2/consts.h b/crypto_kem/kyber768/avx2/consts.h index c0d5093f..03e1cc95 100644 --- a/crypto_kem/kyber768/avx2/consts.h +++ b/crypto_kem/kyber768/avx2/consts.h @@ -1,19 +1,10 @@ #ifndef PQCLEAN_KYBER768_AVX2_CONSTS_H #define PQCLEAN_KYBER768_AVX2_CONSTS_H +#include "align.h" #include "cdecl.h" -#include "params.h" -#include -#include -#define ALIGNED_UINT16_T(N) \ - union { \ - 
__m256i as_vec; \ - uint16_t as_arr[(N)]; \ - } - -typedef ALIGNED_UINT16_T(928) qdata_t; - +typedef ALIGNED_INT16(640) qdata_t; extern const qdata_t PQCLEAN_KYBER768_AVX2_qdata; #endif diff --git a/crypto_kem/kyber768/avx2/fips202x4.c b/crypto_kem/kyber768/avx2/fips202x4.c index 7e7631e3..84e0e64c 100644 --- a/crypto_kem/kyber768/avx2/fips202x4.c +++ b/crypto_kem/kyber768/avx2/fips202x4.c @@ -9,22 +9,14 @@ #define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds extern void KeccakF1600_StatePermute4x(__m256i *s); -static inline void store64(uint8_t x[8], uint64_t u) { - unsigned int i; - - for (i = 0; i < 8; i++) { - x[i] = u >> 8 * i; - } -} - -static void keccakx4_absorb(__m256i s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, - size_t inlen, - uint8_t p) { +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) { size_t i, pos = 0; __m256i t, idx; @@ -39,20 +31,17 @@ static void keccakx4_absorb(__m256i s[25], s[i] = _mm256_xor_si256(s[i], t); pos += 8; } + inlen -= r; KeccakF1600_StatePermute4x(s); - inlen -= r; } - i = 0; - while (inlen >= 8) { + for (i = 0; i < inlen / 8; ++i) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); s[i] = _mm256_xor_si256(s[i], t); - - i++; pos += 8; - inlen -= 8; } + inlen -= 8 * i; if (inlen) { t = _mm256_i64gather_epi64((long long *)pos, idx, 1); @@ -75,37 +64,34 @@ static void keccakx4_squeezeblocks(uint8_t *out0, unsigned int r, __m256i s[25]) { unsigned int i; - uint64_t f0, f1, f2, f3; + __m128d t; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); for (i = 0; i < r / 8; ++i) { - f0 = _mm256_extract_epi64(s[i], 0); - f1 = _mm256_extract_epi64(s[i], 1); - f2 = _mm256_extract_epi64(s[i], 2); - f3 = _mm256_extract_epi64(s[i], 3); - store64(out0, f0); - store64(out1, f1); - store64(out2, f2); - store64(out3, f3); - - 
out0 += 8; - out1 += 8; - out2 += 8; - out3 += 8; + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + _mm_storel_pd((double *)&out0[8 * i], t); + _mm_storeh_pd((double *)&out1[8 * i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); + _mm_storel_pd((double *)&out2[8 * i], t); + _mm_storeh_pd((double *)&out3[8 * i], t); } + out0 += r; + out1 += r; + out2 += r; + out3 += r; --nblocks; } } -void PQCLEAN_KYBER768_AVX2_shake128x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER768_AVX2_shake128x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { - keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); + keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, @@ -114,17 +100,16 @@ void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out3, size_t nblocks, keccakx4_state *state) { - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, - state->s); + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); } -void PQCLEAN_KYBER768_AVX2_shake256x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER768_AVX2_shake256x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { - keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); + keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(uint8_t *out0, @@ -133,8 +118,7 @@ void PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out3, size_t nblocks, keccakx4_state *state) { - keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, - state->s); + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); } void 
PQCLEAN_KYBER768_AVX2_shake128x4(uint8_t *out0, @@ -152,7 +136,7 @@ void PQCLEAN_KYBER768_AVX2_shake128x4(uint8_t *out0, uint8_t t[4][SHAKE128_RATE]; keccakx4_state state; - PQCLEAN_KYBER768_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER768_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE128_RATE; @@ -187,7 +171,7 @@ void PQCLEAN_KYBER768_AVX2_shake256x4(uint8_t *out0, uint8_t t[4][SHAKE256_RATE]; keccakx4_state state; - PQCLEAN_KYBER768_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER768_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); out0 += nblocks * SHAKE256_RATE; diff --git a/crypto_kem/kyber768/avx2/fips202x4.h b/crypto_kem/kyber768/avx2/fips202x4.h index e65e7540..3ef654d7 100644 --- a/crypto_kem/kyber768/avx2/fips202x4.h +++ b/crypto_kem/kyber768/avx2/fips202x4.h @@ -9,7 +9,7 @@ typedef struct { __m256i s[25]; } keccakx4_state; -void PQCLEAN_KYBER768_AVX2_shake128x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER768_AVX2_shake128x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, @@ -23,7 +23,7 @@ void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, size_t nblocks, keccakx4_state *state); -void PQCLEAN_KYBER768_AVX2_shake256x4_absorb(keccakx4_state *state, +void PQCLEAN_KYBER768_AVX2_shake256x4_absorb_once(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, diff --git a/crypto_kem/kyber768/avx2/fq.S b/crypto_kem/kyber768/avx2/fq.S index 1f50c56b..90ddcc2a 100644 --- a/crypto_kem/kyber768/avx2/fq.S +++ b/crypto_kem/kyber768/avx2/fq.S @@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2,10 -red16 3,11 -red16 4,12 -red16 5,13 -red16 6,14 
-red16 7,15 -red16 8,10 -red16 9,11 +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 #store vmovdqa %ymm2,(%rdi) @@ -46,49 +46,6 @@ add $256,%rdi call reduce128_avx ret -csubq128_avx: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1,9 -csubq 2,10 -csubq 3,11 -csubq 4,12 -csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global cdecl(PQCLEAN_KYBER768_AVX2_csubq_avx) -.global _cdecl(PQCLEAN_KYBER768_AVX2_csubq_avx) -cdecl(PQCLEAN_KYBER768_AVX2_csubq_avx): -_cdecl(PQCLEAN_KYBER768_AVX2_csubq_avx): -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -call csubq128_avx -add $256,%rdi -call csubq128_avx -ret - tomont128_avx: #load vmovdqa (%rdi),%ymm3 diff --git a/crypto_kem/kyber768/avx2/fq.inc b/crypto_kem/kyber768/avx2/fq.inc index 75df098a..4b7afc31 100644 --- a/crypto_kem/kyber768/avx2/fq.inc +++ b/crypto_kem/kyber768/avx2/fq.inc @@ -1,6 +1,10 @@ -.macro red16 r,x=12 +.macro red16 r,rs=0,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x +.if \rs +vpmulhrsw %ymm\rs,%ymm\x,%ymm\x +.else vpsraw $10,%ymm\x,%ymm\x +.endif vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm @@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r -#vpcmpgtw %ymm0,%ymm\r,%ymm\x -#vpand %ymm0,%ymm\x,%ymm\x -#vpsubw %ymm\x,%ymm\r,%ymm\r .endm .macro caddq r,x=12 diff --git a/crypto_kem/kyber768/avx2/indcpa.c b/crypto_kem/kyber768/avx2/indcpa.c index 9ac7dd64..160e1c94 100644 --- a/crypto_kem/kyber768/avx2/indcpa.c +++ b/crypto_kem/kyber768/avx2/indcpa.c @@ -8,6 +8,7 @@ #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include #include 
#include @@ -15,11 +16,14 @@ * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* serialized vector of polynomials pk -* and the public seed used to generate the matrix A. +* serialized vector of polynomials pk and the +* public seed used to generate the matrix A. +* The polynomial coefficients in pk are assumed to +* lie in the invertal [0,q], i.e. pk must be reduced +* by PQCLEAN_KYBER768_AVX2_polyvec_reduce(). * -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk, /************************************************* * Name: pack_sk * -* Description: Serialize the secret key +* Description: Serialize the secret key. +* The polynomial coefficients in sk are assumed to +* lie in the invertal [0,q], i.e. sk must be reduced +* by PQCLEAN_KYBER768_AVX2_polyvec_reduce(). 
* -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials -* (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER768_AVX2_polyvec_frombytes(sk, packedsk); } @@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk, * * Description: Serialize the ciphertext as concatenation of the * compressed and serialized vector of polynomials b -* and the compressed and serialized polynomial v +* and the compressed and serialized polynomial v. +* The polynomial coefficients in b and v are assumed to +* lie in the invertal [0,q], i.e. b and v must be reduced +* by PQCLEAN_KYBER768_AVX2_polyvec_reduce() and PQCLEAN_KYBER768_AVX2_poly_reduce(), respectively. 
* * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER768_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER768_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER768_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER768_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output array +* - unsigned int len: requested number of 16-bit integers 
(uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -165,61 +169,54 @@ static unsigned int rej_uniform(int16_t *r, * - const uint8_t *seed: pointer to input seed * - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) void PQCLEAN_KYBER768_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { unsigned int ctr0, ctr1, ctr2, ctr3; - ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; + ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * SHAKE128_RATE) buf[4]; __m256i f; keccakx4_state state; xof_state state1x; - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, 
f); + _mm256_store_si256(buf[3].vec, f); if (transposed) { - buf.arr[0][KYBER_SYMBYTES + 0] = 0; - buf.arr[0][KYBER_SYMBYTES + 1] = 0; - buf.arr[1][KYBER_SYMBYTES + 0] = 0; - buf.arr[1][KYBER_SYMBYTES + 1] = 1; - buf.arr[2][KYBER_SYMBYTES + 0] = 0; - buf.arr[2][KYBER_SYMBYTES + 1] = 2; - buf.arr[3][KYBER_SYMBYTES + 0] = 1; - buf.arr[3][KYBER_SYMBYTES + 1] = 0; + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 0; + buf[1].coeffs[33] = 1; + buf[2].coeffs[32] = 0; + buf[2].coeffs[33] = 2; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 0; } else { - buf.arr[0][KYBER_SYMBYTES + 0] = 0; - buf.arr[0][KYBER_SYMBYTES + 1] = 0; - buf.arr[1][KYBER_SYMBYTES + 0] = 1; - buf.arr[1][KYBER_SYMBYTES + 1] = 0; - buf.arr[2][KYBER_SYMBYTES + 0] = 2; - buf.arr[2][KYBER_SYMBYTES + 1] = 0; - buf.arr[3][KYBER_SYMBYTES + 0] = 0; - buf.arr[3][KYBER_SYMBYTES + 1] = 1; + buf[0].coeffs[32] = 0; + buf[0].coeffs[33] = 0; + buf[1].coeffs[32] = 1; + buf[1].coeffs[33] = 0; + buf[2].coeffs[32] = 2; + buf[2].coeffs[33] = 0; + buf[3].coeffs[32] = 0; + buf[3].coeffs[33] = 1; } - PQCLEAN_KYBER768_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], GEN_MATRIX_NBLOCKS, - &state); + PQCLEAN_KYBER768_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); - ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[0].coeffs, buf.arr[0]); - ctr1 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[1].coeffs, buf.arr[1]); - ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[2].coeffs, buf.arr[2]); - ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[0].coeffs, buf.arr[3]); + ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[0].coeffs, buf[0].coeffs); + ctr1 = 
PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[1].coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[2].coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[0].coeffs, buf[3].coeffs); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], - XOF_BLOCKBYTES); - ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], - XOF_BLOCKBYTES); - ctr2 += rej_uniform(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], - XOF_BLOCKBYTES); - ctr3 += rej_uniform(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], - XOF_BLOCKBYTES); + ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE); } PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[0].vec[0]); @@ -227,52 +224,47 @@ void PQCLEAN_KYBER768_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int tr PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[0].vec[2]); PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[1].vec[0]); - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, 
f); if (transposed) { - buf.arr[0][KYBER_SYMBYTES + 0] = 1; - buf.arr[0][KYBER_SYMBYTES + 1] = 1; - buf.arr[1][KYBER_SYMBYTES + 0] = 1; - buf.arr[1][KYBER_SYMBYTES + 1] = 2; - buf.arr[2][KYBER_SYMBYTES + 0] = 2; - buf.arr[2][KYBER_SYMBYTES + 1] = 0; - buf.arr[3][KYBER_SYMBYTES + 0] = 2; - buf.arr[3][KYBER_SYMBYTES + 1] = 1; + buf[0].coeffs[32] = 1; + buf[0].coeffs[33] = 1; + buf[1].coeffs[32] = 1; + buf[1].coeffs[33] = 2; + buf[2].coeffs[32] = 2; + buf[2].coeffs[33] = 0; + buf[3].coeffs[32] = 2; + buf[3].coeffs[33] = 1; } else { - buf.arr[0][KYBER_SYMBYTES + 0] = 1; - buf.arr[0][KYBER_SYMBYTES + 1] = 1; - buf.arr[1][KYBER_SYMBYTES + 0] = 2; - buf.arr[1][KYBER_SYMBYTES + 1] = 1; - buf.arr[2][KYBER_SYMBYTES + 0] = 0; - buf.arr[2][KYBER_SYMBYTES + 1] = 2; - buf.arr[3][KYBER_SYMBYTES + 0] = 1; - buf.arr[3][KYBER_SYMBYTES + 1] = 2; + buf[0].coeffs[32] = 1; + buf[0].coeffs[33] = 1; + buf[1].coeffs[32] = 2; + buf[1].coeffs[33] = 1; + buf[2].coeffs[32] = 0; + buf[2].coeffs[33] = 2; + buf[3].coeffs[32] = 1; + buf[3].coeffs[33] = 2; } - PQCLEAN_KYBER768_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], GEN_MATRIX_NBLOCKS, - &state); + PQCLEAN_KYBER768_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state); - ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[1].coeffs, buf.arr[0]); - ctr1 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[2].coeffs, buf.arr[1]); - ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[0].coeffs, buf.arr[2]); - ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[1].coeffs, buf.arr[3]); + ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[1].coeffs, buf[0].coeffs); + ctr1 = 
PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[2].coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[0].coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[1].coeffs, buf[3].coeffs); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_uniform(a[1].vec[1].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], - XOF_BLOCKBYTES); - ctr1 += rej_uniform(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], - XOF_BLOCKBYTES); - ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], - XOF_BLOCKBYTES); - ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], - XOF_BLOCKBYTES); + ctr0 += rej_uniform(a[1].vec[1].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE); } PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[1].vec[1]); @@ -280,20 +272,18 @@ void PQCLEAN_KYBER768_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int tr PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[2].vec[0]); PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[2].vec[1]); - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - buf.arr[0][KYBER_SYMBYTES + 0] = 2; - buf.arr[0][KYBER_SYMBYTES + 1] = 2; - shake128_absorb(&state1x, buf.arr[0], KYBER_SYMBYTES + 2); - shake128_squeezeblocks(buf.arr[0], GEN_MATRIX_NBLOCKS, &state1x); - ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[2].coeffs, buf.arr[0]); + f = _mm256_loadu_si256((__m256i *)seed); + 
_mm256_store_si256(buf[0].vec, f); + buf[0].coeffs[32] = 2; + buf[0].coeffs[33] = 2; + shake128_absorb(&state1x, buf[0].coeffs, 34); + shake128_squeezeblocks(buf[0].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state1x); + ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[2].coeffs, buf[0].coeffs); while (ctr0 < KYBER_N) { - shake128_squeezeblocks(buf.arr[0], 1, &state1x); - ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], - XOF_BLOCKBYTES); + shake128_squeezeblocks(buf[0].coeffs, 1, &state1x); + ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE); } - shake128_ctx_release(&state1x); - + xof_ctx_release(&state1x); PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[2].vec[2]); } @@ -311,27 +301,26 @@ void PQCLEAN_KYBER768_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int tr void PQCLEAN_KYBER768_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; - const uint8_t *publicseed = buf.arr; - const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + uint8_t buf[2 * KYBER_SYMBYTES]; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf.arr, KYBER_SYMBYTES); - hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); + hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, e.vec + 0, noiseseed, - 0, 1, 2, 3); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(e.vec + 1, e.vec + 2, pkpv.vec + 0, pkpv.vec + 1, noiseseed, - 4, 5, 6, 7); + PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, e.vec + 0, noiseseed, 0, 1, 2, 3); + PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1_4x(e.vec + 1, e.vec + 2, pkpv.vec + 0, pkpv.vec + 1, noiseseed, 4, 5, 6, 7); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&skpv); + 
PQCLEAN_KYBER768_AVX2_polyvec_reduce(&skpv); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&e); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER768_AVX2_poly_tomont(&pkpv.vec[i]); } @@ -348,54 +337,50 @@ void PQCLEAN_KYBER768_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER768_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; - ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + uint8_t seed[KYBER_SYMBYTES]; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; - unpack_pk(&pkpv, seed.arr, pk); + unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER768_AVX2_poly_frommsg(&k, m); - gen_at(at, 
seed.arr); + gen_at(at, seed); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, ep.vec + 0, coins, - 0, 1, 2, 3); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(ep.vec + 1, ep.vec + 2, &epp, bp.vec + 0, coins, - 4, 5, 6, 7); + PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, ep.vec + 0, coins, 0, 1, 2, 3); + PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1_4x(ep.vec + 1, ep.vec + 2, &epp, b.vec + 0, coins, 4, 5, 6, 7); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } + PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - - PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(&b); PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(&v); - PQCLEAN_KYBER768_AVX2_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER768_AVX2_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER768_AVX2_poly_add(&v, &v, &epp); PQCLEAN_KYBER768_AVX2_poly_add(&v, &v, &k); - PQCLEAN_KYBER768_AVX2_polyvec_reduce(&bp); + PQCLEAN_KYBER768_AVX2_polyvec_reduce(&b); PQCLEAN_KYBER768_AVX2_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -404,24 +389,24 @@ void PQCLEAN_KYBER768_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER768_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER768_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER768_AVX2_polyvec_ntt(&b); + PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER768_AVX2_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber768/avx2/invntt.S b/crypto_kem/kyber768/avx2/invntt.S index ea0260f1..2e5d7b77 100644 --- a/crypto_kem/kyber768/avx2/invntt.S +++ b/crypto_kem/kyber768/avx2/invntt.S @@ -2,22 +2,21 @@ .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 +vpsubw %ymm\rl0,%ymm\rh0,%ymm12 vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl1,%ymm\rh1,%ymm13 + vpmullw %ymm\zl0,%ymm12,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\rl2,%ymm\rh2,%ymm14 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 
+vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpsubw %ymm\rl3,%ymm\rh3,%ymm15 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 vpmullw %ymm\zl1,%ymm15,%ymm\rh3 vpmulhw %ymm\zh0,%ymm12,%ymm12 @@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13 vpmulhw %ymm\zh1,%ymm14,%ymm14 vpmulhw %ymm\zh1,%ymm15,%ymm15 -#reduce vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 + vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 + +# + +# + vpsubw %ymm\rh0,%ymm12,%ymm\rh0 + vpsubw %ymm\rh1,%ymm13,%ymm\rh1 + vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.text -invntt_levels0t5_avx: -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 +.macro intt_levels0t5 off +/* level 0 */ +vmovdqa _16XFLO*2(%rsi),%ymm2 +vmovdqa _16XFHI*2(%rsi),%ymm3 -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 -butterfly 4,5,8,9,6,7,10,11,15,3,1,2 +fqmulprecomp 2,3,4 +fqmulprecomp 2,3,6 +fqmulprecomp 2,3,5 +fqmulprecomp 2,3,7 -level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 -butterfly 4,5,6,7,8,9,10,11,3,3,2,2 +fqmulprecomp 2,3,8 +fqmulprecomp 2,3,10 +fqmulprecomp 2,3,9 +fqmulprecomp 2,3,11 + +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm12 +vpshufb %ymm12,%ymm15,%ymm15 
+vpshufb %ymm12,%ymm1,%ymm1 +vpshufb %ymm12,%ymm2,%ymm2 +vpshufb %ymm12,%ymm3,%ymm3 + +butterfly 4,5,8,9,6,7,10,11,15,1,2,3 + +/* level 1 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 +vmovdqa _REVIDXB*2(%rsi),%ymm1 +vpshufb %ymm1,%ymm2,%ymm2 +vpshufb %ymm1,%ymm3,%ymm3 + +butterfly 4,5,6,7,8,9,10,11,2,2,3,3 shuffle1 4,5,3,5 shuffle1 6,7,4,7 shuffle1 8,9,6,9 shuffle1 10,11,8,11 -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 +/* level 2 */ +vmovdqa _REVIDXD*2(%rsi),%ymm12 +vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 +vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 -#consts -vmovdqa _16XV*2(%rdx),%ymm1 - -butterfly 3,4,6,8,5,7,9,11,10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,2,2,10,10 +vmovdqa _16XV*2(%rsi),%ymm1 red16 3 shuffle2 3,4,10,4 @@ -87,26 +110,22 @@ shuffle2 6,8,3,8 shuffle2 5,7,6,7 shuffle2 9,11,5,11 -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 +/* level 3 */ +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 +vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 -butterfly 10,3,6,5,4,8,7,11,9,9,2,2 - -red16 10 +butterfly 10,3,6,5,4,8,7,11,2,2,9,9 shuffle4 10,3,9,3 shuffle4 6,5,10,5 shuffle4 4,8,6,8 shuffle4 7,11,4,11 -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 +/* level 4 */ +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 +vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 -butterfly 9,10,6,4,3,5,8,11,7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,2,2,7,7 red16 9 @@ -115,113 +134,62 @@ shuffle8 6,4,9,4 shuffle8 3,5,6,5 shuffle8 8,11,3,11 -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 +/* level5 */ +vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 +vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 -butterfly 7,9,6,3,10,4,5,11,8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,2,2,8,8 -red16 7 +vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) 
+vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) +.endm -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) +.macro intt_level6 off +/* level 6 */ +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 -ret - -invntt_level6_avx: -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 butterfly 4,5,6,7,8,9,10,11 -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 +.if \off == 0 +red16 4 +.endif -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa _16XFLO*2(%rdx),%ymm12 -vmovdqa _16XFHI*2(%rdx),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa 
%ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4,8 -fqmulprecomp 12,13,5,9 -fqmulprecomp 12,13,6,10 -fqmulprecomp 12,13,7,11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret +vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm +.text .global cdecl(PQCLEAN_KYBER768_AVX2_invntt_avx) .global _cdecl(PQCLEAN_KYBER768_AVX2_invntt_avx) cdecl(PQCLEAN_KYBER768_AVX2_invntt_avx): _cdecl(PQCLEAN_KYBER768_AVX2_invntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_INV_EXP*2,%rsi -call invntt_levels0t5_avx -add $256,%rdi -add $392,%rsi -call invntt_levels0t5_avx -sub $256,%rdi -add $392,%rsi -call invntt_level6_avx + +intt_levels0t5 0 +intt_levels0t5 1 + +intt_level6 0 +intt_level6 1 ret diff --git a/crypto_kem/kyber768/avx2/kem.c b/crypto_kem/kyber768/avx2/kem.c index cc8d6b6b..304b30c4 100644 --- a/crypto_kem/kyber768/avx2/kem.c +++ b/crypto_kem/kyber768/avx2/kem.c @@ -1,4 +1,3 @@ -#include "align.h" #include "indcpa.h" #include "kem.h" #include "params.h" @@ -15,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int PQCLEAN_KYBER768_AVX2_crypto_kem_keypair(unsigned char 
pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER768_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -40,36 +40,36 @@ int PQCLEAN_KYBER768_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *s * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_AVX2_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; +int PQCLEAN_KYBER768_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t kr[2 * KYBER_SYMBYTES]; - randombytes(buf.arr, KYBER_SYMBYTES); + randombytes(buf, KYBER_SYMBYTES); /* Don't release system RNG output */ - hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); + hash_h(buf, buf, KYBER_SYMBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER768_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + 
PQCLEAN_KYBER768_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } @@ -80,47 +80,47 @@ int PQCLEAN_KYBER768_AVX2_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER768_AVX2_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER768_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; + uint8_t kr[2 * KYBER_SYMBYTES]; + ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER768_AVX2_indcpa_dec(buf.arr, ct, sk); + PQCLEAN_KYBER768_AVX2_indcpa_dec(buf, ct, sk); /* Multitarget countermeasure for coins + contributory KEM */ for (i = 0; i < KYBER_SYMBYTES; i++) { - buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ - PQCLEAN_KYBER768_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); + PQCLEAN_KYBER768_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES); - fail = PQCLEAN_KYBER768_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); + fail = PQCLEAN_KYBER768_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* Overwrite pre-k with z on re-encryption failure */ - PQCLEAN_KYBER768_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail); + PQCLEAN_KYBER768_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* hash concatenation of pre-k and H(c) to k */ - kdf(ss, kr.arr, 2 * 
KYBER_SYMBYTES); + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber768/avx2/ntt.S b/crypto_kem/kyber768/avx2/ntt.S index 2665ab59..cdf6b7b4 100644 --- a/crypto_kem/kyber768/avx2/ntt.S +++ b/crypto_kem/kyber768/avx2/ntt.S @@ -1,222 +1,191 @@ #include "cdecl.h" .include "shuffle.inc" -.include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 -#mul +.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 + vpmullw %ymm\zl1,%ymm\rh2,%ymm14 vpmullw %ymm\zl1,%ymm\rh3,%ymm15 + vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 + vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 .endm -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce +.macro reduce vpmulhw %ymm0,%ymm12,%ymm12 vpmulhw %ymm0,%ymm13,%ymm13 + vpmulhw %ymm0,%ymm14,%ymm14 vpmulhw %ymm0,%ymm15,%ymm15 +.endm -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 +.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln +vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 -#update +vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 +vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 +vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 + +vpsubw %ymm12,%ymm\rln,%ymm\rln vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl0,%ymm\rl0 + vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl1,%ymm\rl1 vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 + +vpsubw %ymm15,%ymm\rl2,%ymm\rl2 vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.macro level0 off +vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 +vmovdqa (64*\off+128)*2(%rdi),%ymm8 +vmovdqa (64*\off+144)*2(%rdi),%ymm9 +vmovdqa (64*\off+160)*2(%rdi),%ymm10 +vmovdqa (64*\off+176)*2(%rdi),%ymm11 +vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (64*\off+ 
16)*2(%rdi),%ymm5 +vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) +vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) +vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) +vmovdqa %ymm8,(64*\off+128)*2(%rdi) +vmovdqa %ymm9,(64*\off+144)*2(%rdi) +vmovdqa %ymm10,(64*\off+160)*2(%rdi) +vmovdqa %ymm11,(64*\off+176)*2(%rdi) +.endm + +.macro levels1t6 off +/* level 1 */ +vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 +vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 +vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 +vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 +vmovdqa (128*\off+112)*2(%rdi),%ymm11 +vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 + +mul 8,9,10,11 + +vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 +vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 +vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 +vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 + +reduce +update 3,4,5,6,7,8,9,10,11 + +/* level 2 */ +shuffle8 5,10,7,10 +shuffle8 6,11,5,11 + +vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 + +mul 7,10,5,11 + +shuffle8 3,8,6,8 +shuffle8 4,9,3,9 + +reduce +update 4,6,8,3,9,7,10,5,11 + +/* level 3 */ +shuffle4 8,5,9,5 +shuffle4 3,11,8,11 + +vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 + +mul 9,5,8,11 + +shuffle4 4,7,3,7 +shuffle4 6,10,4,10 + +reduce +update 6,3,7,4,10,9,5,8,11 + +/* level 4 */ +shuffle2 7,8,10,8 +shuffle2 4,11,7,11 + +vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 + +mul 10,8,7,11 + +shuffle2 6,9,4,9 +shuffle2 3,5,6,5 + +reduce +update 3,4,9,6,5,10,8,7,11 + +/* level 5 */ +shuffle1 9,7,5,7 +shuffle1 6,11,9,11 + +vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 + +mul 5,7,9,11 + +shuffle1 3,10,6,10 +shuffle1 4,8,3,8 + +reduce +update 4,6,10,3,8,5,7,9,11 + +/* level 6 */ +vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 
+vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 +vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 +vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 + +mul 10,3,9,11,14,15,8,2 + +reduce +update 8,4,6,5,7,10,3,9,11 + +vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) +vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) +vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) +vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) +vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) +vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) +vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) +vmovdqa %ymm11,(128*\off+112)*2(%rdi) .endm .text -ntt_level0_avx: -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -ntt_levels1t6_avx: -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11,3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11,7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11,9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11,10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 
-vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11,6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 - -vmovdqa _16XV*2(%rdx),%ymm1 -red16 10,12 -red16 5,13 -red16 9,14 -red16 4,15 -red16 8,2 -red16 3,6 -red16 7,12 -red16 11,13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - .global cdecl(PQCLEAN_KYBER768_AVX2_ntt_avx) .global _cdecl(PQCLEAN_KYBER768_AVX2_ntt_avx) cdecl(PQCLEAN_KYBER768_AVX2_ntt_avx): _cdecl(PQCLEAN_KYBER768_AVX2_ntt_avx): -#consts vmovdqa _16XQ*2(%rsi),%ymm0 -mov %rsi,%rdx -add $_ZETAS_EXP*2,%rsi -call ntt_level0_avx -add $128,%rdi -call ntt_level0_avx -sub $128,%rdi -add $8,%rsi -call ntt_levels1t6_avx -add $256,%rdi -add $392,%rsi -call ntt_levels1t6_avx + +level0 0 +level0 1 + +levels1t6 0 +levels1t6 1 + ret diff --git a/crypto_kem/kyber768/avx2/ntt.h b/crypto_kem/kyber768/avx2/ntt.h index c0bfaa8f..58d25ecc 100644 --- a/crypto_kem/kyber768/avx2/ntt.h +++ b/crypto_kem/kyber768/avx2/ntt.h @@ -1,24 +1,21 @@ #ifndef PQCLEAN_KYBER768_AVX2_NTT_H #define PQCLEAN_KYBER768_AVX2_NTT_H -#include "consts.h" + +#include #include -void PQCLEAN_KYBER768_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); 
+void PQCLEAN_KYBER768_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_basemul_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_basemul_acc_avx(int16_t *r, - const int16_t *a, - const int16_t *b, - const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_basemul_avx(__m256i *r, + const __m256i *a, + const __m256i *b, + const __m256i *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); -void PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768/avx2/params.h b/crypto_kem/kyber768/avx2/params.h index b0ff9ab9..cab92431 100644 --- a/crypto_kem/kyber768/avx2/params.h +++ b/crypto_kem/kyber768/avx2/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,9 +14,12 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) +#define KYBER_ETA2 2 + #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) diff --git a/crypto_kem/kyber768/avx2/poly.c b/crypto_kem/kyber768/avx2/poly.c index 
06cb40d0..1e20de7b 100644 --- a/crypto_kem/kyber768/avx2/poly.c +++ b/crypto_kem/kyber768/avx2/poly.c @@ -12,63 +12,89 @@ /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_compress * -* Description: Compression and subsequent serialization of a polynomial +* Description: Compression and subsequent serialization of a polynomial. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER768_AVX2_poly_reduce(). * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { - unsigned int i, j; - uint8_t t[8]; - - PQCLEAN_KYBER768_AVX2_poly_csubq(a); - - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; - } - - r[0] = t[0] | (t[1] << 4); - r[1] = t[2] | (t[3] << 4); - r[2] = t[4] | (t[5] << 4); - r[3] = t[6] | (t[7] << 4); - r += 4; - } -} - -/************************************************* -* Name: PQCLEAN_KYBER768_AVX2_poly_decompress -* -* Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of PQCLEAN_KYBER768_AVX2_poly_compress -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array -* (of length KYBER_POLYCOMPRESSEDBYTES bytes) -**************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *restrict r, - const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t r[128], const poly *restrict a) { unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i v = 
_mm256_load_si256(&PQCLEAN_KYBER768_AVX2_qdata.vec[_16XV / 16]); + const __m256i shift1 = _mm256_set1_epi16(1 << 9); + const __m256i mask = _mm256_set1_epi16(15); + const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1); + const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - for (i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; - r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; - a += 1; + for (i = 0; i < KYBER_N / 64; i++) { + f0 = _mm256_load_si256(&a->vec[4 * i + 0]); + f1 = _mm256_load_si256(&a->vec[4 * i + 1]); + f2 = _mm256_load_si256(&a->vec[4 * i + 2]); + f3 = _mm256_load_si256(&a->vec[4 * i + 3]); + f0 = _mm256_mulhi_epi16(f0, v); + f1 = _mm256_mulhi_epi16(f1, v); + f2 = _mm256_mulhi_epi16(f2, v); + f3 = _mm256_mulhi_epi16(f3, v); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f1 = _mm256_mulhrs_epi16(f1, shift1); + f2 = _mm256_mulhrs_epi16(f2, shift1); + f3 = _mm256_mulhrs_epi16(f3, shift1); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + f2 = _mm256_and_si256(f2, mask); + f3 = _mm256_and_si256(f3, mask); + f0 = _mm256_packus_epi16(f0, f1); + f2 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift2); + f2 = _mm256_maddubs_epi16(f2, shift2); + f0 = _mm256_packus_epi16(f0, f2); + f0 = _mm256_permutevar8x32_epi32(f0, permdidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); } } +void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *restrict r, const uint8_t a[128]) { + unsigned int i; + __m128i t; + __m256i f; + const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER768_AVX2_qdata.vec[_16XQ / 16]); + const __m256i shufbidx = _mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, + 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); + const __m256i mask = _mm256_set1_epi32(0x00F0000F); + const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); + + for (i = 0; i < KYBER_N / 16; i++) { + t = _mm_loadl_epi64((__m128i *)&a[8 * i]); + f = 
_mm256_broadcastsi128_si256(t); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_and_si256(f, mask); + f = _mm256_mullo_epi16(f, shift); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_tobytes * -* Description: Serialization of a polynomial +* Description: Serialization of a polynomial in NTT representation. +* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER768_AVX2_poly_reduce(). The coefficients are orderd as output by +* PQCLEAN_KYBER768_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed +* order. * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { - PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } /************************************************* @@ -77,12 +103,12 @@ void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER768_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { - PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER768_AVX2_qdata.vec); } 
/************************************************* @@ -90,11 +116,10 @@ void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYT * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *restrict r, - const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { +void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); @@ -123,12 +148,12 @@ void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *restrict r, g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ - _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ - _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + _mm256_store_si256(&r->vec[0+2*(i)+0],g0); \ + _mm256_store_si256(&r->vec[0+2*(i)+1],g1); \ + _mm256_store_si256(&r->vec[8+2*(i)+0],g2); \ + _mm256_store_si256(&r->vec[8+2*(i)+1],g3) - f = _mm256_load_si256((__m256i *)msg); + f = _mm256_loadu_si256((__m256i *)msg); FROMMSG64(0); FROMMSG64(1); FROMMSG64(2); @@ -138,32 +163,34 @@ void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *restrict r, /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_tomsg * -* Description: Convert polynomial to 32-byte message +* Description: Convert polynomial to 32-byte message. 
+* The coefficients of the input polynomial are assumed to +* lie in the invertal [0,q], i.e. the polynomial must be reduced +* by PQCLEAN_KYBER768_AVX2_poly_reduce(). * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { unsigned int i; uint32_t small; __m256i f0, f1, g0, g1; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4); for (i = 0; i < KYBER_N / 32; i++) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); - f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); - f0 = _mm256_sub_epi16(hqs, f0); - f1 = _mm256_sub_epi16(hqs, f1); + f0 = _mm256_load_si256(&a->vec[2 * i + 0]); + f1 = _mm256_load_si256(&a->vec[2 * i + 1]); + f0 = _mm256_sub_epi16(hq, f0); + f1 = _mm256_sub_epi16(hq, f1); g0 = _mm256_srai_epi16(f0, 15); g1 = _mm256_srai_epi16(f1, 15); f0 = _mm256_xor_si256(f0, g0); f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(hhqs, f0); - f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_sub_epi16(f0, hhq); + f1 = _mm256_sub_epi16(f1, hhq); f0 = _mm256_packs_epi16(f0, f1); small = _mm256_movemask_epi8(f0); - small = ~small; msg[4 * i + 0] = small; msg[4 * i + 1] = small >> 16; msg[4 * i + 2] = small >> 8; @@ -172,24 +199,43 @@ void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly * } /************************************************* -* Name: PQCLEAN_KYBER768_AVX2_poly_getnoise +* Name: PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with 
parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; - prf(buf.arr, sizeof(buf.arr), seed, nonce); - PQCLEAN_KYBER768_AVX2_cbd(r, buf.arr); +void PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER768_AVX2_poly_cbd_eta1 + prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(r, buf.vec); } -void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER768_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf; + prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + PQCLEAN_KYBER768_AVX2_poly_cbd_eta2(r, buf.vec); +} + +#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE) +void PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, @@ -198,41 +244,46 @@ void 
PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { - ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; + ALIGNED_UINT8(NOISE_NBLOCKS * SHAKE256_RATE) buf[4]; __m256i f; keccakx4_state state; - f = _mm256_load_si256((__m256i *)seed); - _mm256_store_si256((__m256i *)buf.arr[0], f); - _mm256_store_si256((__m256i *)buf.arr[1], f); - _mm256_store_si256((__m256i *)buf.arr[2], f); - _mm256_store_si256((__m256i *)buf.arr[3], f); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - buf.arr[0][32] = nonce0; - buf.arr[1][32] = nonce1; - buf.arr[2][32] = nonce2; - buf.arr[3][32] = nonce3; + buf[0].coeffs[32] = nonce0; + buf[1].coeffs[32] = nonce1; + buf[2].coeffs[32] = nonce2; + buf[3].coeffs[32] = nonce3; - PQCLEAN_KYBER768_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); - PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + PQCLEAN_KYBER768_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33); + PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state); - PQCLEAN_KYBER768_AVX2_cbd(r0, buf.arr[0]); - PQCLEAN_KYBER768_AVX2_cbd(r1, buf.arr[1]); - PQCLEAN_KYBER768_AVX2_cbd(r2, buf.arr[2]); - PQCLEAN_KYBER768_AVX2_cbd(r3, buf.arr[3]); + PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(r0, buf[0].vec); + PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(r1, buf[1].vec); + PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(r2, buf[2].vec); + PQCLEAN_KYBER768_AVX2_poly_cbd_eta1(r3, buf[3].vec); } + /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of -* a polynomial in place; -* inputs assumed to be in normal order, output in 
bitreversed order +* a polynomial in place. +* Input coefficients assumed to be in normal order, +* output coefficients are in special order that is natural +* for the vectorization. Input coefficients are assumed to be +* bounded by q in absolute value, output coefficients are bounded +* by 16118 in absolute value. * -* Arguments: - uint16_t *r: pointer to in/output polynomial +* Arguments: - poly *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER768_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } /************************************************* @@ -240,29 +291,35 @@ void PQCLEAN_KYBER768_AVX2_poly_ntt(poly *r) { * * Description: Computes inverse of negacyclic number-theoretic transform (NTT) * of a polynomial in place; -* inputs assumed to be in bitreversed order, output in normal order +* Input coefficients assumed to be in special order from vectorized +* forward ntt, output in normal order. Input coefficients can be +* arbitrary 16-bit integers, output coefficients are bounded by 14870 +* in absolute value. 
* -* Arguments: - uint16_t *a: pointer to in/output polynomial +* Arguments: - poly *a: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(poly *r) { - PQCLEAN_KYBER768_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } void PQCLEAN_KYBER768_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery * -* Description: Multiplication of two polynomials in NTT domain +* Description: Multiplication of two polynomials in NTT domain. +* One of the input polynomials needs to have coefficients +* bounded by q, the other polynomial can have arbitrary +* coefficients. Output coefficients are bounded by 6656. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } /************************************************* @@ -274,7 +331,7 @@ void PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_tomont(poly *r) { - PQCLEAN_KYBER768_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } /************************************************* @@ -286,28 +343,16 @@ void PQCLEAN_KYBER768_AVX2_poly_tomont(poly *r) { * Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER768_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); -} - -/************************************************* -* Name: PQCLEAN_KYBER768_AVX2_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER768_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); + PQCLEAN_KYBER768_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER768_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials. No modular reduction +* is performed. * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -315,20 +360,21 @@ void PQCLEAN_KYBER768_AVX2_poly_add(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } /************************************************* * Name: PQCLEAN_KYBER768_AVX2_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials. No modular reduction +* is performed. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -336,10 +382,10 @@ void PQCLEAN_KYBER768_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { unsigned int i; __m256i f0, f1; - for (i = 0; i < KYBER_N; i += 16) { - f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_load_si256(&b->vec[i]); f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256((__m256i *)&r->coeffs[i], f0); + _mm256_store_si256(&r->vec[i], f0); } } diff --git a/crypto_kem/kyber768/avx2/poly.h b/crypto_kem/kyber768/avx2/poly.h index 8ab56448..0912e933 100644 --- a/crypto_kem/kyber768/avx2/poly.h +++ b/crypto_kem/kyber768/avx2/poly.h @@ -1,19 +1,13 @@ #ifndef PQCLEAN_KYBER768_AVX2_POLY_H #define PQCLEAN_KYBER768_AVX2_POLY_H +#include "align.h" #include "params.h" #include #include -/* - * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial - * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] - */ -typedef union { - __m256i dummy; - int16_t coeffs[KYBER_N]; -} poly; +typedef ALIGNED_INT16(KYBER_N) poly; -void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); @@ -22,8 +16,11 @@ void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYT void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); -void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, +void PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER768_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER768_AVX2_poly_getnoise_eta1_4x(poly *r0, poly *r1, poly *r2, poly *r3, @@ -33,6 +30,8 @@ void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, uint8_t nonce2, uint8_t nonce3); + + void PQCLEAN_KYBER768_AVX2_poly_ntt(poly *r); void PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(poly *r); void PQCLEAN_KYBER768_AVX2_poly_nttunpack(poly *r); @@ -40,7 +39,6 @@ void PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const void PQCLEAN_KYBER768_AVX2_poly_tomont(poly *r); void PQCLEAN_KYBER768_AVX2_poly_reduce(poly *r); -void PQCLEAN_KYBER768_AVX2_poly_csubq(poly *r); void PQCLEAN_KYBER768_AVX2_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER768_AVX2_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber768/avx2/polyvec.c b/crypto_kem/kyber768/avx2/polyvec.c index 
b63ca47a..2f07b429 100644 --- a/crypto_kem/kyber768/avx2/polyvec.c +++ b/crypto_kem/kyber768/avx2/polyvec.c @@ -3,8 +3,76 @@ #include "params.h" #include "poly.h" #include "polyvec.h" +#include #include +static void poly_compress10(uint8_t r[320], const poly *restrict a) { + size_t i; + uint32_t low; + __m256i f0, f1, f2; + __m128i t0, t1; + const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER768_AVX2_qdata.vec[_16XV / 16]); + const __m256i v8 = _mm256_slli_epi16(v, 3); + const __m256i off = _mm256_set1_epi16(15); + const __m256i shift1 = _mm256_set1_epi16(1 << 12); + const __m256i mask = _mm256_set1_epi16(1023); + const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1); + const __m256i sllvdidx = _mm256_set1_epi64x(12); + const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, + -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0); + + for (i = 0; i < KYBER_N / 16; i++) { + f0 = _mm256_load_si256(&a->vec[i]); + f1 = _mm256_mullo_epi16(f0, v8); + f2 = _mm256_add_epi16(f0, off); + f0 = _mm256_slli_epi16(f0, 3); + f0 = _mm256_mulhi_epi16(f0, v); + f2 = _mm256_sub_epi16(f1, f2); + f1 = _mm256_andnot_si256(f1, f2); + f1 = _mm256_srli_epi16(f1, 15); + f0 = _mm256_sub_epi16(f0, f1); + f0 = _mm256_mulhrs_epi16(f0, shift1); + f0 = _mm256_and_si256(f0, mask); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_sllv_epi32(f0, sllvdidx); + f0 = _mm256_srli_epi64(f0, 12); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + t0 = _mm256_castsi256_si128(f0); + t1 = _mm256_extracti128_si256(f0, 1); + t0 = _mm_blend_epi16(t0, t1, 0xE0); + _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0); + _mm_store_ss((float *)&low, _mm_castsi128_ps(t1)); + r[20 * i + 16] = (uint8_t)low; + r[20 * i + 17] = (uint8_t)(low >> 0x08); + r[20 * i + 18] = (uint8_t)(low >> 0x10); + r[20 * i + 19] = (uint8_t)(low >> 0x18); + } +} + +static void poly_decompress10(poly *restrict r, const uint8_t a[320 + 12]) { + size_t i; + __m256i f; + 
const __m256i q = _mm256_set1_epi32((KYBER_Q << 16) + 4 * KYBER_Q); + const __m256i shufbidx = _mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, + 6, 5, 5, 4, 4, 3, 3, 2, + 9, 8, 8, 7, 7, 6, 6, 5, + 4, 3, 3, 2, 2, 1, 1, 0); + const __m256i sllvdidx = _mm256_set1_epi64x(4); + const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184); + + for (i = 0; i < KYBER_N / 16; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_sllv_epi32(f, sllvdidx); + f = _mm256_srli_epi16(f, 1); + f = _mm256_and_si256(f, mask); + f = _mm256_mulhrs_epi16(f, q); + _mm256_store_si256(&r->vec[i], f); + } +} + + /************************************************* * Name: PQCLEAN_KYBER768_AVX2_polyvec_compress * @@ -14,27 +82,11 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], - polyvec *restrict a) { - size_t i, j, k; +void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) { + size_t i; - PQCLEAN_KYBER768_AVX2_polyvec_csubq(a); - - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; - } - - r[0] = (t[0] >> 0); - r[1] = (t[0] >> 8) | (t[1] << 2); - r[2] = (t[1] >> 6) | (t[2] << 4); - r[3] = (t[2] >> 4) | (t[3] << 6); - r[4] = (t[3] >> 2); - r += 5; - } + poly_compress10(&r[320 * i], &a->vec[i]); } } @@ -44,27 +96,15 @@ void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYT * Description: De-serialize and decompress vector of polynomials; * approximate inverse of PQCLEAN_KYBER768_AVX2_polyvec_compress * -* Arguments: - polyvec *r: pointer to output vector of polynomials +* Arguments: - 
polyvec *r: pointer to output vector of polynomials * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *restrict r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { - size_t i, j, k; +void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) { + size_t i; - uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); - t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); - t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); - t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); - a += 5; - - for (k = 0; k < 4; k++) { - r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; - } - } + poly_decompress10(&r->vec[i], &a[320 * i]); } } @@ -90,7 +130,7 @@ void PQCLEAN_KYBER768_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyve * Description: De-serialize vector of polynomials; * inverse of PQCLEAN_KYBER768_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials * (of length KYBER_POLYVECBYTES) **************************************************/ @@ -131,29 +171,34 @@ void PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements in a and b in NTT domain, accumulate into r, * and multiply by 2^-16. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { - PQCLEAN_KYBER768_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { + size_t i; + poly tmp; + + PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER768_AVX2_poly_add(r, r, &tmp); + } } /************************************************* * Name: PQCLEAN_KYBER768_AVX2_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_polyvec_reduce(polyvec *r) { size_t i; @@ -162,23 +207,6 @@ void PQCLEAN_KYBER768_AVX2_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER768_AVX2_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_csubq(polyvec 
*r) { - size_t i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_AVX2_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER768_AVX2_polyvec_add * diff --git a/crypto_kem/kyber768/avx2/polyvec.h b/crypto_kem/kyber768/avx2/polyvec.h index 8bd8ef7f..d81240d4 100644 --- a/crypto_kem/kyber768/avx2/polyvec.h +++ b/crypto_kem/kyber768/avx2/polyvec.h @@ -8,9 +8,8 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a); +void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]); void PQCLEAN_KYBER768_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); void PQCLEAN_KYBER768_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); @@ -18,12 +17,9 @@ void PQCLEAN_KYBER768_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_P void PQCLEAN_KYBER768_AVX2_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER768_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER768_AVX2_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER768_AVX2_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER768_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber768/avx2/reduce.h b/crypto_kem/kyber768/avx2/reduce.h index 1630ec28..8a4ee305 100644 --- a/crypto_kem/kyber768/avx2/reduce.h +++ b/crypto_kem/kyber768/avx2/reduce.h @@ -1,10 +1,9 @@ #ifndef PQCLEAN_KYBER768_AVX2_REDUCE_H #define PQCLEAN_KYBER768_AVX2_REDUCE_H -#include 
"consts.h" -#include +#include "params.h" +#include -int16_t PQCLEAN_KYBER768_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); -int16_t PQCLEAN_KYBER768_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); -int16_t PQCLEAN_KYBER768_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); +void PQCLEAN_KYBER768_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER768_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768/avx2/rejsample.c b/crypto_kem/kyber768/avx2/rejsample.c index 1475428a..6ac25613 100644 --- a/crypto_kem/kyber768/avx2/rejsample.c +++ b/crypto_kem/kyber768/avx2/rejsample.c @@ -4,311 +4,68 @@ #include "rejsample.h" #include #include +#include + +//#define BMI -static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { - {-1, -1, -1, -1, -1, -1, -1, -1}, - { 0, -1, -1, -1, -1, -1, -1, -1}, - { 2, -1, -1, -1, -1, -1, -1, -1}, - { 0, 2, -1, -1, -1, -1, -1, -1}, - { 4, -1, -1, -1, -1, -1, -1, -1}, - { 0, 4, -1, -1, -1, -1, -1, -1}, - { 2, 4, -1, -1, -1, -1, -1, -1}, - { 0, 2, 4, -1, -1, -1, -1, -1}, - { 6, -1, -1, -1, -1, -1, -1, -1}, - { 0, 6, -1, -1, -1, -1, -1, -1}, - { 2, 6, -1, -1, -1, -1, -1, -1}, - { 0, 2, 6, -1, -1, -1, -1, -1}, - { 4, 6, -1, -1, -1, -1, -1, -1}, - { 0, 4, 6, -1, -1, -1, -1, -1}, - { 2, 4, 6, -1, -1, -1, -1, -1}, - { 0, 2, 4, 6, -1, -1, -1, -1}, - { 8, -1, -1, -1, -1, -1, -1, -1}, - { 0, 8, -1, -1, -1, -1, -1, -1}, - { 2, 8, -1, -1, -1, -1, -1, -1}, - { 0, 2, 8, -1, -1, -1, -1, -1}, - { 4, 8, -1, -1, -1, -1, -1, -1}, - { 0, 4, 8, -1, -1, -1, -1, -1}, - { 2, 4, 8, -1, -1, -1, -1, -1}, - { 0, 2, 4, 8, -1, -1, -1, -1}, - { 6, 8, -1, -1, -1, -1, -1, -1}, - { 0, 6, 8, -1, -1, -1, -1, -1}, - { 2, 6, 8, -1, -1, -1, -1, -1}, - { 0, 2, 6, 8, -1, -1, -1, -1}, - { 4, 6, 8, -1, -1, -1, -1, -1}, - { 0, 4, 6, 8, -1, -1, -1, -1}, - { 2, 4, 6, 8, -1, -1, -1, -1}, - { 0, 2, 4, 6, 8, -1, 
-1, -1}, - {10, -1, -1, -1, -1, -1, -1, -1}, - { 0, 10, -1, -1, -1, -1, -1, -1}, - { 2, 10, -1, -1, -1, -1, -1, -1}, - { 0, 2, 10, -1, -1, -1, -1, -1}, - { 4, 10, -1, -1, -1, -1, -1, -1}, - { 0, 4, 10, -1, -1, -1, -1, -1}, - { 2, 4, 10, -1, -1, -1, -1, -1}, - { 0, 2, 4, 10, -1, -1, -1, -1}, - { 6, 10, -1, -1, -1, -1, -1, -1}, - { 0, 6, 10, -1, -1, -1, -1, -1}, - { 2, 6, 10, -1, -1, -1, -1, -1}, - { 0, 2, 6, 10, -1, -1, -1, -1}, - { 4, 6, 10, -1, -1, -1, -1, -1}, - { 0, 4, 6, 10, -1, -1, -1, -1}, - { 2, 4, 6, 10, -1, -1, -1, -1}, - { 0, 2, 4, 6, 10, -1, -1, -1}, - { 8, 10, -1, -1, -1, -1, -1, -1}, - { 0, 8, 10, -1, -1, -1, -1, -1}, - { 2, 8, 10, -1, -1, -1, -1, -1}, - { 0, 2, 8, 10, -1, -1, -1, -1}, - { 4, 8, 10, -1, -1, -1, -1, -1}, - { 0, 4, 8, 10, -1, -1, -1, -1}, - { 2, 4, 8, 10, -1, -1, -1, -1}, - { 0, 2, 4, 8, 10, -1, -1, -1}, - { 6, 8, 10, -1, -1, -1, -1, -1}, - { 0, 6, 8, 10, -1, -1, -1, -1}, - { 2, 6, 8, 10, -1, -1, -1, -1}, - { 0, 2, 6, 8, 10, -1, -1, -1}, - { 4, 6, 8, 10, -1, -1, -1, -1}, - { 0, 4, 6, 8, 10, -1, -1, -1}, - { 2, 4, 6, 8, 10, -1, -1, -1}, - { 0, 2, 4, 6, 8, 10, -1, -1}, - {12, -1, -1, -1, -1, -1, -1, -1}, - { 0, 12, -1, -1, -1, -1, -1, -1}, - { 2, 12, -1, -1, -1, -1, -1, -1}, - { 0, 2, 12, -1, -1, -1, -1, -1}, - { 4, 12, -1, -1, -1, -1, -1, -1}, - { 0, 4, 12, -1, -1, -1, -1, -1}, - { 2, 4, 12, -1, -1, -1, -1, -1}, - { 0, 2, 4, 12, -1, -1, -1, -1}, - { 6, 12, -1, -1, -1, -1, -1, -1}, - { 0, 6, 12, -1, -1, -1, -1, -1}, - { 2, 6, 12, -1, -1, -1, -1, -1}, - { 0, 2, 6, 12, -1, -1, -1, -1}, - { 4, 6, 12, -1, -1, -1, -1, -1}, - { 0, 4, 6, 12, -1, -1, -1, -1}, - { 2, 4, 6, 12, -1, -1, -1, -1}, - { 0, 2, 4, 6, 12, -1, -1, -1}, - { 8, 12, -1, -1, -1, -1, -1, -1}, - { 0, 8, 12, -1, -1, -1, -1, -1}, - { 2, 8, 12, -1, -1, -1, -1, -1}, - { 0, 2, 8, 12, -1, -1, -1, -1}, - { 4, 8, 12, -1, -1, -1, -1, -1}, - { 0, 4, 8, 12, -1, -1, -1, -1}, - { 2, 4, 8, 12, -1, -1, -1, -1}, - { 0, 2, 4, 8, 12, -1, -1, -1}, - { 6, 8, 12, -1, -1, -1, -1, -1}, - { 0, 6, 8, 12, 
-1, -1, -1, -1}, - { 2, 6, 8, 12, -1, -1, -1, -1}, - { 0, 2, 6, 8, 12, -1, -1, -1}, - { 4, 6, 8, 12, -1, -1, -1, -1}, - { 0, 4, 6, 8, 12, -1, -1, -1}, - { 2, 4, 6, 8, 12, -1, -1, -1}, - { 0, 2, 4, 6, 8, 12, -1, -1}, - {10, 12, -1, -1, -1, -1, -1, -1}, - { 0, 10, 12, -1, -1, -1, -1, -1}, - { 2, 10, 12, -1, -1, -1, -1, -1}, - { 0, 2, 10, 12, -1, -1, -1, -1}, - { 4, 10, 12, -1, -1, -1, -1, -1}, - { 0, 4, 10, 12, -1, -1, -1, -1}, - { 2, 4, 10, 12, -1, -1, -1, -1}, - { 0, 2, 4, 10, 12, -1, -1, -1}, - { 6, 10, 12, -1, -1, -1, -1, -1}, - { 0, 6, 10, 12, -1, -1, -1, -1}, - { 2, 6, 10, 12, -1, -1, -1, -1}, - { 0, 2, 6, 10, 12, -1, -1, -1}, - { 4, 6, 10, 12, -1, -1, -1, -1}, - { 0, 4, 6, 10, 12, -1, -1, -1}, - { 2, 4, 6, 10, 12, -1, -1, -1}, - { 0, 2, 4, 6, 10, 12, -1, -1}, - { 8, 10, 12, -1, -1, -1, -1, -1}, - { 0, 8, 10, 12, -1, -1, -1, -1}, - { 2, 8, 10, 12, -1, -1, -1, -1}, - { 0, 2, 8, 10, 12, -1, -1, -1}, - { 4, 8, 10, 12, -1, -1, -1, -1}, - { 0, 4, 8, 10, 12, -1, -1, -1}, - { 2, 4, 8, 10, 12, -1, -1, -1}, - { 0, 2, 4, 8, 10, 12, -1, -1}, - { 6, 8, 10, 12, -1, -1, -1, -1}, - { 0, 6, 8, 10, 12, -1, -1, -1}, - { 2, 6, 8, 10, 12, -1, -1, -1}, - { 0, 2, 6, 8, 10, 12, -1, -1}, - { 4, 6, 8, 10, 12, -1, -1, -1}, - { 0, 4, 6, 8, 10, 12, -1, -1}, - { 2, 4, 6, 8, 10, 12, -1, -1}, - { 0, 2, 4, 6, 8, 10, 12, -1}, - {14, -1, -1, -1, -1, -1, -1, -1}, - { 0, 14, -1, -1, -1, -1, -1, -1}, - { 2, 14, -1, -1, -1, -1, -1, -1}, - { 0, 2, 14, -1, -1, -1, -1, -1}, - { 4, 14, -1, -1, -1, -1, -1, -1}, - { 0, 4, 14, -1, -1, -1, -1, -1}, - { 2, 4, 14, -1, -1, -1, -1, -1}, - { 0, 2, 4, 14, -1, -1, -1, -1}, - { 6, 14, -1, -1, -1, -1, -1, -1}, - { 0, 6, 14, -1, -1, -1, -1, -1}, - { 2, 6, 14, -1, -1, -1, -1, -1}, - { 0, 2, 6, 14, -1, -1, -1, -1}, - { 4, 6, 14, -1, -1, -1, -1, -1}, - { 0, 4, 6, 14, -1, -1, -1, -1}, - { 2, 4, 6, 14, -1, -1, -1, -1}, - { 0, 2, 4, 6, 14, -1, -1, -1}, - { 8, 14, -1, -1, -1, -1, -1, -1}, - { 0, 8, 14, -1, -1, -1, -1, -1}, - { 2, 8, 14, -1, -1, -1, -1, -1}, - { 0, 2, 8, 
14, -1, -1, -1, -1}, - { 4, 8, 14, -1, -1, -1, -1, -1}, - { 0, 4, 8, 14, -1, -1, -1, -1}, - { 2, 4, 8, 14, -1, -1, -1, -1}, - { 0, 2, 4, 8, 14, -1, -1, -1}, - { 6, 8, 14, -1, -1, -1, -1, -1}, - { 0, 6, 8, 14, -1, -1, -1, -1}, - { 2, 6, 8, 14, -1, -1, -1, -1}, - { 0, 2, 6, 8, 14, -1, -1, -1}, - { 4, 6, 8, 14, -1, -1, -1, -1}, - { 0, 4, 6, 8, 14, -1, -1, -1}, - { 2, 4, 6, 8, 14, -1, -1, -1}, - { 0, 2, 4, 6, 8, 14, -1, -1}, - {10, 14, -1, -1, -1, -1, -1, -1}, - { 0, 10, 14, -1, -1, -1, -1, -1}, - { 2, 10, 14, -1, -1, -1, -1, -1}, - { 0, 2, 10, 14, -1, -1, -1, -1}, - { 4, 10, 14, -1, -1, -1, -1, -1}, - { 0, 4, 10, 14, -1, -1, -1, -1}, - { 2, 4, 10, 14, -1, -1, -1, -1}, - { 0, 2, 4, 10, 14, -1, -1, -1}, - { 6, 10, 14, -1, -1, -1, -1, -1}, - { 0, 6, 10, 14, -1, -1, -1, -1}, - { 2, 6, 10, 14, -1, -1, -1, -1}, - { 0, 2, 6, 10, 14, -1, -1, -1}, - { 4, 6, 10, 14, -1, -1, -1, -1}, - { 0, 4, 6, 10, 14, -1, -1, -1}, - { 2, 4, 6, 10, 14, -1, -1, -1}, - { 0, 2, 4, 6, 10, 14, -1, -1}, - { 8, 10, 14, -1, -1, -1, -1, -1}, - { 0, 8, 10, 14, -1, -1, -1, -1}, - { 2, 8, 10, 14, -1, -1, -1, -1}, - { 0, 2, 8, 10, 14, -1, -1, -1}, - { 4, 8, 10, 14, -1, -1, -1, -1}, - { 0, 4, 8, 10, 14, -1, -1, -1}, - { 2, 4, 8, 10, 14, -1, -1, -1}, - { 0, 2, 4, 8, 10, 14, -1, -1}, - { 6, 8, 10, 14, -1, -1, -1, -1}, - { 0, 6, 8, 10, 14, -1, -1, -1}, - { 2, 6, 8, 10, 14, -1, -1, -1}, - { 0, 2, 6, 8, 10, 14, -1, -1}, - { 4, 6, 8, 10, 14, -1, -1, -1}, - { 0, 4, 6, 8, 10, 14, -1, -1}, - { 2, 4, 6, 8, 10, 14, -1, -1}, - { 0, 2, 4, 6, 8, 10, 14, -1}, - {12, 14, -1, -1, -1, -1, -1, -1}, - { 0, 12, 14, -1, -1, -1, -1, -1}, - { 2, 12, 14, -1, -1, -1, -1, -1}, - { 0, 2, 12, 14, -1, -1, -1, -1}, - { 4, 12, 14, -1, -1, -1, -1, -1}, - { 0, 4, 12, 14, -1, -1, -1, -1}, - { 2, 4, 12, 14, -1, -1, -1, -1}, - { 0, 2, 4, 12, 14, -1, -1, -1}, - { 6, 12, 14, -1, -1, -1, -1, -1}, - { 0, 6, 12, 14, -1, -1, -1, -1}, - { 2, 6, 12, 14, -1, -1, -1, -1}, - { 0, 2, 6, 12, 14, -1, -1, -1}, - { 4, 6, 12, 14, -1, -1, -1, -1}, - { 0, 4, 6, 
12, 14, -1, -1, -1}, - { 2, 4, 6, 12, 14, -1, -1, -1}, - { 0, 2, 4, 6, 12, 14, -1, -1}, - { 8, 12, 14, -1, -1, -1, -1, -1}, - { 0, 8, 12, 14, -1, -1, -1, -1}, - { 2, 8, 12, 14, -1, -1, -1, -1}, - { 0, 2, 8, 12, 14, -1, -1, -1}, - { 4, 8, 12, 14, -1, -1, -1, -1}, - { 0, 4, 8, 12, 14, -1, -1, -1}, - { 2, 4, 8, 12, 14, -1, -1, -1}, - { 0, 2, 4, 8, 12, 14, -1, -1}, - { 6, 8, 12, 14, -1, -1, -1, -1}, - { 0, 6, 8, 12, 14, -1, -1, -1}, - { 2, 6, 8, 12, 14, -1, -1, -1}, - { 0, 2, 6, 8, 12, 14, -1, -1}, - { 4, 6, 8, 12, 14, -1, -1, -1}, - { 0, 4, 6, 8, 12, 14, -1, -1}, - { 2, 4, 6, 8, 12, 14, -1, -1}, - { 0, 2, 4, 6, 8, 12, 14, -1}, - {10, 12, 14, -1, -1, -1, -1, -1}, - { 0, 10, 12, 14, -1, -1, -1, -1}, - { 2, 10, 12, 14, -1, -1, -1, -1}, - { 0, 2, 10, 12, 14, -1, -1, -1}, - { 4, 10, 12, 14, -1, -1, -1, -1}, - { 0, 4, 10, 12, 14, -1, -1, -1}, - { 2, 4, 10, 12, 14, -1, -1, -1}, - { 0, 2, 4, 10, 12, 14, -1, -1}, - { 6, 10, 12, 14, -1, -1, -1, -1}, - { 0, 6, 10, 12, 14, -1, -1, -1}, - { 2, 6, 10, 12, 14, -1, -1, -1}, - { 0, 2, 6, 10, 12, 14, -1, -1}, - { 4, 6, 10, 12, 14, -1, -1, -1}, - { 0, 4, 6, 10, 12, 14, -1, -1}, - { 2, 4, 6, 10, 12, 14, -1, -1}, - { 0, 2, 4, 6, 10, 12, 14, -1}, - { 8, 10, 12, 14, -1, -1, -1, -1}, - { 0, 8, 10, 12, 14, -1, -1, -1}, - { 2, 8, 10, 12, 14, -1, -1, -1}, - { 0, 2, 8, 10, 12, 14, -1, -1}, - { 4, 8, 10, 12, 14, -1, -1, -1}, - { 0, 4, 8, 10, 12, 14, -1, -1}, - { 2, 4, 8, 10, 12, 14, -1, -1}, - { 0, 2, 4, 8, 10, 12, 14, -1}, - { 6, 8, 10, 12, 14, -1, -1, -1}, - { 0, 6, 8, 10, 12, 14, -1, -1}, - { 2, 6, 8, 10, 12, 14, -1, -1}, - { 0, 2, 6, 8, 10, 12, 14, -1}, - { 4, 6, 8, 10, 12, 14, -1, -1}, - { 0, 4, 6, 8, 10, 12, 14, -1}, - { 2, 4, 6, 8, 10, 12, 14, -1}, - { 0, 2, 4, 6, 8, 10, 12, 14} - } -}; #define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -#define REJ_UNIFORM_BUFLEN 672 -unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *restrict r, - 
const uint8_t *restrict buf) { +unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; uint32_t good; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); + uint64_t idx0, idx1, idx2, idx3; + const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER768_AVX2_qdata.vec[_16XQ / 16]); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER768_AVX2_qdata.as_arr[_16XQ]); - const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER768_AVX2_qdata.as_arr[_16XV]); + const __m256i mask = _mm256_set1_epi16(0xFFF); + const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, + 9, 8, 8, 7, 6, 5, 5, 4, + 11, 10, 10, 9, 8, 7, 7, 6, + 5, 4, 4, 3, 2, 1, 1, 0); __m256i f0, f1, g0, g1, g2, g3; __m128i f, t, pilo, pihi; - ctr = 0; - for (pos = 0; pos < 2 * KYBER_N; pos += 64) { - f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); - f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); + ctr = pos = 0; + while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) { + f0 = _mm256_loadu_si256((__m256i *)&buf[pos]); + f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]); + f0 = _mm256_permute4x64_epi64(f0, 0x94); + f1 = _mm256_permute4x64_epi64(f1, 0x94); + f0 = _mm256_shuffle_epi8(f0, idx8); + f1 = _mm256_shuffle_epi8(f1, idx8); + g0 = _mm256_srli_epi16(f0, 4); + g1 = _mm256_srli_epi16(f1, 4); + f0 = _mm256_blend_epi16(f0, g0, 0xAA); + f1 = _mm256_blend_epi16(f1, g1, 0xAA); + f0 = _mm256_and_si256(f0, mask); + f1 = _mm256_and_si256(f1, mask); + pos += 48; - g0 = _mm256_cmpge_epu16(bound, f0); - g1 = _mm256_cmpge_epu16(bound, f1); + g0 = _mm256_cmpgt_epi16(bound, f0); + g1 = _mm256_cmpgt_epi16(bound, f1); g0 = _mm256_packs_epi16(g0, g1); good = _mm256_movemask_epi8(g0); - g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); - g1 = 
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); - g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); - g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); + idx0 = _pdep_u64(good >> 0, 0x0101010101010101); + idx1 = _pdep_u64(good >> 8, 0x0101010101010101); + idx2 = _pdep_u64(good >> 16, 0x0101010101010101); + idx3 = _pdep_u64(good >> 24, 0x0101010101010101); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + idx1 = (idx1 << 8) - idx1; + idx1 = _pext_u64(0x0E0C0A0806040200, idx1); + idx2 = (idx2 << 8) - idx2; + idx2 = _pext_u64(0x0E0C0A0806040200, idx2); + idx3 = (idx3 << 8) - idx3; + idx3 = _pext_u64(0x0E0C0A0806040200, idx3); - //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); - //g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8); - - /* Barrett reduction of (still unsigned) values */ - g2 = _mm256_mulhi_epu16(f0, v); - g3 = _mm256_mulhi_epu16(f1, v); - g2 = _mm256_srli_epi16(g2, 10); - g3 = _mm256_srli_epi16(g3, 10); - g2 = _mm256_mullo_epi16(g2, kyberq); - g3 = _mm256_mullo_epi16(g3, kyberq); - f0 = _mm256_sub_epi16(f0, g2); - f1 = _mm256_sub_epi16(f1, g3); + g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0)); + g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1)); + g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1); + g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1); g2 = _mm256_add_epi8(g0, ones); g3 = _mm256_add_epi8(g1, ones); @@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *restrict r, ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { - f = _mm_load_si128((__m128i *)&buf[pos]); - t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) { + f = _mm_loadu_si128((__m128i *)&buf[pos]); + f = _mm_shuffle_epi8(f, 
_mm256_castsi256_si128(idx8)); + t = _mm_srli_epi16(f, 4); + f = _mm_blend_epi16(f, t, 0xAA); + f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); + pos += 12; + + t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); good = _mm_movemask_epi8(t); - good = _pext_u32(good, 0x5555); - pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); + + good &= 0x5555; + idx0 = _pdep_u64(good, 0x1111111111111111); + idx0 = (idx0 << 8) - idx0; + idx0 = _pext_u64(0x0E0C0A0806040200, idx0); + pilo = _mm_cvtsi64_si128(idx0); + pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - - /* Barrett reduction */ - t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); - t = _mm_srli_epi16(t, 10); - t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); - f = _mm_sub_epi16(f, t); - f = _mm_shuffle_epi8(f, pilo); _mm_storeu_si128((__m128i *)&r[ctr], f); ctr += _mm_popcnt_u32(good); - pos += 16; } - while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)); + pos += 3; - if (val < 19 * KYBER_Q) { - val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (val1 < KYBER_Q && ctr < KYBER_N) { + r[ctr++] = val1; } } diff --git a/crypto_kem/kyber768/avx2/rejsample.h b/crypto_kem/kyber768/avx2/rejsample.h index bb11cb83..effe39f5 100644 --- a/crypto_kem/kyber768/avx2/rejsample.h +++ b/crypto_kem/kyber768/avx2/rejsample.h @@ -1,9 +1,12 @@ #ifndef PQCLEAN_KYBER768_AVX2_REJSAMPLE_H #define PQCLEAN_KYBER768_AVX2_REJSAMPLE_H #include "params.h" +#include "symmetric.h" #include -unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *r, - const unsigned char *buf); +#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + 
XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES) + +unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf); #endif diff --git a/crypto_kem/kyber768/avx2/shuffle.S b/crypto_kem/kyber768/avx2/shuffle.S index f06a8c6a..5d3873e8 100644 --- a/crypto_kem/kyber768/avx2/shuffle.S +++ b/crypto_kem/kyber768/avx2/shuffle.S @@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12 #csubq csubq 5,13 -csubq 6,14 -csubq 7,15 -csubq 8,1 +csubq 6,13 +csubq 7,13 +csubq 8,13 csubq 9,13 -csubq 10,14 -csubq 11,15 -csubq 12,1 +csubq 10,13 +csubq 11,13 +csubq 12,13 #bitpack vpsllw $12,%ymm6,%ymm4 diff --git a/crypto_kem/kyber768/avx2/shuffle.inc b/crypto_kem/kyber768/avx2/shuffle.inc index d4b092bc..73e9ffe0 100644 --- a/crypto_kem/kyber768/avx2/shuffle.inc +++ b/crypto_kem/kyber768/avx2/shuffle.inc @@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_kem/kyber768/avx2/symmetric-shake.c b/crypto_kem/kyber768/avx2/symmetric-shake.c index a953b40b..7a62d905 100644 --- a/crypto_kem/kyber768/avx2/symmetric-shake.c +++ b/crypto_kem/kyber768/avx2/symmetric-shake.c @@ -9,12 +9,10 @@ * * Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
* -* Arguments: - xof_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *seed: pointer to KYBER_SYMBYTES input -* to be absorbed into state -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input **************************************************/ void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(xof_state *state, const uint8_t seed[KYBER_SYMBYTES], @@ -26,8 +24,8 @@ void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(xof_state *state, for (i = 0; i < KYBER_SYMBYTES; i++) { extseed[i] = seed[i]; } - extseed[i++] = x; - extseed[i] = y; + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; shake128_absorb(state, extseed, sizeof(extseed)); } @@ -38,23 +36,19 @@ void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(xof_state *state, * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input * and then generates outlen bytes of SHAKE256 output * -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t *key: pointer to the key -* (of length KYBER_SYMBYTES) -* - uint8_t nonce: single-byte nonce (public PRF input) +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) **************************************************/ -void PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce) { +void PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { unsigned int i; uint8_t 
extkey[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey[i] = key[i]; } - extkey[i] = nonce; + extkey[KYBER_SYMBYTES] = nonce; shake256(out, outlen, extkey, sizeof(extkey)); } diff --git a/crypto_kem/kyber768/avx2/symmetric.h b/crypto_kem/kyber768/avx2/symmetric.h index bd4a6e36..f0c993e2 100644 --- a/crypto_kem/kyber768/avx2/symmetric.h +++ b/crypto_kem/kyber768/avx2/symmetric.h @@ -15,21 +15,16 @@ void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(xof_state *s, uint8_t x, uint8_t y); -void PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce); +void PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) #define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) diff --git a/crypto_kem/kyber768/avx2/verify.c b/crypto_kem/kyber768/avx2/verify.c index ab36495a..60ba304e 100644 --- a/crypto_kem/kyber768/avx2/verify.c +++ b/crypto_kem/kyber768/avx2/verify.c @@ -8,31 +8,31 @@ * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const unsigned char *a: pointer to first byte array -* const unsigned char *b: pointer to second byte array -* size_t len: length of the byte arrays +* Arguments: const uint8_t *a: pointer to first byte array +* const uint8_t *b: pointer to second byte array +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ int PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; + size_t i; uint64_t r; - __m256i avec, bvec, cvec; + __m256i f, g, h; - cvec = _mm256_setzero_si256(); - for (pos = 0; pos + 32 <= len; pos += 32) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - avec = _mm256_xor_si256(avec, bvec); - cvec = _mm256_or_si256(cvec, avec); + h = _mm256_setzero_si256(); + for (i = 0; i < len / 32; i++) { + f = _mm256_loadu_si256((__m256i *)&a[32 * i]); + g = _mm256_loadu_si256((__m256i *)&b[32 * i]); + f = _mm256_xor_si256(f, g); + h = _mm256_or_si256(h, f); } - r = 1 - _mm256_testz_si256(cvec, cvec); + r = 1 - _mm256_testz_si256(h, h); - if (pos < len) { - avec = _mm256_loadu_si256((__m256i *)&a[pos]); - bvec = _mm256_loadu_si256((__m256i *)&b[pos]); - cvec = _mm256_cmpeq_epi8(avec, bvec); - r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); + a += 32 * i; + b += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; } r = (-r) >> 63; @@ -47,29 +47,27 @@ int PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: unsigned char *r: pointer to output byte array +* Arguments: unsigned char *r: pointer to output byte array * const unsigned char *x: pointer to input byte array -* size_t len: Amount of bytes to be copied -* unsigned char b: Condition bit; has to be in {0,1} +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER768_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER768_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; __m256i xvec, rvec, bvec; - b = -b; - bvec = _mm256_set1_epi8(b); - - for (pos = 0; pos + 32 <= len; pos += 32) { - rvec = _mm256_loadu_si256((__m256i *)&r[pos]); - xvec = _mm256_loadu_si256((__m256i *)&x[pos]); - xvec = _mm256_xor_si256(xvec, rvec); - xvec = _mm256_and_si256(xvec, bvec); - rvec = _mm256_xor_si256(rvec, xvec); - _mm256_storeu_si256((__m256i *)&r[pos], rvec); + bvec = _mm256_set1_epi64x(-(uint64_t)b); + for (i = 0; i < len / 32; i++) { + rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]); + xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]); + rvec = _mm256_blendv_epi8(rvec, xvec, bvec); + _mm256_storeu_si256((__m256i *)&r[32 * i], rvec); } - while (pos < len) { - r[pos] ^= b & (x[pos] ^ r[pos]); - pos += 1; + r += 32 * i; + x += 32 * i; + len -= 32 * i; + for (i = 0; i < len; i++) { + r[i] ^= -b & (x[i] ^ r[i]); } } diff --git a/crypto_kem/kyber768/clean/cbd.c b/crypto_kem/kyber768/clean/cbd.c index 22100fa0..bd9bf87b 100644 --- a/crypto_kem/kyber768/clean/cbd.c +++ b/crypto_kem/kyber768/clean/cbd.c @@ -5,7 +5,7 @@ /************************************************* * Name: load32_littleendian * -* Description: load bytes into a 32-bit integer +* Description: load 4 bytes into a 32-bit integer * in little-endian order * * Arguments: - const uint8_t *x: pointer to input byte array @@ -22,16 +22,29 @@ static 
uint32_t load32_littleendian(const uint8_t x[4]) { } /************************************************* -* Name: PQCLEAN_KYBER768_CLEAN_cbd +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order. +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ + + +/************************************************* +* Name: cbd2 * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to -* a centered binomial distribution with parameter KYBER_ETA +* a centered binomial distribution with parameter eta=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { +static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; int16_t a, b; @@ -48,3 +61,23 @@ void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / } } } + +/************************************************* +* Name: cbd3 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter eta=3. 
+* This function is only needed for Kyber-512 +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *buf: pointer to input byte array +**************************************************/ + +void PQCLEAN_KYBER768_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + cbd2(r, buf); +} + +void PQCLEAN_KYBER768_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + cbd2(r, buf); +} diff --git a/crypto_kem/kyber768/clean/cbd.h b/crypto_kem/kyber768/clean/cbd.h index 7e59c9c8..cf885109 100644 --- a/crypto_kem/kyber768/clean/cbd.h +++ b/crypto_kem/kyber768/clean/cbd.h @@ -4,6 +4,8 @@ #include "poly.h" #include -void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); +void PQCLEAN_KYBER768_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); + +void PQCLEAN_KYBER768_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber768/clean/indcpa.c b/crypto_kem/kyber768/clean/indcpa.c index f111358f..d45118e9 100644 --- a/crypto_kem/kyber768/clean/indcpa.c +++ b/crypto_kem/kyber768/clean/indcpa.c @@ -15,8 +15,8 @@ * serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* -* Arguments: uint8_t *r: pointer to the output serialized public key -* polyvec *pk: pointer to the input public-key polyvec +* Arguments: uint8_t *r: pointer to the output serialized public key +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], @@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key -* polynomial vector -* - uint8_t *seed: pointer to output seed to generate -* matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ static void unpack_pk(polyvec *pk, @@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk, * * Description: Serialize the secret key * -* Arguments: - uint8_t *r: pointer to output serialized secret key +* Arguments: - uint8_t *r: pointer to output serialized secret key * - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { @@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { /************************************************* * Name: unpack_sk * -* Description: De-serialize the secret key; -* inverse of pack_sk +* Description: De-serialize the secret key; inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of -* polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key 
**************************************************/ -static void unpack_sk(polyvec *sk, - const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { +static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk, * and the compressed and serialized polynomial v * * Arguments: uint8_t *r: pointer to the output serialized ciphertext -* poly *pk: pointer to the input vector of polynomials b -* poly *v: pointer to the input polynomial v +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], - polyvec *b, - poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) { PQCLEAN_KYBER768_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER768_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, - poly *v, - const uint8_t c[KYBER_INDCPA_BYTES]) { +static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER768_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER768_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b, * Description: Run rejection sampling on 
uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - unsigned int len: requested number of 16-bit integers -* (uniform mod q) -* - const uint8_t *buf: pointer to input buffer -* (assumed to be uniform random bytes) +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers (uniform mod q) +* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes) * - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) @@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r, const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; - uint16_t val; + uint16_t val0, val1; ctr = pos = 0; - while (ctr < len && pos + 2 <= buflen) { - val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); - pos += 2; + while (ctr < len && pos + 3 <= buflen) { + val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF; + val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF; + pos += 3; - if (val < 19 * KYBER_Q) { - val -= (val >> 12) * KYBER_Q; // Barrett reduction - r[ctr++] = (int16_t)val; + if (val0 < KYBER_Q) { + r[ctr++] = val0; + } + if (ctr < len && val1 < KYBER_Q) { + r[ctr++] = val1; } } @@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r, * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T -* is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ - + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) // Not static for benchmarking void PQCLEAN_KYBER768_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { - unsigned int ctr, i, j; - uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; + unsigned int ctr, i, j, k; + unsigned int buflen, off; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -182,12 +173,17 @@ void PQCLEAN_KYBER768_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMB } xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen); while (ctr < KYBER_N) { - xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, - XOF_BLOCKBYTES); + off = buflen % 3; + for (k = 0; k < off; k++) { + buf[k] = buf[buflen - off + k]; + } + xof_squeezeblocks(buf + off, 1, &state); + buflen = off + XOF_BLOCKBYTES; + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen); } xof_ctx_release(&state); } @@ -220,10 +216,10 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE gen_a(a, publicseed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); + 
PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); + PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&skpv); @@ -231,7 +227,7 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); PQCLEAN_KYBER768_CLEAN_poly_tomont(&pkpv.vec[i]); } @@ -248,16 +244,15 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext -* (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message -* (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key -* (of length KYBER_INDCPA_PUBLICKEYBYTES) -* - const uint8_t *coins: pointer to input random coins -* used as seed (of length KYBER_SYMBYTES) -* to deterministically generate all -* randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins used as seed +* (of length KYBER_SYMBYTES) to deterministically +* generate all randomness **************************************************/ void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], @@ -266,7 +261,7 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t 
c[KYBER_INDCPA_BYTES], unsigned int i; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; - polyvec sp, pkpv, ep, at[KYBER_K], bp; + polyvec sp, pkpv, ep, at[KYBER_K], b; poly v, k, epp; unpack_pk(&pkpv, seed, pk); @@ -274,32 +269,32 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], gen_at(at, seed); for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); + PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++); } for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); + PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++); } - PQCLEAN_KYBER768_CLEAN_poly_getnoise(&epp, coins, nonce++); + PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++); PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); + PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp); } - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); + PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(&b); PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(&v); - PQCLEAN_KYBER768_CLEAN_polyvec_add(&bp, &bp, &ep); + PQCLEAN_KYBER768_CLEAN_polyvec_add(&b, &b, &ep); PQCLEAN_KYBER768_CLEAN_poly_add(&v, &v, &epp); PQCLEAN_KYBER768_CLEAN_poly_add(&v, &v, &k); - PQCLEAN_KYBER768_CLEAN_polyvec_reduce(&bp); + PQCLEAN_KYBER768_CLEAN_polyvec_reduce(&b); PQCLEAN_KYBER768_CLEAN_poly_reduce(&v); - pack_ciphertext(c, &bp, &v); + pack_ciphertext(c, &b, &v); } /************************************************* @@ -308,24 +303,24 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], * Description: Decryption function of the CPA-secure * public-key encryption scheme 
underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message -* (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext -* (of length KYBER_INDCPA_BYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) * - const uint8_t *sk: pointer to input secret key * (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ void PQCLEAN_KYBER768_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { - polyvec bp, skpv; + polyvec b, skpv; poly v, mp; - unpack_ciphertext(&bp, &v, c); + unpack_ciphertext(&b, &v, c); unpack_sk(&skpv, sk); - PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&b); + PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b); PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER768_CLEAN_poly_sub(&mp, &v, &mp); diff --git a/crypto_kem/kyber768/clean/kem.c b/crypto_kem/kyber768/clean/kem.c index eb652689..8e4e3f95 100644 --- a/crypto_kem/kyber768/clean/kem.c +++ b/crypto_kem/kyber768/clean/kem.c @@ -14,13 +14,14 @@ * for CCA-secure Kyber key encapsulation mechanism * * Arguments: - unsigned char *pk: pointer to output public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * - unsigned char *sk: pointer to output private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { +int 
PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES], + unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; PQCLEAN_KYBER768_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,17 +40,17 @@ int PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char * * secret for given public key * * Arguments: - unsigned char *ct: pointer to output cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *pk: pointer to input public key -* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_CLEAN_crypto_kem_enc(unsigned char *ct, - unsigned char *ss, - const unsigned char *pk) { +int PQCLEAN_KYBER768_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES], + unsigned char ss[KYBER_SSBYTES], + const unsigned char pk[KYBER_PUBLICKEYBYTES]) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; @@ -79,19 +80,19 @@ int PQCLEAN_KYBER768_CLEAN_crypto_kem_enc(unsigned char *ct, * cipher text and private key * * Arguments: - unsigned char *ss: pointer to output shared secret -* (an already allocated array of CRYPTO_BYTES bytes) +* (an already allocated array of KYBER_SSBYTES bytes) * - const unsigned char *ct: pointer to input cipher text -* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) * - const unsigned char *sk: pointer to input private key -* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* (an already allocated array of 
KYBER_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER768_CLEAN_crypto_kem_dec(unsigned char *ss, - const unsigned char *ct, - const unsigned char *sk) { +int PQCLEAN_KYBER768_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES], + const unsigned char ct[KYBER_CIPHERTEXTBYTES], + const unsigned char sk[KYBER_SECRETKEYBYTES]) { size_t i; int fail; uint8_t buf[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber768/clean/ntt.c b/crypto_kem/kyber768/clean/ntt.c index 06862578..893ee3c7 100644 --- a/crypto_kem/kyber768/clean/ntt.c +++ b/crypto_kem/kyber768/clean/ntt.c @@ -3,11 +3,11 @@ #include "reduce.h" #include -/* Code to generate PQCLEAN_KYBER768_CLEAN_zetas and PQCLEAN_KYBER768_CLEAN_zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER768_CLEAN_zetas and zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 -static const uint16_t tree[128] = { +static const uint8_t tree[128] = { 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120, 4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124, 2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122, @@ -19,51 +19,41 @@ static const uint16_t tree[128] = { }; void init_ntt() { - unsigned int i, j, k; + unsigned int i; int16_t tmp[128]; tmp[0] = MONT; - for(i = 1; i < 128; ++i) - tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); + for(i=1;i<128;i++) + tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q); - for(i = 0; i < 128; ++i) + for(i=0;i<128;i++) { PQCLEAN_KYBER768_CLEAN_zetas[i] = tmp[tree[i]]; - - k = 0; - for(i = 64; i >= 1; i >>= 1) - for(j = i; j < 2*i; ++j) - PQCLEAN_KYBER768_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - - PQCLEAN_KYBER768_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + if(PQCLEAN_KYBER768_CLEAN_zetas[i] > KYBER_Q/2) + 
PQCLEAN_KYBER768_CLEAN_zetas[i] -= KYBER_Q; + if(PQCLEAN_KYBER768_CLEAN_zetas[i] < -KYBER_Q/2) + PQCLEAN_KYBER768_CLEAN_zetas[i] += KYBER_Q; + } } - */ const int16_t PQCLEAN_KYBER768_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, - 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, - 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, - 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, - 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, - 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, - 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, - 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, - 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, - 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 -}; - -const int16_t PQCLEAN_KYBER768_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, - 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, - 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, - 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, - 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, - 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, - 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, - 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, - 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 -}; + -1044, -758, -359, -1517, 1493, 1422, 287, 202, + -171, 622, 1577, 182, 962, -1202, -1474, 1468, + 573, -1325, 264, 383, -829, 1458, -1602, -130, + -681, 1017, 732, 608, -1542, 411, -205, -1571, + 1223, 652, -552, 1015, -1293, 1491, 
-282, -1544, + 516, -8, -320, -666, -1618, -1162, 126, 1469, + -853, -90, -271, 830, 107, -1421, -247, -951, + -398, 961, -1508, -725, 448, -1065, 677, -1275, + -1103, 430, 555, 843, -1251, 871, 1550, 105, + 422, 587, 177, -235, -291, -460, 1574, 1653, + -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, + 817, 1097, 603, 610, 1322, -1285, -1465, 384, + -1215, -136, 1218, -1335, -874, 220, -1187, -1659, + -1185, -1530, -1278, 794, -1510, -854, -870, 478, + -108, -308, 996, 991, 958, -1460, 1522, 1628 + }; /************************************************* * Name: fqmul @@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) { /************************************************* * Name: PQCLEAN_KYBER768_CLEAN_ntt * -* Description: Inplace number-theoretic transform (NTT) in Rq +* Description: Inplace number-theoretic transform (NTT) in Rq. * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER768_CLEAN_ntt(int16_t r[256]) { unsigned int len, start, j, k; @@ -96,7 +85,7 @@ void PQCLEAN_KYBER768_CLEAN_ntt(int16_t r[256]) { for (len = 128; len >= 2; len >>= 1) { for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER768_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { + for (j = start; j < start + len; j++) { t = fqmul(zeta, r[j + len]); r[j + len] = r[j] - t; r[j] = r[j] + t; @@ -112,28 +101,28 @@ void PQCLEAN_KYBER768_CLEAN_ntt(int16_t r[256]) { * multiplication by Montgomery factor 2^16. 
* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t r[256]: pointer to input/output vector of elements -* of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq **************************************************/ void PQCLEAN_KYBER768_CLEAN_invntt(int16_t r[256]) { unsigned int start, len, j, k; int16_t t, zeta; + const int16_t f = 1441; // mont^2/128 - k = 0; + k = 127; for (len = 2; len <= 128; len <<= 1) { for (start = 0; start < 256; start = j + len) { - zeta = PQCLEAN_KYBER768_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { + zeta = PQCLEAN_KYBER768_CLEAN_zetas[k--]; + for (j = start; j < start + len; j++) { t = r[j]; r[j] = PQCLEAN_KYBER768_CLEAN_barrett_reduce(t + r[j + len]); - r[j + len] = t - r[j + len]; + r[j + len] = r[j + len] - t; r[j + len] = fqmul(zeta, r[j + len]); } } } - for (j = 0; j < 256; ++j) { - r[j] = fqmul(r[j], PQCLEAN_KYBER768_CLEAN_zetas_inv[127]); + for (j = 0; j < 256; j++) { + r[j] = fqmul(r[j], f); } } @@ -143,19 +132,15 @@ void PQCLEAN_KYBER768_CLEAN_invntt(int16_t r[256]) { * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta) { +void PQCLEAN_KYBER768_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); r[1] += fqmul(a[1], b[0]); } diff 
--git a/crypto_kem/kyber768/clean/ntt.h b/crypto_kem/kyber768/clean/ntt.h index 4097a791..ad1cd083 100644 --- a/crypto_kem/kyber768/clean/ntt.h +++ b/crypto_kem/kyber768/clean/ntt.h @@ -5,15 +5,10 @@ extern const int16_t PQCLEAN_KYBER768_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER768_CLEAN_zetas_inv[128]; - void PQCLEAN_KYBER768_CLEAN_ntt(int16_t r[256]); void PQCLEAN_KYBER768_CLEAN_invntt(int16_t r[256]); -void PQCLEAN_KYBER768_CLEAN_basemul(int16_t r[2], - const int16_t a[2], - const int16_t b[2], - int16_t zeta); +void PQCLEAN_KYBER768_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber768/clean/params.h b/crypto_kem/kyber768/clean/params.h index c711c62d..45eeac6c 100644 --- a/crypto_kem/kyber768/clean/params.h +++ b/crypto_kem/kyber768/clean/params.h @@ -7,8 +7,6 @@ #define KYBER_N 256 #define KYBER_Q 3329 -#define KYBER_ETA 2 - #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ @@ -16,20 +14,20 @@ #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 +#define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) -#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_ETA2 2 + +#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES) #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ - + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) /* 32 bytes of additional space to save H(pk) */ -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ - + KYBER_INDCPA_PUBLICKEYBYTES \ - + 2*KYBER_SYMBYTES) -#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES +#define KYBER_SECRETKEYBYTES 
(KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) +#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES) #endif diff --git a/crypto_kem/kyber768/clean/poly.c b/crypto_kem/kyber768/clean/poly.c index 3fe4f680..7307f5ee 100644 --- a/crypto_kem/kyber768/clean/poly.c +++ b/crypto_kem/kyber768/clean/poly.c @@ -13,17 +13,19 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (of length KYBER_POLYCOMPRESSEDBYTES) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { +void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) { size_t i, j; + int16_t u; uint8_t t[8]; - PQCLEAN_KYBER768_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; + // map to positive standard representatives + u = a->coeffs[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint16_t)u << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } r[0] = t[0] | (t[1] << 4); @@ -40,7 +42,7 @@ void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], * Description: De-serialization and subsequent decompression of a polynomial; * approximate inverse of PQCLEAN_KYBER768_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ @@ -61,20 +63,21 @@ void PQCLEAN_KYBER768_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYC * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYBYTES bytes) -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial 
**************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { +void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) { size_t i; uint16_t t0, t1; - PQCLEAN_KYBER768_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 2; i++) { - t0 = a->coeffs[2 * i]; + // map to positive standard representatives + t0 = a->coeffs[2 * i]; + t0 += ((int16_t)t0 >> 15) & KYBER_Q; t1 = a->coeffs[2 * i + 1]; - r[3 * i + 0] = (uint8_t)(t0 >> 0); - r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + t1 += ((int16_t)t1 >> 15) & KYBER_Q; + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } @@ -84,7 +87,7 @@ void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { * Description: De-serialization of a polynomial; * inverse of PQCLEAN_KYBER768_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array * (of KYBER_POLYBYTES bytes) **************************************************/ @@ -101,7 +104,7 @@ void PQCLEAN_KYBER768_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY * * Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *msg: pointer to input message **************************************************/ void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { @@ -122,41 +125,60 @@ void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA * Description: Convert polynomial to 32-byte message * * Arguments: - uint8_t *msg: pointer to output message -* - poly *a: pointer to input polynomial +* - const poly *a: pointer to input polynomial 
**************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { +void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) { size_t i, j; uint16_t t; - PQCLEAN_KYBER768_CLEAN_poly_csubq(a); - for (i = 0; i < KYBER_N / 8; i++) { msg[i] = 0; for (j = 0; j < 8; j++) { - t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + t = a->coeffs[8 * i + j]; + t += ((int16_t)t >> 15) & KYBER_Q; + t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1; msg[i] |= t << j; } } } /************************************************* -* Name: PQCLEAN_KYBER768_CLEAN_poly_getnoise +* Name: PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta1 * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution -* with parameter KYBER_ETA +* with parameter KYBER_ETA1 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed * (of length KYBER_SYMBYTES bytes) -* - uint8_t nonce: one-byte input nonce +* - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; +void PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA1 * KYBER_N / 4]; prf(buf, sizeof(buf), seed, nonce); - PQCLEAN_KYBER768_CLEAN_cbd(r, buf); + PQCLEAN_KYBER768_CLEAN_poly_cbd_eta1(r, buf); } +/************************************************* +* Name: PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer 
to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) +* - uint8_t nonce: one-byte input nonce +**************************************************/ +void PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t buf[KYBER_ETA2 * KYBER_N / 4]; + prf(buf, sizeof(buf), seed, nonce); + PQCLEAN_KYBER768_CLEAN_poly_cbd_eta2(r, buf); +} + + /************************************************* * Name: PQCLEAN_KYBER768_CLEAN_poly_ntt * @@ -189,7 +211,7 @@ void PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(poly *r) { * * Description: Multiplication of two polynomials in NTT domain * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -197,8 +219,7 @@ void PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, cons size_t i; for (i = 0; i < KYBER_N / 4; i++) { PQCLEAN_KYBER768_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER768_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], - -PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER768_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); } } @@ -233,28 +254,12 @@ void PQCLEAN_KYBER768_CLEAN_poly_reduce(poly *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER768_CLEAN_poly_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of a polynomial. 
For details of conditional subtraction -* of q see comments in reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_csubq(poly *r) { - size_t i; - for (i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER768_CLEAN_csubq(r->coeffs[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER768_CLEAN_poly_add * -* Description: Add two polynomials +* Description: Add two polynomials; no modular reduction is performed * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ @@ -268,7 +273,7 @@ void PQCLEAN_KYBER768_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { /************************************************* * Name: PQCLEAN_KYBER768_CLEAN_poly_sub * -* Description: Subtract two polynomials +* Description: Subtract two polynomials; no modular reduction is performed * * Arguments: - poly *r: pointer to output polynomial * - const poly *a: pointer to first input polynomial diff --git a/crypto_kem/kyber768/clean/poly.h b/crypto_kem/kyber768/clean/poly.h index a592a742..0917375c 100644 --- a/crypto_kem/kyber768/clean/poly.h +++ b/crypto_kem/kyber768/clean/poly.h @@ -11,16 +11,18 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); +void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a); void PQCLEAN_KYBER768_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); -void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); +void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a); void 
PQCLEAN_KYBER768_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); -void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); +void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a); -void PQCLEAN_KYBER768_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); +void PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + +void PQCLEAN_KYBER768_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER768_CLEAN_poly_ntt(poly *r); void PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(poly *r); @@ -28,7 +30,6 @@ void PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, cons void PQCLEAN_KYBER768_CLEAN_poly_tomont(poly *r); void PQCLEAN_KYBER768_CLEAN_poly_reduce(poly *r); -void PQCLEAN_KYBER768_CLEAN_poly_csubq(poly *r); void PQCLEAN_KYBER768_CLEAN_poly_add(poly *r, const poly *a, const poly *b); void PQCLEAN_KYBER768_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); diff --git a/crypto_kem/kyber768/clean/polyvec.c b/crypto_kem/kyber768/clean/polyvec.c index ff167608..77fb200a 100644 --- a/crypto_kem/kyber768/clean/polyvec.c +++ b/crypto_kem/kyber768/clean/polyvec.c @@ -10,19 +10,18 @@ * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { +void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) { unsigned int i, j, k; - PQCLEAN_KYBER768_CLEAN_polyvec_csubq(a); - uint16_t t[4]; for (i = 0; i < KYBER_K; 
i++) { for (j = 0; j < KYBER_N / 4; j++) { for (k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) - / KYBER_Q) & 0x3ff; + t[k] = a->vec[i].coeffs[4 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; } r[0] = (uint8_t)(t[0] >> 0); @@ -45,8 +44,7 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBY * - const uint8_t *a: pointer to input byte array * (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; uint16_t t[4]; @@ -72,9 +70,9 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, * * Arguments: - uint8_t *r: pointer to output byte array * (needs space for KYBER_POLYVECBYTES) -* - polyvec *a: pointer to input vector of polynomials +* - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { +void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) { unsigned int i; for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); @@ -128,18 +126,16 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(polyvec *r) { } /************************************************* -* Name: PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery * -* Description: Pointwise multiply elements of a and b, accumulate into r, +* Description: Multiply elements of a and b in NTT domain, accumulate into r, * and multiply by 2^-16. 
* -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b) { +void PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) { unsigned int i; poly t; @@ -156,10 +152,10 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, * Name: PQCLEAN_KYBER768_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient -* of each element of a vector of polynomials +* of each element of a vector of polynomials; * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_polyvec_reduce(polyvec *r) { unsigned int i; @@ -168,29 +164,12 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_reduce(polyvec *r) { } } -/************************************************* -* Name: PQCLEAN_KYBER768_CLEAN_polyvec_csubq -* -* Description: Applies conditional subtraction of q to each coefficient -* of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in -* reduce.c -* -* Arguments: - poly *r: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_csubq(polyvec *r) { - unsigned int i; - for (i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_csubq(&r->vec[i]); - } -} - /************************************************* * Name: PQCLEAN_KYBER768_CLEAN_polyvec_add * * Description: Add vectors of polynomials * -* Arguments: - polyvec *r: pointer to output 
vector of polynomials +* Arguments: - polyvec *r: pointer to output vector of polynomials * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ diff --git a/crypto_kem/kyber768/clean/polyvec.h b/crypto_kem/kyber768/clean/polyvec.h index c879ad76..1783938c 100644 --- a/crypto_kem/kyber768/clean/polyvec.h +++ b/crypto_kem/kyber768/clean/polyvec.h @@ -8,22 +8,18 @@ typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); -void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, - const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a); +void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); +void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a); void PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); void PQCLEAN_KYBER768_CLEAN_polyvec_ntt(polyvec *r); void PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(polyvec *r); -void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, - const polyvec *a, - const polyvec *b); +void PQCLEAN_KYBER768_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b); void PQCLEAN_KYBER768_CLEAN_polyvec_reduce(polyvec *r); -void PQCLEAN_KYBER768_CLEAN_polyvec_csubq(polyvec *r); void PQCLEAN_KYBER768_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); diff --git a/crypto_kem/kyber768/clean/reduce.c b/crypto_kem/kyber768/clean/reduce.c index 245fe36b..0fc06411 100644 --- a/crypto_kem/kyber768/clean/reduce.c +++ b/crypto_kem/kyber768/clean/reduce.c @@ -6,8 +6,7 @@ 
* Name: PQCLEAN_KYBER768_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes -* 16-bit integer congruent to a * R^-1 mod q, -* where R=2^16 +* 16-bit integer congruent to a * R^-1 mod q, where R=2^16 * * Arguments: - int32_t a: input integer to be reduced; * has to be in {-q2^15,...,q2^15-1} @@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER768_CLEAN_montgomery_reduce(int32_t a) { * Name: PQCLEAN_KYBER768_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes -* 16-bit integer congruent to a mod q in {0,...,q} +* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2} * * Arguments: - int16_t a: input integer to be reduced * -* Returns: integer in {0,...,q} congruent to a modulo q. +* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q. **************************************************/ int16_t PQCLEAN_KYBER768_CLEAN_barrett_reduce(int16_t a) { int16_t t; const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = (int32_t)v * a >> 26; + t = ((int32_t)v * a + (1 << 25)) >> 26; t *= KYBER_Q; return a - t; } - -/************************************************* -* Name: PQCLEAN_KYBER768_CLEAN_csubq -* -* Description: Conditionallly subtract q -* -* Arguments: - int16_t x: input integer -* -* Returns: a - q if a >= q, else a -**************************************************/ -int16_t PQCLEAN_KYBER768_CLEAN_csubq(int16_t a) { - a -= KYBER_Q; - a += (a >> 15) & KYBER_Q; - return a; -} diff --git a/crypto_kem/kyber768/clean/reduce.h b/crypto_kem/kyber768/clean/reduce.h index 4a865ea5..36d258bd 100644 --- a/crypto_kem/kyber768/clean/reduce.h +++ b/crypto_kem/kyber768/clean/reduce.h @@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER768_CLEAN_montgomery_reduce(int32_t a); int16_t PQCLEAN_KYBER768_CLEAN_barrett_reduce(int16_t a); -int16_t PQCLEAN_KYBER768_CLEAN_csubq(int16_t a); - #endif diff --git a/crypto_kem/kyber768/clean/symmetric-shake.c 
b/crypto_kem/kyber768/clean/symmetric-shake.c index dacd6fa5..33e563d9 100644 --- a/crypto_kem/kyber768/clean/symmetric-shake.c +++ b/crypto_kem/kyber768/clean/symmetric-shake.c @@ -9,12 +9,10 @@ * * Description: Absorb step of the SHAKE128 specialized for the Kyber context. * -* Arguments: - xof_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *seed: pointer to KYBER_SYMBYTES input -* to be absorbed into state -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input **************************************************/ void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(xof_state *state, const uint8_t seed[KYBER_SYMBYTES], @@ -26,8 +24,8 @@ void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(xof_state *state, for (i = 0; i < KYBER_SYMBYTES; i++) { extseed[i] = seed[i]; } - extseed[i++] = x; - extseed[i] = y; + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; shake128_absorb(state, extseed, sizeof(extseed)); } @@ -38,23 +36,19 @@ void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(xof_state *state, * Description: Usage of SHAKE256 as a PRF, concatenates secret and public input * and then generates outlen bytes of SHAKE256 output * -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t *key: pointer to the key -* (of length KYBER_SYMBYTES) -* - uint8_t nonce: single-byte nonce (public PRF input) +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) **************************************************/ -void 
PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce) { +void PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { unsigned int i; uint8_t extkey[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey[i] = key[i]; } - extkey[i] = nonce; + extkey[KYBER_SYMBYTES] = nonce; shake256(out, outlen, extkey, sizeof(extkey)); } diff --git a/crypto_kem/kyber768/clean/symmetric.h b/crypto_kem/kyber768/clean/symmetric.h index 3ee19aef..eef060c3 100644 --- a/crypto_kem/kyber768/clean/symmetric.h +++ b/crypto_kem/kyber768/clean/symmetric.h @@ -14,21 +14,16 @@ void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(xof_state *s, uint8_t x, uint8_t y); -void PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(uint8_t *out, - size_t outlen, - const uint8_t key[KYBER_SYMBYTES], - uint8_t nonce); +void PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) #define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ - shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) \ - PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) diff --git a/test/duplicate_consistency/kyber1024-90s_avx2.yml b/test/duplicate_consistency/kyber1024-90s_avx2.yml index 
cde8ef37..7a0dc343 100644 --- a/test/duplicate_consistency/kyber1024-90s_avx2.yml +++ b/test/duplicate_consistency/kyber1024-90s_avx2.yml @@ -3,10 +3,8 @@ consistency_checks: scheme: kyber512 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512 @@ -23,18 +21,16 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c + - rejsample.c - verify.c - source: scheme: kyber512-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512-90s @@ -55,7 +51,6 @@ consistency_checks: - symmetric.h - verify.h - aes256ctr.c - - cbd.c - consts.c - indcpa.c - kem.c @@ -65,10 +60,8 @@ consistency_checks: scheme: kyber768 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -88,15 +81,14 @@ consistency_checks: - cbd.c - consts.c - kem.c + - rejsample.c - verify.c - source: scheme: kyber768-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768-90s @@ -127,11 +119,8 @@ consistency_checks: scheme: kyber1024 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -153,15 +142,13 @@ consistency_checks: - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber1024-90s implementation: clean files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h diff --git a/test/duplicate_consistency/kyber1024-90s_clean.yml b/test/duplicate_consistency/kyber1024-90s_clean.yml index 7442010a..ed5ad001 100644 --- a/test/duplicate_consistency/kyber1024-90s_clean.yml +++ b/test/duplicate_consistency/kyber1024-90s_clean.yml @@ -11,7 +11,6 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c @@ -21,15 +20,14 @@ consistency_checks: scheme: kyber512 implementation: avx2 files: - - cbd.h - indcpa.h - 
kem.h - - polyvec.h - verify.h - source: scheme: kyber512-90s implementation: clean files: + - aes256ctr.h - cbd.h - indcpa.h - kem.h @@ -37,10 +35,9 @@ consistency_checks: - poly.h - polyvec.h - reduce.h - - symmetric-aes.h - symmetric.h - verify.h - - cbd.c + - aes256ctr.c - indcpa.c - kem.c - ntt.c @@ -51,10 +48,8 @@ consistency_checks: scheme: kyber512-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -78,15 +73,14 @@ consistency_checks: scheme: kyber768 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768-90s implementation: clean files: + - aes256ctr.h - cbd.h - indcpa.h - kem.h @@ -94,9 +88,9 @@ consistency_checks: - poly.h - polyvec.h - reduce.h - - symmetric-aes.h - symmetric.h - verify.h + - aes256ctr.c - cbd.c - indcpa.c - kem.c @@ -108,10 +102,8 @@ consistency_checks: scheme: kyber768-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -138,20 +130,14 @@ consistency_checks: scheme: kyber1024 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s implementation: avx2 files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h diff --git a/test/duplicate_consistency/kyber1024_avx2.yml b/test/duplicate_consistency/kyber1024_avx2.yml index 5f9cc86b..489e85f1 100644 --- a/test/duplicate_consistency/kyber1024_avx2.yml +++ b/test/duplicate_consistency/kyber1024_avx2.yml @@ -3,10 +3,8 @@ consistency_checks: scheme: kyber512 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -21,13 +19,11 @@ consistency_checks: - indcpa.h - kem.h - ntt.h - - poly.h - polyvec.h - reduce.h - rejsample.h - symmetric.h - verify.h - - cbd.c - consts.c - fips202x4.c - kem.c @@ -38,10 +34,8 @@ consistency_checks: scheme: kyber512-90s 
implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512-90s @@ -58,18 +52,16 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c + - rejsample.c - verify.c - source: scheme: kyber768 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -101,10 +93,8 @@ consistency_checks: scheme: kyber768-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768-90s @@ -124,28 +114,23 @@ consistency_checks: - cbd.c - consts.c - kem.c + - rejsample.c - verify.c - source: scheme: kyber1024 implementation: clean files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - symmetric-shake.c - source: scheme: kyber1024-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s @@ -167,4 +152,5 @@ consistency_checks: - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c diff --git a/test/duplicate_consistency/kyber1024_clean.yml b/test/duplicate_consistency/kyber1024_clean.yml index 39afa358..3a37f97d 100644 --- a/test/duplicate_consistency/kyber1024_clean.yml +++ b/test/duplicate_consistency/kyber1024_clean.yml @@ -12,7 +12,6 @@ consistency_checks: - reduce.h - symmetric.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c @@ -23,10 +22,8 @@ consistency_checks: scheme: kyber512 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -41,7 +38,6 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c @@ -51,10 +47,8 @@ consistency_checks: scheme: kyber512-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -80,10 +74,8 @@ consistency_checks: scheme: kyber768 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - 
verify.h - symmetric-shake.c - source: @@ -108,21 +100,16 @@ consistency_checks: scheme: kyber768-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024 implementation: avx2 files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -150,9 +137,6 @@ consistency_checks: scheme: kyber1024-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h diff --git a/test/duplicate_consistency/kyber512-90s_avx2.yml b/test/duplicate_consistency/kyber512-90s_avx2.yml index 05728ea6..536f3e4a 100644 --- a/test/duplicate_consistency/kyber512-90s_avx2.yml +++ b/test/duplicate_consistency/kyber512-90s_avx2.yml @@ -3,11 +3,8 @@ consistency_checks: scheme: kyber512 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber512 @@ -29,26 +26,22 @@ consistency_checks: - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber512-90s implementation: clean files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber768 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -65,19 +58,17 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber768-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768-90s @@ -98,10 +89,10 @@ consistency_checks: - symmetric.h - verify.h - aes256ctr.c - - cbd.c - consts.c - indcpa.c - kem.c + - poly.c - polyvec.c - rejsample.c - verify.c @@ -109,10 +100,8 @@ consistency_checks: scheme: kyber1024 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -129,18 +118,16 @@ consistency_checks: - reduce.h - rejsample.h - 
verify.h - - cbd.c - consts.c - kem.c + - rejsample.c - verify.c - source: scheme: kyber1024-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s @@ -161,7 +148,6 @@ consistency_checks: - symmetric.h - verify.h - aes256ctr.c - - cbd.c - consts.c - indcpa.c - kem.c diff --git a/test/duplicate_consistency/kyber512-90s_clean.yml b/test/duplicate_consistency/kyber512-90s_clean.yml index 1455269e..ce64a576 100644 --- a/test/duplicate_consistency/kyber512-90s_clean.yml +++ b/test/duplicate_consistency/kyber512-90s_clean.yml @@ -24,22 +24,16 @@ consistency_checks: scheme: kyber512 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber512-90s implementation: avx2 files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -53,10 +47,10 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - verify.c @@ -64,15 +58,14 @@ consistency_checks: scheme: kyber768 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768-90s implementation: clean files: + - aes256ctr.h - cbd.h - indcpa.h - kem.h @@ -80,13 +73,13 @@ consistency_checks: - poly.h - polyvec.h - reduce.h - - symmetric-aes.h - symmetric.h - verify.h - - cbd.c + - aes256ctr.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - symmetric-aes.c @@ -95,10 +88,8 @@ consistency_checks: scheme: kyber768-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -112,7 +103,6 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c @@ -122,15 +112,14 @@ consistency_checks: scheme: kyber1024 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s implementation: clean 
files: + - aes256ctr.h - cbd.h - indcpa.h - kem.h @@ -138,10 +127,9 @@ consistency_checks: - poly.h - polyvec.h - reduce.h - - symmetric-aes.h - symmetric.h - verify.h - - cbd.c + - aes256ctr.c - indcpa.c - kem.c - ntt.c @@ -152,8 +140,6 @@ consistency_checks: scheme: kyber1024-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h diff --git a/test/duplicate_consistency/kyber512_avx2.yml b/test/duplicate_consistency/kyber512_avx2.yml index 0520bfd5..f5c03766 100644 --- a/test/duplicate_consistency/kyber512_avx2.yml +++ b/test/duplicate_consistency/kyber512_avx2.yml @@ -4,22 +4,16 @@ consistency_checks: implementation: clean files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - symmetric-shake.c - source: scheme: kyber512-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber512-90s @@ -41,15 +35,14 @@ consistency_checks: - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber768 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -64,13 +57,11 @@ consistency_checks: - indcpa.h - kem.h - ntt.h - - poly.h - polyvec.h - reduce.h - rejsample.h - symmetric.h - verify.h - - cbd.c - consts.c - fips202x4.c - kem.c @@ -82,10 +73,8 @@ consistency_checks: scheme: kyber768-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768-90s @@ -102,19 +91,17 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber1024 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -129,13 +116,11 @@ consistency_checks: - indcpa.h - kem.h - ntt.h - - poly.h - polyvec.h - reduce.h - rejsample.h - symmetric.h - verify.h - - cbd.c - consts.c - fips202x4.c - kem.c @@ 
-146,10 +131,8 @@ consistency_checks: scheme: kyber1024-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s @@ -166,7 +149,7 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c + - rejsample.c - verify.c diff --git a/test/duplicate_consistency/kyber512_clean.yml b/test/duplicate_consistency/kyber512_clean.yml index c543f9a9..08ad8ca0 100644 --- a/test/duplicate_consistency/kyber512_clean.yml +++ b/test/duplicate_consistency/kyber512_clean.yml @@ -4,11 +4,8 @@ consistency_checks: implementation: avx2 files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -36,11 +33,8 @@ consistency_checks: scheme: kyber512-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -55,10 +49,10 @@ consistency_checks: - reduce.h - symmetric.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - symmetric-shake.c @@ -67,10 +61,8 @@ consistency_checks: scheme: kyber768 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -85,10 +77,10 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - verify.c @@ -96,10 +88,8 @@ consistency_checks: scheme: kyber768-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -114,7 +104,6 @@ consistency_checks: - reduce.h - symmetric.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c @@ -125,10 +114,8 @@ consistency_checks: scheme: kyber1024 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -143,7 +130,6 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c @@ -153,8 +139,6 @@ consistency_checks: 
scheme: kyber1024-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h diff --git a/test/duplicate_consistency/kyber768-90s_avx2.yml b/test/duplicate_consistency/kyber768-90s_avx2.yml index 4f5f1e08..29d5f92e 100644 --- a/test/duplicate_consistency/kyber768-90s_avx2.yml +++ b/test/duplicate_consistency/kyber768-90s_avx2.yml @@ -3,10 +3,8 @@ consistency_checks: scheme: kyber512 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512 @@ -23,19 +21,17 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber512-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512-90s @@ -56,10 +52,10 @@ consistency_checks: - symmetric.h - verify.h - aes256ctr.c - - cbd.c - consts.c - indcpa.c - kem.c + - poly.c - polyvec.c - rejsample.c - verify.c @@ -67,11 +63,8 @@ consistency_checks: scheme: kyber768 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -93,26 +86,22 @@ consistency_checks: - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber768-90s implementation: clean files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber1024 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -132,15 +121,14 @@ consistency_checks: - cbd.c - consts.c - kem.c + - rejsample.c - verify.c - source: scheme: kyber1024-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s diff --git a/test/duplicate_consistency/kyber768-90s_clean.yml b/test/duplicate_consistency/kyber768-90s_clean.yml index 7334f01f..269ce53d 100644 --- a/test/duplicate_consistency/kyber768-90s_clean.yml +++ 
b/test/duplicate_consistency/kyber768-90s_clean.yml @@ -11,10 +11,10 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - verify.c @@ -22,15 +22,14 @@ consistency_checks: scheme: kyber512 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512-90s implementation: clean files: + - aes256ctr.h - cbd.h - indcpa.h - kem.h @@ -38,13 +37,13 @@ consistency_checks: - poly.h - polyvec.h - reduce.h - - symmetric-aes.h - symmetric.h - verify.h - - cbd.c + - aes256ctr.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - symmetric-aes.c @@ -53,10 +52,8 @@ consistency_checks: scheme: kyber512-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768 @@ -83,22 +80,16 @@ consistency_checks: scheme: kyber768 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber768-90s implementation: avx2 files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -122,15 +113,14 @@ consistency_checks: scheme: kyber1024 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s implementation: clean files: + - aes256ctr.h - cbd.h - indcpa.h - kem.h @@ -138,9 +128,9 @@ consistency_checks: - poly.h - polyvec.h - reduce.h - - symmetric-aes.h - symmetric.h - verify.h + - aes256ctr.c - cbd.c - indcpa.c - kem.c @@ -152,8 +142,6 @@ consistency_checks: scheme: kyber1024-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h diff --git a/test/duplicate_consistency/kyber768_avx2.yml b/test/duplicate_consistency/kyber768_avx2.yml index e6ef217a..cdc64e49 100644 --- a/test/duplicate_consistency/kyber768_avx2.yml +++ b/test/duplicate_consistency/kyber768_avx2.yml @@ -3,10 +3,8 @@ consistency_checks: scheme: kyber512 
implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -21,13 +19,11 @@ consistency_checks: - indcpa.h - kem.h - ntt.h - - poly.h - polyvec.h - reduce.h - rejsample.h - symmetric.h - verify.h - - cbd.c - consts.c - fips202x4.c - kem.c @@ -39,10 +35,8 @@ consistency_checks: scheme: kyber512-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber512-90s @@ -59,32 +53,26 @@ consistency_checks: - reduce.h - rejsample.h - verify.h - - cbd.c - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber768 implementation: clean files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - symmetric-shake.c - source: scheme: kyber768-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber768-90s @@ -106,15 +94,14 @@ consistency_checks: - consts.c - kem.c - polyvec.c + - rejsample.c - verify.c - source: scheme: kyber1024 implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -146,10 +133,8 @@ consistency_checks: scheme: kyber1024-90s implementation: clean files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber1024-90s @@ -169,4 +154,5 @@ consistency_checks: - cbd.c - consts.c - kem.c + - rejsample.c - verify.c diff --git a/test/duplicate_consistency/kyber768_clean.yml b/test/duplicate_consistency/kyber768_clean.yml index 7eb8e985..086d1414 100644 --- a/test/duplicate_consistency/kyber768_clean.yml +++ b/test/duplicate_consistency/kyber768_clean.yml @@ -12,10 +12,10 @@ consistency_checks: - reduce.h - symmetric.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - symmetric-shake.c @@ -24,10 +24,8 @@ consistency_checks: scheme: kyber512 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - 
symmetric-shake.c - source: @@ -42,10 +40,10 @@ consistency_checks: - polyvec.h - reduce.h - verify.h - - cbd.c - indcpa.c - kem.c - ntt.c + - poly.c - polyvec.c - reduce.c - verify.c @@ -53,21 +51,16 @@ consistency_checks: scheme: kyber512-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - source: scheme: kyber768 implementation: avx2 files: - api.h - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -95,11 +88,8 @@ consistency_checks: scheme: kyber768-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - params.h - - polyvec.h - verify.h - source: scheme: kyber1024 @@ -125,10 +115,8 @@ consistency_checks: scheme: kyber1024 implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h - symmetric-shake.c - source: @@ -153,8 +141,6 @@ consistency_checks: scheme: kyber1024-90s implementation: avx2 files: - - cbd.h - indcpa.h - kem.h - - polyvec.h - verify.h